partis-bcr 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. bin/FastTree +0 -0
  2. bin/add-chimeras.py +59 -0
  3. bin/add-seqs-to-outputs.py +81 -0
  4. bin/bcr-phylo-run.py +799 -0
  5. bin/build.sh +24 -0
  6. bin/cf-alleles.py +97 -0
  7. bin/cf-germlines.py +57 -0
  8. bin/cf-linearham.py +199 -0
  9. bin/chimera-plot.py +76 -0
  10. bin/choose-partially-paired.py +143 -0
  11. bin/circle-plots.py +30 -0
  12. bin/compare-plotdirs.py +298 -0
  13. bin/diff-parameters.py +133 -0
  14. bin/docker-hub-push.sh +6 -0
  15. bin/extract-pairing-info.py +55 -0
  16. bin/gcdyn-simu-run.py +223 -0
  17. bin/gctree-run.py +244 -0
  18. bin/get-naive-probabilities.py +126 -0
  19. bin/iqtree-1.6.12 +0 -0
  20. bin/lonr.r +1020 -0
  21. bin/makeHtml +52 -0
  22. bin/mds-run.py +46 -0
  23. bin/parse-output.py +277 -0
  24. bin/partis +1869 -0
  25. bin/partis-pip +116 -0
  26. bin/partis.py +1869 -0
  27. bin/plot-gl-set-trees.py +519 -0
  28. bin/plot-hmms.py +151 -0
  29. bin/plot-lb-tree.py +427 -0
  30. bin/raxml-ng +0 -0
  31. bin/read-bcr-phylo-trees.py +38 -0
  32. bin/read-gctree-output.py +166 -0
  33. bin/run-chimeras.sh +64 -0
  34. bin/run-dtr-scan.sh +25 -0
  35. bin/run-paired-loci.sh +100 -0
  36. bin/run-tree-metrics.sh +88 -0
  37. bin/smetric-run.py +62 -0
  38. bin/split-loci.py +317 -0
  39. bin/swarm-2.1.13-linux-x86_64 +0 -0
  40. bin/test-germline-inference.py +425 -0
  41. bin/tree-perf-run.py +194 -0
  42. bin/vsearch-2.4.3-linux-x86_64 +0 -0
  43. bin/vsearch-2.4.3-macos-x86_64 +0 -0
  44. bin/xvfb-run +194 -0
  45. partis_bcr-1.0.2.data/scripts/cf-alleles.py +97 -0
  46. partis_bcr-1.0.2.data/scripts/cf-germlines.py +57 -0
  47. partis_bcr-1.0.2.data/scripts/extract-pairing-info.py +55 -0
  48. partis_bcr-1.0.2.data/scripts/gctree-run.py +244 -0
  49. partis_bcr-1.0.2.data/scripts/parse-output.py +277 -0
  50. partis_bcr-1.0.2.data/scripts/split-loci.py +317 -0
  51. partis_bcr-1.0.2.data/scripts/test.py +1005 -0
  52. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/METADATA +1 -1
  53. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/RECORD +101 -51
  54. partis_bcr-1.0.2.dist-info/top_level.txt +1 -0
  55. {partis → python}/glutils.py +1 -1
  56. python/main.py +30 -0
  57. {partis → python}/plotting.py +10 -1
  58. {partis → python}/treeutils.py +18 -16
  59. {partis → python}/utils.py +14 -7
  60. packages/ham/bcrham +0 -0
  61. partis/main.py +0 -59
  62. partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
  63. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/WHEEL +0 -0
  64. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/entry_points.txt +0 -0
  65. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/licenses/COPYING +0 -0
  66. {partis → python}/__init__.py +0 -0
  67. {partis → python}/alleleclusterer.py +0 -0
  68. {partis → python}/allelefinder.py +0 -0
  69. {partis → python}/alleleremover.py +0 -0
  70. {partis → python}/annotationclustering.py +0 -0
  71. {partis → python}/baseutils.py +0 -0
  72. {partis → python}/cache/__init__.py +0 -0
  73. {partis → python}/cache/cached_uncertainties.py +0 -0
  74. {partis → python}/clusterpath.py +0 -0
  75. {partis → python}/coar.py +0 -0
  76. {partis → python}/corrcounter.py +0 -0
  77. {partis → python}/datautils.py +0 -0
  78. {partis → python}/event.py +0 -0
  79. {partis → python}/fraction_uncertainty.py +0 -0
  80. {partis → python}/gex.py +0 -0
  81. {partis → python}/glomerator.py +0 -0
  82. {partis → python}/hist.py +0 -0
  83. {partis → python}/hmmwriter.py +0 -0
  84. {partis → python}/hutils.py +0 -0
  85. {partis → python}/indelutils.py +0 -0
  86. {partis → python}/lbplotting.py +0 -0
  87. {partis → python}/mds.py +0 -0
  88. {partis → python}/mutefreqer.py +0 -0
  89. {partis → python}/paircluster.py +0 -0
  90. {partis → python}/parametercounter.py +0 -0
  91. {partis → python}/paramutils.py +0 -0
  92. {partis → python}/partitiondriver.py +0 -0
  93. {partis → python}/partitionplotter.py +0 -0
  94. {partis → python}/performanceplotter.py +0 -0
  95. {partis → python}/plotconfig.py +0 -0
  96. {partis → python}/processargs.py +0 -0
  97. {partis → python}/prutils.py +0 -0
  98. {partis → python}/recombinator.py +0 -0
  99. {partis → python}/scanplot.py +0 -0
  100. {partis → python}/seqfileopener.py +0 -0
  101. {partis → python}/treegenerator.py +0 -0
  102. {partis → python}/viterbicluster.py +0 -0
  103. {partis → python}/vrc01.py +0 -0
  104. {partis → python}/waterer.py +0 -0
@@ -0,0 +1,1005 @@
1
+ #!python
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import argparse
5
+ import random
6
+ import numpy
7
+ import os
8
+ import csv
9
+ import glob
10
+ import math
11
+ import shutil
12
+ import time
13
+ from collections import OrderedDict
14
+ from subprocess import Popen, PIPE, check_call, check_output, CalledProcessError
15
+ import copy
16
+ import colored_traceback.always
17
+ import sys
18
+ from io import open
19
+ from pathlib import Path
20
+ partis_dir = str(Path(__file__).parent.parent)
21
+ sys.path.insert(1, partis_dir) # + '/python')
22
+ import yaml
23
+
24
+ from python.baseutils import get_extra_str
25
+ import python.utils as utils
26
+ import python.glutils as glutils
27
+ from python.hist import Hist
28
+ from python.clusterpath import ClusterPath
29
+ import python.paircluster as paircluster
30
+
31
+ # ----------------------------------------------------------------------------------------
32
+ class Tester(object):
33
+ # ----------------------------------------------------------------------------------------
34
def dirs(self, tstr, force_paired=False):
    """Return the results dir for <tstr>, which must be 'ref' or 'new'.

    A '/paired' component is inserted if --paired is set or <force_paired> is true, and '-slow' is appended if --slow is set.
    """
    assert tstr in ['ref', 'new']
    paired_part = '/paired' if (args.paired or force_paired) else ''
    slow_part = '-slow' if args.slow else ''
    return 'test%s/%s-results%s' % (paired_part, tstr, slow_part)
37
+ # ----------------------------------------------------------------------------------------
38
def nqr(self, act):
    """Return the number of queries to use for <act> ('quick', 'simu', or 'data')."""
    if act == 'quick':  # bears no relation to the others, so makes sense to handle it differently
        return 10
    n_normal_data = 100 if args.paired else 50
    by_speed = {
        'normal' : {'simu' : 50, 'data' : n_normal_data},
        'slow' : {'simu' : 1000, 'data' : -1},
    }
    speed = 'slow' if args.slow else 'normal'
    return by_speed[speed][act]
44
+ # ----------------------------------------------------------------------------------------
45
+ def get_stypes(self, ptest):
46
+ namelist = ptest.split('-')
47
+ if ptest == 'simulate':
48
+ input_stype = 'new'
49
+ input_dtype = None
50
+ elif 'cache-parameters' in ptest:
51
+ input_stype = 'new'
52
+ input_dtype = namelist[-1]
53
+ else:
54
+ input_stype, input_dtype = namelist[-2:]
55
+
56
+ assert input_stype in self.stypes + [None]
57
+ assert input_dtype in self.dtypes + [None]
58
+ return input_stype, input_dtype
59
+ # ----------------------------------------------------------------------------------------
60
+ def inpath(self, st, dt):
61
+ if dt == 'data':
62
+ return self.paired_datafname if args.paired else self.datafname
63
+ else:
64
+ spath = self.label + '/simu' + ('' if args.paired else '.yaml')
65
+ if st is None:
66
+ return spath
67
+ return self.dirs(st) + '/' + spath
68
+ # ----------------------------------------------------------------------------------------
69
+ def paramdir(self, st, dt):
70
+ pd = self.label + '/parameters/' + dt
71
+ if st is None: # if <st> isn't set, we want the subpath (without parent dir), e.g. when --dont-run/evaluating results
72
+ return pd
73
+ return self.dirs(st) + '/' + pd
74
+ # ----------------------------------------------------------------------------------------
75
def astr(self, inout, dt=None):
    """Return the name (without leading dashes) of the in/out path argument: '<inout>fname' or 'paired-<inout>dir'."""
    is_data_input = inout == 'in' and dt == 'data'
    if args.paired and not is_data_input:
        return 'paired-%sdir' % inout
    return '%sfname' % inout
79
+ # ----------------------------------------------------------------------------------------
80
+ # i'm adding this late (especially for the non-production tests) so there's probably some more places it could be used
81
+ def opath(self, ptest, st=None, force_paired=False): # don't set <st> if you just want the basename-type stuff (no base/parent dirs)
82
+ if 'cache-parameters' in ptest:
83
+ return self.paramdir(None, ptest.split('-')[2])
84
+ elif ptest == 'simulate':
85
+ return self.inpath(None, 'simu')
86
+ else:
87
+ op = '%s%s' % (ptest, '' if (args.paired or force_paired) else '.yaml')
88
+ if (args.paired or force_paired) and 'get-selection-metrics' in ptest:
89
+ op += '-chosen-abs.csv'
90
+ if st is None:
91
+ return op
92
+ return '%s/%s' % (self.dirs(st, force_paired=force_paired), op)
93
+ # ----------------------------------------------------------------------------------------
94
+ def ptn_cachefn(self, st, for_cmd=False, lpair=None, locus=None): # see note above for opath()
95
+ assert st == 'new' # i think?
96
+ cfn = ''
97
+ if for_cmd:
98
+ if args.paired:
99
+ return 'paired-outdir'
100
+ else:
101
+ cfn += self.dirs('new')
102
+ if args.paired:
103
+ assert lpair is not None and locus is not None
104
+ cfn += '%s/persistent-cache-%s.csv' % ('+'.join(lpair), locus) # duplicates code in bin/partis getofn()
105
+ else:
106
+ cfn = '%s%scache-%s-partition.csv' % (cfn, '' if cfn=='' else '/', st)
107
+ return cfn
108
+ # ----------------------------------------------------------------------------------------
109
+ def all_ptn_cachefns(self): # return all of them (ok atm it's juse the one, but we used to also have the 'ref' one, and maybe will want it in the future?)
110
+ if args.paired:
111
+ return [self.ptn_cachefn(s, lpair=lp, locus=l) for s in ['new'] for lp in utils.locus_pairs[args.ig_or_tr] for l in lp]
112
+ else:
113
+ return [self.ptn_cachefn(s) for s in ['new']]
114
+ # ----------------------------------------------------------------------------------------
115
def is_prod_test(self, ptest):
    """Return True if <ptest> is 'simulate' or has 'cache-parameters' in its name."""
    if 'cache-parameters' in ptest:
        return True
    return ptest == 'simulate'
117
+ # ----------------------------------------------------------------------------------------
118
def sclust_sizes(self):  # NOTE depends on self.n_simu_leaves
    """Return the pair of cluster sizes that's passed to utils.choose_seed_unique_id() when choosing a seed sequence."""
    if args.slow:
        return (15, 20)
    return (5, 10)
120
+ # ----------------------------------------------------------------------------------------
121
def min_smetric_cluster_size(self):
    """Return the minimum cluster size used in cluster_size_args() (larger if --slow is set)."""
    if args.slow:
        return 10
    return 3
123
+ # ----------------------------------------------------------------------------------------
124
+ def cluster_size_args(self):
125
+ return ['--min-selection-metric-cluster-size', str(self.min_smetric_cluster_size()),
126
+ '--min-paired-cluster-size-to-read', str(self.min_smetric_cluster_size())]
127
+ # ----------------------------------------------------------------------------------------
128
+ def __init__(self): # set up paths, comparison tolerances, and the ordered dict of tests to run (reads the module-level <args>)
129
+ self.partis_path = 'partis' if shutil.which('partis') else '%s/bin/partis' % utils.get_partis_dir() # use version in PATH if it's there (pipx seems to leave two incompatible versions lying around)
130
+ if args.prepend_coverage:
131
+ self.partis_path = 'coverage3 run --append %s' % self.partis_path
132
+ self.datafname = 'test/mishmash.fa' # some data from adaptive, chaim, and vollmers
133
+ # generate paired data dir with: UPDATE i cat'd the ig?.fa files into all-seqs.fa (in the same dir) so extract-pair-info and split-loci get run
134
+ # - ./bin/split-loci.py /fh/fast/matsen_e/data/10x-examples/data/sc5p_v2_hs_B_prevax_10k_5gex_B_vdj_b_filtered_contig.fasta --outdir test/paired-data --input-metafname /fh/fast/matsen_e/data/10x-examples/processed-data/v0/sc5p_v2_hs_B_prevax_10k_5gex_B_vdj_b_filtered_contig/meta.yaml --n-max-queries 100 >test/paired-data/split-loci.log
135
+ # - ./bin/extract-pairing-info.py /fh/fast/matsen_e/data/10x-examples/data/sc5p_v2_hs_B_prevax_10k_5gex_B_vdj_b_filtered_contig.fasta test/paired-data/meta.yaml --n-max-queries 100 >test/paired-data/extract.log
136
+ self.paired_datafname = 'test/paired-data/all-seqs.fa'
137
+ self.input_metafname = 'test/input-meta.yaml'
138
+
139
+ self.stypes = ['ref', 'new'] # I don't know what the 's' stands for
140
+ self.dtypes = ['data', 'simu']
141
+ if not os.path.exists(self.dirs('new')): # make sure the dir for new results exists (the log file goes in it)
142
+ os.makedirs(self.dirs('new'))
143
+ self.common_extras = ['--random-seed', '1', '--n-procs', '10'] # would be nice to set --n-procs based on the machine, but for some reason the order of things in the parameter files gets shuffled a bit if you change the number of procs
144
+ self.label = 'test' # i really don't think there's any reason to have this, but i don't feel like removing it atm since it's not really causing much complication
145
+
146
+ self.tiny_eps = 1e-4
147
+ self.run_times = {} # test name : run time in seconds (filled in run())
148
+ self.eps_vals = {} # fractional difference which we allow for each test type (these were generated with the code in get_typical_variances() above)
149
+ self.eps_vals['v_call'] = 0.02 # hm, actually, I think I just made the annotation ones up
150
+ self.eps_vals['d_call'] = 0.02
151
+ self.eps_vals['j_call'] = 0.02
152
+ self.eps_vals['mean_hamming'] = 0.1
153
+ self.eps_vals['v_hamming'] = 0.1
154
+ self.eps_vals['d_hamming'] = 0.1
155
+ self.eps_vals['j_hamming'] = 0.1
156
+ self.eps_vals['cdr3_hamming'] = 0.1
157
+ self.eps_vals['purity'] = 0.08
158
+ self.eps_vals['completeness'] = 0.08
159
+
160
+ self.n_simu_leaves = 5
161
+
162
+ self.selection_metrics = ['lbi', 'lbr', 'cons-dist-aa', 'aa-lbi', 'aa-lbr'] # NOTE kind of duplicates treeutils.selection_metrics, but I want to be able to change the latter
163
+ self.pair_clean_metrics = ['correct', 'unpaired', 'mispaired'] if args.paired else []
164
+ self.expected_trees = ['tree', 'aa-tree']
165
+
166
+ self.logfname = self.dirs('new') + '/test.log'
167
+
168
+ self.tests = OrderedDict() # test name : config dict for that test (starts with just 'extras', the rest is added in fiddle_with_arguments()); run() iterates over it in insertion order
169
+
170
+ # ----------------------------------------------------------------------------------------
171
+ def add_inference_tests(input_stype): # if input_stype is 'ref', infer on old simulation and parameters, if it's 'new' use the new ones
172
+ if not args.paired:
173
+ self.tests['annotate-%s-simu'%input_stype] = {'extras' : ['--plot-annotation-performance', ]}
174
+ self.tests['multi-annotate-%s-simu'%input_stype] = {'extras' : ['--plot-annotation-performance', '--simultaneous-true-clonal-seqs']} # NOTE this is mostly different to the multi-seq annotations from the partition step because it uses the whole sample
175
+ self.tests['partition-%s-simu'%input_stype] = {'extras' : ['--plot-annotation-performance', '--max-ccf-fail-frac', '0.10']} # '--biggest-logprob-cluster-to-calculate', '5', '--biggest-naive-seq-cluster-to-calculate', '5',
176
+ if args.paired:
177
+ self.tests['subset-partition-%s-simu'%input_stype] = {'extras' : ['--max-ccf-fail-frac', '0.15']} # '--biggest-logprob-cluster-to-calculate', '5', '--biggest-naive-seq-cluster-to-calculate', '5',
178
+ # this runs ok, but i'd need to modify some things so its output is actually checked self.tests['subset-annotate-%s-simu'%input_stype] = {'extras' : ['--max-ccf-fail-frac', '0.15']} # '--biggest-logprob-cluster-to-calculate', '5', '--biggest-naive-seq-cluster-to-calculate', '5',
179
+ self.tests['seed-partition-%s-simu'%input_stype] = {'extras' : ['--max-ccf-fail-frac', '0.10']}
180
+ if not args.paired:
181
+ self.tests['vsearch-partition-%s-simu'%input_stype] = {'extras' : ['--naive-vsearch']}
182
+ self.tests['get-selection-metrics-%s-simu'%input_stype] = {'extras' : ['--existing-output-run-cfg', 'paired'] + self.cluster_size_args()} # NOTE this runs on simulation, but it's checking the inferred selection metrics
183
+
184
+ # ----------------------------------------------------------------------------------------
185
+ if args.quick:
186
+ self.tests['cache-parameters-quick-new-simu'] = {'extras' : ['--n-max-queries', str(self.nqr('quick'))]}
187
+ else:
188
+ pcache_data_args = {'extras' : ['--n-max-queries', str(self.nqr('data'))]}
189
+ pcache_simu_args = {'extras' : []}
190
+ n_events = int(self.nqr('simu') / float(self.n_simu_leaves)) # number of simulated queries divided by leaves per event
191
+ simulate_args = {'extras' : ['--n-sim-events', str(n_events), '--n-trees', str(n_events), '--n-leaf-distribution', 'geometric', '--n-leaves', str(self.n_simu_leaves)]}
192
+ if args.paired:
193
+ simulate_args['extras'] += ['--min-observations-per-gene', '5', '--mean-cells-per-droplet', '1.25', '--constant-cells-per-droplet', '--fraction-of-reads-to-remove', '0.15'] # it was crashing and this fixes it, i dunno if we should turn it on also for non-paired but whatever
194
+ if args.bust_cache: # if we're cache busting, we need to run these *first*, so that the inference tests run on a simulation file in the new dir that was just made (i.e. *not* whatever simulation file in the new dir happens to be there)
195
+ self.tests['cache-parameters-data'] = pcache_data_args
196
+ self.tests['simulate'] = simulate_args
197
+ self.tests['cache-parameters-simu'] = pcache_simu_args
198
+ add_inference_tests('new')
199
+ if not args.bust_cache: # normally (if we're not cache busting) we want to run these last, to make it super clear that the inference tests are running on the *reference* simulation file
200
+ self.tests['cache-parameters-data'] = pcache_data_args
201
+ if not args.no_simu:
202
+ self.tests['simulate'] = simulate_args
203
+
204
+ self.quick_tests = ['cache-parameters-quick-new-simu'] # this is kind of dumb to keep track of what the quick tests are in two different ways, but I only just started not adding the non-quick tests if --quick is set, and I don't want to mess with all the other places that use <self.quick_tests>
205
+
206
+ self.perfdirs = {} # set in fiddle_with_arguments() NOTE these correspond only to annotation performance, whereas <self.perf_info> has also partition performance
207
+ for ptest, argfo in self.tests.items(): # fill in the rest of each test's config dict (paths, action, more extra args)
208
+ self.fiddle_with_arguments(ptest, argfo)
209
+
210
+ self.perf_info = {version_stype : {} for version_stype in self.stypes} # one (initially empty) dict for each entry in self.stypes
211
+
212
+ # ----------------------------------------------------------------------------------------
213
+ def test(self, args): # run the tests (unless --dont-run is set), then compare the new results to the reference ones
214
+ if not args.dont_run:
215
+ self.run(args)
216
+ if args.dry_run or args.bust_cache or args.quick: # in these modes we return before comparing anything
217
+ return
218
+ self.compare_production_results(['cache-parameters-simu'])
219
+ self.compare_stuff(input_stype='new')
220
+ self.compare_production_results(['cache-parameters-data'])
221
+ if not args.no_simu:
222
+ self.compare_production_results(['simulate'])
223
+ self.compare_run_times()
224
+
225
+ # ----------------------------------------------------------------------------------------
226
+ def run_coverage(self, args): # run the tests plus a series of other scripts under 'coverage3 run --append', writing output to a new vsn-<N> subdir of args.coverage_outdir
227
+ # ----------------------------------------------------------------------------------------
228
+ def run_cmd(cmd, shell=False, logfname=None, dont_prepend=False): # run <cmd> with utils.simplerun(), prepending the coverage command unless <dont_prepend> is set
229
+ if logfname is not None:
230
+ print(' writing log to %s' % logfname)
231
+ cov_str = 'coverage3 run --append' # --data-file=%s/coverage/%d.cov (this doesn't seem to be supported in my version)
232
+ utils.simplerun('%s%s' % ('' if dont_prepend else cov_str+' ', cmd), shell=shell, logfname=logfname)
233
+ # ----------------------------------------------------------------------------------------
234
+ ivsn = 0 # incremented until vsn-<ivsn> doesn't exist yet
235
+ while True:
236
+ odir = '%s/vsn-%d' % (args.coverage_outdir, ivsn)
237
+ if os.path.exists(odir):
238
+ print(' coverage outdir %s exists, you may want to rm -r it by hand' % odir)
239
+ else:
240
+ break
241
+ ivsn += 1
242
+ cfn = '%s/.coverage' % os.getcwd() # coverage data file in the current dir, removed so --append starts from scratch
243
+ if os.path.exists(cfn):
244
+ print(' removing existing coverage file %s' % cfn)
245
+ os.remove(cfn)
246
+
247
+ run_cmd('./test/test.py --prepend-coverage', dont_prepend=True) # NOTE tests may fail because of the coverage stuff, which is fine (at the least they'll be way too slow)
248
+ run_cmd('./test/test.py --prepend-coverage --paired', dont_prepend=True) # also note that we have to put dont_prepend since recursive subprocs having coverage commands breaks things (at least before coverage 6.3)
249
+
250
+ # cp output files so that working files (e.g. tree inference output files) don't get scattered around the normal test output dir
251
+ if not os.path.exists(odir):
252
+ os.makedirs(odir)
253
+ ptnfn = '%s/%s' % (odir, utils.insert_before_suffix('-single', os.path.basename(self.opath('partition-new-simu', st='ref'))))
254
+ utils.simplerun('cp %s %s' % (self.opath('partition-new-simu', st='ref'), ptnfn))
255
+ pair_ptndir = '%s/%s-paired' % (odir, os.path.basename(self.opath('partition-new-simu', st='ref', force_paired=True)))
256
+ utils.simplerun('cp -r %s %s' % (self.opath('partition-new-simu', st='ref', force_paired=True), pair_ptndir))
257
+
258
+ for ft in ['csv', 'fa', 'yaml']:
259
+ run_cmd('./bin/parse-output.py %s %s/parse-output.%s' % (ptnfn, odir, ft))
260
+ run_cmd('./bin/parse-output.py %s %s/parse-output-paired --paired' % (pair_ptndir, odir))
261
+ run_cmd('./bin/cf-alleles.py --bases all', logfname='%s/cf-alleles.log'%odir)
262
+ run_cmd('./bin/cf-alleles.py --bases 8-51-1', logfname='%s/cf-alleles-8-51.log'%odir)
263
+
264
+ run_cmd('./bin/partis view-output --outfname %s' % ptnfn, logfname='%s/view-output.log'%odir)
265
+ run_cmd('./bin/partis view-output --paired-loci --paired-outdir %s' % pair_ptndir, logfname='%s/view-output-paired.log'%odir)
266
+
267
+ run_cmd('./bin/cf-germlines.py %s/hmm/germline-sets %s/hmm/germline-sets' % (self.paramdir('ref', 'simu'), self.paramdir('ref', 'data')), logfname='%s/cf-germlines.log'%odir)
268
+ run_cmd('./bin/compare-plotdirs.py --outdir %s/compare-plotdirs --plotdirs %s/hmm/mutation:%s/sw/mutation --names hmm:sw' % (odir, self.opath('annotate-new-simu-annotation-performance', st='ref').replace('.yaml', ''), self.opath('annotate-new-simu-annotation-performance', st='ref').replace('.yaml', '')))
269
+
270
+ ptn_plot_cmd = './bin/partis plot-partitions --partition-plot-cfg mds:trees --tree-inference-method iqtree --cluster-indices 0:2 %s' % ' '.join(self.cluster_size_args())
271
+ run_cmd('%s --outfname %s --plotdir %s/plot-partitions' % (ptn_plot_cmd, ptnfn, odir))
272
+ run_cmd('%s --paired-loci --paired-outdir %s --plotdir %s/plot-partitions-paired' % (ptn_plot_cmd, pair_ptndir, odir))
273
+
274
+ run_cmd('./bin/plot-hmms.py --outdir %s/plot-hmms --infiles %s' % (odir, ':'.join(glob.glob('%s/hmm/hmms/IGHD1*.yaml'%self.paramdir('ref', 'data')))))
275
+
276
+ gct_sm_cmd = './bin/partis get-selection-metrics --tree-inference-method gctree %s --cluster-indices 0:2' % ' '.join(self.cluster_size_args())
277
+ run_cmd('%s --outfname %s --plotdir %s/gctree-smetric-plots' % (gct_sm_cmd, ptnfn, odir))
278
+ run_cmd('./bin/read-gctree-output.py --locus igh --species human --gctreedir %s/gctree/iclust-0 --outdir %s/read-gctree-output' % (utils.getprefix(ptnfn), odir)) # NOTE this uses output from the previous line
279
+ run_cmd('%s --paired-loci --paired-outdir %s --plotdir %s/paired-gctree-smetric-plots' % (gct_sm_cmd, pair_ptndir, odir))
280
+
281
+ run_cmd('./bin/bcr-phylo-run.py --base-outdir %s/bcr-phylo-run' % odir) # don't use multiple gc rounds here, since we need the tree in the next line (and the tree isn't written for multiple gc rounds)
282
+ run_cmd('./bin/smetric-run.py --infname %s/bcr-phylo-run/selection/simu/mutated-simu.yaml --base-plotdir %s/smetric-run --metric-method lbi' % (odir, odir)) # NOTE uses results from previous line
283
+ run_cmd('./bin/bcr-phylo-run.py --base-outdir %s/bcr-phylo-run-paired --paired --n-gc-rounds 3 --obs-times 30:5,10:5' % odir)
284
+ run_cmd('./test/cf-paired-loci.py --label coverage --version v0 --n-replicates 2 --n-sub-procs 10 --scratch-mute-freq-list 0.01:0.1 --simu-extra-args=\"--flat-mute-freq --same-mute-freq-for-all-seqs --mutate-stop-codons\" --final-plot-xvar scratch-mute-freq --n-leaves-list 3 --n-sim-events-list 100 --single-light-locus igk --base-outdir %s/cf-paired-loci --perf-metrics precision:sensitivity --actions simu:cache-parameters:partition:plot:combine-plots' % odir, shell=True)
285
+
286
+ # it'd be nice to add germline inference to the normal (non-coverage) tests, but doing more than a trivial test like this requires lots of sequences, which is slower than I really want to add to testing
287
+ run_cmd('./bin/test-germline-inference.py --prepend-coverage-command --n-sim-events 1000 --outdir %s/test-germline-inference --sim-v-genes=IGHV1-18*01 --inf-v-genes=IGHV1-18*01 --snp-positions 27,55,88 --mutation-multiplier 0.00001 --seed 1' % odir, dont_prepend=True)
288
+
289
+ print('now run: coverage3 report --omit=python/__init__.py') # could automate this, but whatever
290
+
291
+ # ----------------------------------------------------------------------------------------
292
+ def fiddle_with_arguments(self, ptest, argfo): # add input/parameter paths, the partis action, and implied extra args to <argfo> (the config dict for test <ptest>), modifying it in place; also fills self.perfdirs
293
+ input_stype, input_dtype = self.get_stypes(ptest)
294
+ argfo['input_stype'] = input_stype
295
+ argfo['bin'] = self.partis_path
296
+
297
+ if ptest == 'simulate':
298
+ argfo['parameter-dir'] = self.paramdir(input_stype, 'data') # simulation uses the parameters from data
299
+ else:
300
+ argfo['inpath'] = self.inpath('new' if args.bust_cache else 'ref', input_dtype)
301
+ if ptest.find('subset-') != 0: # i.e. test name doesn't start with 'subset-'
302
+ argfo['parameter-dir'] = self.paramdir(input_stype, input_dtype)
303
+ if not args.paired:
304
+ argfo['sw-cachefname'] = self.paramdir(input_stype, input_dtype) + '/sw-cache.yaml'
305
+ if ptest != 'simulate' and input_dtype == 'simu' and not args.quick:
306
+ argfo['extras'] += ['--is-simu']
307
+
308
+ if 'annotate' in ptest: # work out the partis action from the test name
309
+ argfo['action'] = 'annotate'
310
+ elif 'partition' in ptest:
311
+ argfo['action'] = 'partition'
312
+ if not args.paired: # eh, i don't think there's really a reason to do this for paired (although I partially implemented -- i got the files in single-chain/, but then getting the igh+igk/ etc. was going to be more work)
313
+ argfo['extras'] += ['--persistent-cachefname', self.ptn_cachefn(input_stype, for_cmd=True)]
314
+ elif 'get-selection-metrics' in ptest:
315
+ argfo['action'] = 'get-selection-metrics' # could really remove almost all of the arguments, mostly just need --outfname
316
+ elif 'cache-parameters-' in ptest:
317
+ argfo['action'] = 'cache-parameters'
318
+ else:
319
+ argfo['action'] = ptest
320
+ if ptest.find('subset-') == 0: # i.e. test name starts with 'subset-'
321
+ argfo['action'] = 'subset-%s' % argfo['action']
322
+ argfo['extras'] += ['--n-subsets', '2']
323
+
324
+ if '--plot-annotation-performance' in argfo['extras']:
325
+ self.perfdirs[ptest] = ptest + '-annotation-performance'
326
+ argfo['extras'] += ['--plotdir', self.dirs('new') + '/' + self.perfdirs[ptest]]
327
+
328
+ if '--plotdir' in argfo['extras']:
329
+ argfo['extras'] += ['--only-csv-plots']
330
+ if 'partition' in ptest:
331
+ argfo['extras'] += ['--no-partition-plots']
332
+
333
+ if '-data' in ptest and not args.paired: # would be cleaner to check that inpath is self.datafname
334
+ argfo['extras'] += ['--input-metafnames', self.input_metafname]
335
+
336
+ # ----------------------------------------------------------------------------------------
337
+ def compare_stuff(self, input_stype): # read performance info for each code version, then compare them (plus the partition cache files if --paired isn't set)
338
+ print('%s input' % input_stype)
339
+ for version_stype in self.stypes: # <version_stype> is the code version, i.e. 'ref' is the reference results, 'new' is the results we just made with the new code
340
+ self.read_annotation_performance(version_stype, input_stype)
341
+ self.read_partition_performance(version_stype, input_stype) # NOTE also calls read_annotation_performance()
342
+ self.read_selection_metric_performance(version_stype, input_stype)
343
+ self.compare_performance(input_stype)
344
+ if not args.paired:
345
+ self.compare_partition_cachefiles(input_stype)
346
+
347
+ # ----------------------------------------------------------------------------------------
348
+ def prepare_to_run(self, args, name, info): # <name> is the test name and <info> its config dict; may append seed args to info['extras'] and remove files/dirs left by previous runs
349
+ """ Pre-run stuff that you don't want to do until *right* before you actually run. """
350
+ # ----------------------------------------------------------------------------------------
351
+ def clean_dir(sdir): # rm whole seed dir to make sure the dir for the previous seed id doesn't hang around
352
+ if args.dry_run:
353
+ print(' would rm %s' % sdir)
354
+ else: # maybe i should just rm the output dir for every test before running? although it might get complicated since some of them i think share dirs
355
+ print(' removing %s' % sdir)
356
+ shutil.rmtree(sdir)
357
+ # ----------------------------------------------------------------------------------------
358
+ def rm_file(fn): # for search: remove (with --dry-run, <fn> is just added to files_to_rm)
359
+ if args.dry_run:
360
+ files_to_rm.append(fn)
361
+ else:
362
+ check_call(['rm', '-v', fn])
363
+ # ----------------------------------------------------------------------------------------
364
+ files_to_rm = [] # just for dbg
365
+ # delete any old partition cache files
366
+ if name == 'partition-' + info['input_stype'] + '-simu':
367
+ cachefnames = ['%s/%s' % (self.dirs('new'), f) for f in self.all_ptn_cachefns()]
368
+ for cfn in [f for f in cachefnames if os.path.exists(f)]:
369
+ rm_file(cfn)
370
+ # and any old tree inference files
371
+ if name == 'get-selection-metrics-' + info['input_stype'] + '-simu':
372
+ tfns = [] # tree inference working files found below the partition output path
373
+ for subd in ['', '/*+*/partition-*']:
374
+ for tmeth in ['fasttree', 'iqtree']:
375
+ for ftp in ['fasttree.out', 'log*', 'input-seqs.fa']:
376
+ tfns += glob.glob('%s%s/%s/iclust-*/%s' % (self.opath('partition-new-simu', st='new'), subd, tmeth, ftp))
377
+ for ffn in [f for f in tfns if os.path.exists(f)]:
378
+ rm_file(ffn)
379
+ if len(files_to_rm) > 0:
380
+ print(' would rm %d tree inference working files' % len(files_to_rm))
381
+
382
+ # choose a seed uid
383
+ if name == 'seed-partition-' + info['input_stype'] + '-simu':
384
+ ifn = info['inpath']
385
+ seed_uid, _ = utils.choose_seed_unique_id(ifn, self.sclust_sizes()[0], self.sclust_sizes()[1], paired=args.paired) # , n_max_queries=self.nqr('partition')
386
+ if args.paired:
387
+ seed_uid, seed_loci = seed_uid # with paired=True the first return value is unpacked into uids and loci
388
+ info['extras'] += ['--seed-unique-id', ':'.join(seed_uid), '--seed-loci', ':'.join(seed_loci)]
389
+ sdir = '%s/seeds' % self.opath(name, st=info['input_stype'])
390
+ if os.path.exists(sdir):
391
+ clean_dir(sdir)
392
+ else:
393
+ info['extras'] += ['--seed-unique-id', seed_uid]
394
+
395
+ if name.find('subset-') == 0: # i.e. test name starts with 'subset-': remove any output left over from a previous run
396
+ if os.path.exists(self.opath(name, st='new')):
397
+ clean_dir(self.opath(name, st='new'))
398
+
399
+ # ----------------------------------------------------------------------------------------
400
def run(self, args):
    """Run each test in self.tests as a subprocess, sending its output to self.logfname.

    For each test this builds the command line from the test's config dict, prints it (truncated to args.print_width), and, unless --dry-run is set, runs it through the shell and stores the elapsed time in self.run_times. If a command exits with nonzero status, the tail of the log file is printed and we exit with status 1. Run times are written at the end unless --quick or --dry-run is set.

    NOTE(review): the block nesting here was reconstructed from a listing with the indentation stripped -- confirm against the upstream file.
    """
    if not args.dry_run:
        open(self.logfname, 'w').close()  # start with an empty log file

    os.environ['PYTHONHASHSEED'] = '0'  # turn off hash seed randomization for repeatable results
    for name, info in self.tests.items():
        if args.quick and name not in self.quick_tests:
            continue

        self.prepare_to_run(args, name, info)

        _, idt = self.get_stypes(name)
        cmd_str = info['bin'] + ' ' + info['action'] + ' --dont-write-git-info'
        if args.prepend_coverage:
            cmd_str += ' --prepend-coverage-command'
        if args.paired:
            cmd_str += ' --paired-loci'
        for tkey in ['inpath', 'parameter-dir', 'sw-cachefname']:
            if tkey in info:
                cmd_str += ' --%s %s' % (tkey if tkey != 'inpath' else self.astr('in', dt=idt), info[tkey])
        cmd_str += ' %s' % ' '.join(info['extras'] + self.common_extras)

        if name == 'simulate':
            cmd_str += ' --%s %s' % (self.astr('out'), self.inpath('new', 'simu'))
            cmd_str += ' --indel-frequency 0.2'  # super high indel freq, partly because we want to make sure we have some even with the new smaller default number of seqs
        elif 'get-selection-metrics-' in name:
            # this reads the output of the corresponding partition test, so point it there and drop the input-related args
            cmd_str += ' --%s %s' % (self.astr('out'), self.opath(name.replace('get-selection-metrics-', 'partition-'), st='new'))
            cmd_str += ' --%s %s' % ('chosen-ab-fname' if args.paired else 'selection-metric-fname', self.opath(name, st='new'))
            if args.slow:
                cmd_str += ' --ab-choice-cfg %s/test/ab-choice-slow.yaml' % utils.get_partis_dir()
            clist = cmd_str.split()
            utils.remove_from_arglist(clist, '--%s'%self.astr('in', dt=idt), has_arg=True)
            utils.remove_from_arglist(clist, '--parameter-dir', has_arg=True)
            utils.remove_from_arglist(clist, '--sw-cachefname', has_arg=True)
            utils.remove_from_arglist(clist, '--is-simu')
            cmd_str = ' '.join(clist)
        elif 'cache-parameters-' not in name:
            cmd_str += ' --%s %s' % (self.astr('out'), self.opath(name, st='new'))

        logstr = '%s %s' % (utils.color('green', name, width=30, padside='right'), cmd_str)
        print(logstr if utils.len_excluding_colors(logstr) < args.print_width else logstr[:args.print_width] + '[...]')
        if args.dry_run:
            continue
        with open(self.logfname, 'a') as logfile:  # context manager so the handle is closed even if the write fails
            logfile.write(logstr + '\n')
        start = time.time()
        try:
            check_call(cmd_str + ' 1>>' + self.logfname + ' 2>>' + self.logfname, shell=True)
            if args.quick:
                print('\n %s' % 'ok')
        except CalledProcessError:  # the exception itself just says it exited with code != 0, so print the end of the log instead
            print(' log tail: %s' % self.logfname)
            print(utils.pad_lines(check_output(['tail', self.logfname], universal_newlines=True)))
            sys.exit(1)  # raise Exception('exited with error')
        self.run_times[name] = time.time() - start  # seconds

    if not args.quick and not args.dry_run:
        self.write_run_times()
461
+
462
+ # ----------------------------------------------------------------------------------------
463
def remove_reference_results(self, expected_content):
    """ Remove each entry of <expected_content> (a set of base names) from the reference dir.

    Nothing is removed unless the reference dir's content exactly matches <expected_content>:
    any unexpected or missing entry is printed and an Exception is raised. Uses the module-level
    <args>; with args.dry_run set the removals are only printed.
    """
    print(' removing ref files')
    dir_content = set(os.path.basename(f) for f in glob.glob(self.dirs('ref') + '/*'))
    unexpected = dir_content - expected_content
    missing = expected_content - dir_content
    if unexpected:
        print('in ref dir but not expected\n %s' % (utils.color('red', ' '.join(unexpected))))
    if missing:
        print('expected but not in ref dir\n %s' % (utils.color('red', ' '.join(missing))))
    if unexpected or missing:
        raise Exception('unexpected or missing content in reference dir (see above)')

    would_str = '(would) ' if args.dry_run else ''
    for fname in [self.dirs('ref') + '/' + ec for ec in expected_content]:
        print(' %srm %s' % (would_str, fname))
        if args.dry_run:
            continue
        remove_fcn = shutil.rmtree if os.path.isdir(fname) else os.remove  # dirs need a recursive remove
        remove_fcn(fname)
480
+
481
+ # ----------------------------------------------------------------------------------------
482
def bust_cache(self):
    """ Replace the reference results with the new results.

    Works out the set of names that should be in the reference dir, removes them with
    self.remove_reference_results() (which raises if the reference dir content doesn't match),
    then moves each of them from the new dir to the reference dir. Uses the module-level <args>;
    with args.dry_run set the moves are only printed.
    """
    non_prod_outputs = [self.opath(tname) for tname in self.tests if not self.is_prod_test(tname)]
    other_names = [os.path.basename(self.logfname), self.label]
    expected_content = set(non_prod_outputs + list(self.perfdirs.values()) + other_names)
    if not args.paired:
        expected_content |= set(self.all_ptn_cachefns())  # they're in the partition outdir if --paired is set, so don't need to be moved
    expected_content.add('run-times.csv')

    # remove (very, very gingerly) whole reference dir
    self.remove_reference_results(expected_content)

    # copy over parameters, simulation, and plots
    print(' mv new files to ref')
    for fname in expected_content:
        print(' mv %s --> %s/' % (fname, self.dirs('ref')))
        if not args.dry_run:
            shutil.move(self.dirs('new') + '/' + fname, self.dirs('ref') + '/')
499
+
500
+ # ----------------------------------------------------------------------------------------
501
def read_annotation_performance(self, version_stype, input_stype, these_are_cluster_annotations=False):
    """ Read annotation performance for 'single' and then 'multi' sequence multiplicity (all other args are passed straight through). """
    self.read_each_annotation_performance('single', version_stype, input_stype, these_are_cluster_annotations=these_are_cluster_annotations)
    self.read_each_annotation_performance('multi', version_stype, input_stype, these_are_cluster_annotations=these_are_cluster_annotations)
504
+
505
+ # ----------------------------------------------------------------------------------------
506
def read_each_annotation_performance(self, sequence_multiplicity, version_stype, input_stype, these_are_cluster_annotations=False):  # <these_are_cluster_annotations> means this fcn is being called from within read_partition_performance()
    """ version_stype is the code version, while input_stype is the input data version, i.e. 'ref', 'new' is the reference code version (last commit) run on the then-new simulation and parameters

    Reads the mean of each performance histogram csv for the relevant test and stores them in
    self.perf_info[version_stype][input_stype][<test name>][<method>]. Returns without doing anything
    if the test isn't in self.tests (or, with args.quick set, isn't in self.quick_tests).
    """
    # work out which test (and which annotation methods) we're reading
    if these_are_cluster_annotations:
        test_prefix, methods = 'partition', ['hmm']
    elif sequence_multiplicity == 'single':
        test_prefix, methods = 'annotate', ['sw', 'hmm']
    elif sequence_multiplicity == 'multi':
        test_prefix, methods = 'multi-annotate', ['hmm']
    else:
        assert False
    ptest = test_prefix + '-' + input_stype + '-simu'
    if (args.quick and ptest not in self.quick_tests) or ptest not in self.tests:  # ok i think now i'm adding the second clause i don't need the first, but not quite sure
        return

    # make sure all the parent dicts exist
    vinfo = self.perf_info[version_stype]
    if input_stype not in vinfo:
        vinfo[input_stype] = OrderedDict()
    if ptest not in vinfo[input_stype]:
        vinfo[input_stype][ptest] = OrderedDict()
    perfdir = self.dirs(version_stype) + '/' + self.perfdirs[ptest]
    perffo = vinfo[input_stype][ptest]

    for method in methods:
        mutdir = perfdir + '/' + method + '/mutation/'
        bounddir = perfdir + '/' + method + '/boundaries/'
        mfo = OrderedDict()  # arg, this is deeper than I'd like
        perffo[method] = mfo
        mfo['mean_hamming'] = Hist(fname=mutdir + 'hamming_to_true_naive.csv').get_mean()
        for region in utils.regions + ['cdr3']:
            mfo[region + '_hamming'] = Hist(fname=mutdir + region + '_hamming_to_true_naive.csv').get_mean()
        for bound in utils.boundaries:
            mfo[bound + '_insertion'] = Hist(fname=bounddir + bound + '_insertion.csv').get_mean(absval=True)
        for erosion in utils.real_erosions:
            mfo[erosion + '_del'] = Hist(fname=bounddir + erosion + '_del.csv').get_mean(absval=True)
536
+
537
+ # ----------------------------------------------------------------------------------------
538
def do_this_test(self, tstr, input_stype, pt):
    """ Return True if test name <pt> contains both <tstr> and <input_stype>, and (when the module-level args.quick is set) is also in self.quick_tests. """
    if tstr not in pt or input_stype not in pt:
        return False
    return not (args.quick and pt not in self.quick_tests)
546
+
547
+ # ----------------------------------------------------------------------------------------
548
def read_partition_performance(self, version_stype, input_stype, debug=False):
    """ Read new partitions from self.dirs('new'), and put the comparison numbers in self.perf_info (compare either to true, for simulation, or to the partition in reference dir, for data).

    For each partition test matching <input_stype>, stores 'purity' and 'completeness' in
    self.perf_info[version_stype][input_stype][<test name>]. With the module-level args.paired set,
    these are averaged over loci, and (for non-seed tests) the pair cleaning fractions in
    self.pair_clean_metrics are stored as well.
    """
    # ----------------------------------------------------------------------------------------
    def read_cpath(fname):
        """ Return the ccfs of the best partition in <fname> (raises if either is None). """
        _, _, cpath = utils.read_yaml_output(fname=fname, skip_annotations=True)
        ccfs = cpath.ccfs[cpath.i_best]
        if None in ccfs:
            raise Exception('none type ccf read from %s' % fname)
        if debug:
            print(' %5.2f %5.2f %-28s to true partition' % (ccfs[0], ccfs[1], fname))  # os.path.basename(fname))
        return ccfs
    # ----------------------------------------------------------------------------------------
    ptest_list = [k for k in self.tests.keys() if self.do_this_test('partition', input_stype, k)]
    if len(ptest_list) == 0:
        return
    if input_stype not in self.perf_info[version_stype]:
        self.perf_info[version_stype][input_stype] = OrderedDict()
    pinfo = self.perf_info[version_stype][input_stype]
    if debug:
        print(' version %s input %s partitioning' % (version_stype, input_stype))
        print(' purity completeness test description')
    for ptest in ptest_list:
        if ptest not in pinfo:
            pinfo[ptest] = OrderedDict()
        if args.paired:
            l_ccfs = []  # one pair of ccfs for each locus that we find output for
            for locus in utils.sub_loci(args.ig_or_tr):
                ofn = '%s/partition-%s.yaml' % (self.opath(ptest, st=version_stype), locus)
                if 'seed-' in ptest:  # seed output is in a subdir of seeds/
                    sfns = glob.glob(ofn.replace('/partition-', '/seeds/*/partition-'))
                    if len(sfns) == 0:  # non-seed light chain
                        continue
                    if len(sfns) > 1:
                        # NOTE this used to have "+ '/seeds/'" after the % operand, which (since % binds tighter than +) tacked a stray '/seeds/' onto the end of the message
                        raise Exception('multiple seed subdirs in %s/seeds/, you probably changed the seed seq and need to delete the old dir (yes --bust-cache should do this, but doesn\'t atm)' % os.path.dirname(ofn))
                    ofn = sfns[0]
                l_ccfs.append(read_cpath(ofn))
            # average purity and completeness over loci
            pinfo[ptest]['purity'], pinfo[ptest]['completeness'] = [numpy.mean(lcfs) for lcfs in zip(*l_ccfs)]
            if 'seed-' not in ptest:
                htmp = Hist(fname='%s/true-pair-clean-performance.csv' % self.opath(ptest, st=version_stype))
                ttot = htmp.integral(False)
                for pcat in self.pair_clean_metrics:  # convert each category's bin content to a fraction of the total
                    pinfo[ptest][pcat] = htmp.bin_contents[htmp.find_bin(None, label=pcat)] / float(ttot)
        else:
            ccfs = read_cpath(self.opath(ptest, st=version_stype))  # self.dirs(version_stype) + '/' + ptest + '.yaml')
            pinfo[ptest]['purity'], pinfo[ptest]['completeness'] = ccfs
        if ptest in self.perfdirs:
            self.read_each_annotation_performance('single', version_stype, input_stype, these_are_cluster_annotations=True)
595
+
596
+ # ----------------------------------------------------------------------------------------
597
def read_selection_metric_performance(self, version_stype, input_stype, debug=False):
    """ Read selection metric values (and, if the module-level args.paired is set, the chosen abs) for each
    selection metric test matching <input_stype>, and store them in self.perf_info[version_stype][input_stype].
    """
    # ----------------------------------------------------------------------------------------
    def read_smfile(fname, smfo):
        """ Append to smfo[<metric>] the values for every cluster in yaml file <fname> (prints a warning and returns if it doesn't exist). """
        if not os.path.exists(fname):  # probably e.g. igh+igl for a sample with only igh+igk
            print(' %s selection metric output file doesn\'t exist: %s' % (utils.wrnstr(), fname))
            return
        # yaml.CLoader only exists if pyyaml was built with the libyaml bindings, so fall back to the pure python loader
        loader = getattr(yaml, 'CLoader', yaml.Loader)
        with open(fname) as yfile:
            lbfos = yaml.load(yfile, Loader=loader)
        for metric in self.selection_metrics:
            for lbfo in lbfos:  # one lbfo for each cluster
                smfo[metric] += list(lbfo['lb'][metric].values())
        if debug:
            print(' read lbfos for %d cluster%s from %s' % (len(lbfos), utils.plural(len(lbfos)), fname))
    # ----------------------------------------------------------------------------------------
    def read_chosen_abs(fname):
        """ Return the set of (h_id, l_id) pairs in csv file <fname>. """
        with open(fname) as chfile:
            chlines = list(csv.DictReader(chfile))
        return set((l['h_id'], l['l_id']) for l in chlines)  # not really proper to key only by h_id, but the pairing info shouldn't be able to change, right?
    # ----------------------------------------------------------------------------------------
    pinfo = self.perf_info[version_stype][input_stype]
    if debug:
        print(' version %s input %s selection metrics' % (version_stype, input_stype))
    ptest_list = [k for k in self.tests.keys() if self.do_this_test('get-selection', input_stype, k)]
    for ptest in ptest_list:
        if ptest not in pinfo:  # perf_info should already have all the parent keys cause we run read_partition_performance() first
            pinfo[ptest] = OrderedDict([(m, []) for m in self.selection_metrics])
        if args.paired:
            # the selection metric files are in the corresponding partition test's output dir, with a subdir for each locus pair
            partition_dir = self.opath(ptest.replace('get-selection-metrics', 'partition'), st=version_stype)
            for lpair in utils.locus_pairs[args.ig_or_tr]:
                for locus in lpair:
                    smfname = '%s/%s/partition-%s-selection-metrics.yaml' % (partition_dir, '+'.join(lpair), locus)
                    read_smfile(smfname, pinfo[ptest])
            if debug:
                print(' total values read: %s' % ' '.join('%s %d'%(m, len(pinfo[ptest][m])) for m in self.selection_metrics))
            pinfo[ptest]['chosen-abs'] = read_chosen_abs(self.opath(ptest, st=version_stype))
        else:
            read_smfile(self.opath(ptest, st=version_stype), pinfo[ptest])
633
+
634
+ # ----------------------------------------------------------------------------------------
635
def compare_performance(self, input_stype):
    """ Print a comparison of the reference and new performance numbers in self.perf_info for <input_stype>.

    Prints three tables: annotation metrics, partition metrics, and selection metrics (plus, if the
    module-level args.paired is set, a summary of the chosen abs). Each value is printed as the reference
    value, followed by the new value (colored) if the fractional change is larger than self.tiny_eps.
    Raises an Exception if the reference and new info for a test have different keys.
    """
    # ----------------------------------------------------------------------------------------
    def print_comparison_str(ref_val, new_val, epsval, fw=7, dp=3, pm=False):
        """ Print <ref_val>, then '--> <new_val>' in red if the fractional change is more than <epsval>, or yellow if it's more than self.tiny_eps. """
        fractional_change = 0. if ref_val == 0. else (new_val - ref_val) / float(ref_val)  # NOTE not the abs value yet
        if abs(fractional_change) > epsval:
            color = 'red'
        elif abs(fractional_change) > self.tiny_eps:
            color = 'yellow'
        else:
            color = None
        fmstr = '%%-%d.%df' % (fw, dp)
        if pm:  # always print the sign
            fmstr = fmstr.replace('%', '%+')
        newstr = (fw+4)*' ' if color is None else utils.color(color, '--> %s' % (fmstr % new_val))
        print(' %s%s ' % (fmstr % ref_val, newstr), end=' ')
    # ----------------------------------------------------------------------------------------
    def check_same_keys(ptest):
        """ Raise if the ref and new info for <ptest> don't have the same keys. """
        if set(refpfo[ptest]) != set(newpfo[ptest]):
            raise Exception('different metrics in ref vs new:\n %s\n %s' % (sorted(refpfo[ptest]), sorted(newpfo[ptest])))
    # ----------------------------------------------------------------------------------------
    def print_row_label(metric):
        """ Print the label at the start of the row for <metric>. """
        alignstr = '' if len(metricstrs.get(metric, metric).strip()) < 5 else '-'
        print(('%8s %' + alignstr + '9s') % ('', metricstrs.get(metric, metric)), end=' ')

    # ----------------------------------------------------------------------------------------
    print(' performance with %s simulation and parameters (smaller is better for all annotation metrics)' % input_stype)
    all_annotation_ptests = ['annotate-' + input_stype + '-simu', 'multi-annotate-' + input_stype + '-simu', 'partition-' + input_stype + '-simu']  # hard code for order
    all_partition_ptests = [flavor + 'partition-' + input_stype + '-simu' for flavor in ['', 'vsearch-', 'seed-', 'subset-']]
    annotation_ptests = [pt for pt in all_annotation_ptests if pt in self.perf_info['ref'][input_stype]]
    partition_ptests = [pt for pt in all_partition_ptests if pt in self.perf_info['ref'][input_stype]]
    selection_metric_tests = ['get-selection-metrics-'+input_stype+'-simu']
    metricstrs = {
        'mean_hamming' : 'hamming',
        'v_hamming' : 'v ',
        'd_hamming' : 'd ',
        'j_hamming' : 'j ',
        'cdr3_hamming' : 'cdr3 ',
        'vd_insertion' : 'vd insert',
        'dj_insertion' : 'dj insert',
        'd_call' : 'd ',
        'j_call' : 'j ',
        'completeness' : 'compl.',
        'cons-dist-aa' : 'aa-cdist',
        'correct' : 'pair clean correct',
        'mispaired' : ' mispaired',
        'unpaired' : ' unpaired',
    }
    refpfo, newpfo = [self.perf_info[st][input_stype] for st in ['ref', 'new']]

    # print annotation header
    print('%8s %9s' % ('', ''), end=' ')
    for ptest in annotation_ptests:
        for method in [m for m in refpfo[ptest] if m in ['sw', 'hmm']]:  # 'if' is just to skip purity and completeness
            printstr = method
            if 'multi-annotate' in ptest:
                printstr = 'multi %s' % method
            if 'partition' in ptest:
                printstr = 'partition %s' % method
            print(' %-15s' % printstr, end=' ')
    print('')

    # print values
    # NOTE the first clause is so we don't crash with an index error if there's no annotation tests at all
    if len(annotation_ptests) > 0 and 'hmm' in refpfo[annotation_ptests[0]]:  # it's not in there for paired partition test, since (at least atm) we don't do annotation tests for it
        for metric in list(refpfo[annotation_ptests[0]]['hmm']):
            print_row_label(metric)
            for ptest in annotation_ptests:
                for method in [m for m in refpfo[ptest] if m in ['sw', 'hmm']]:  # 'if' is just to skip purity and completeness
                    check_same_keys(ptest)
                    print_comparison_str(refpfo[ptest][method][metric], newpfo[ptest][method][metric], self.eps_vals.get(metric, 0.1))
            print('')

    # print partition header
    print('%8s %5s' % ('', ''), end=' ')
    for ptest in partition_ptests:
        print(' %-18s' % ptest.split('-')[0], end=' ')
    print('')
    for metric in ['purity', 'completeness'] + self.pair_clean_metrics:
        print_row_label(metric)
        for ptest in partition_ptests:
            if 'seed-' in ptest and metric in self.pair_clean_metrics:  # ick
                continue
            check_same_keys(ptest)
            print_comparison_str(refpfo[ptest][metric], newpfo[ptest][metric], self.eps_vals.get(metric, 0.1))
        print('')

    # selection metrics
    print(' %s' % ''.join(['%-23s'%metricstrs.get(m, m) for m in self.selection_metrics]))
    for mfname, mfcn in [('mean', numpy.mean), ('min', min), ('max', max), ('len', len)]:
        print(' %5s' % mfname, end=' ')
        for metric in self.selection_metrics:
            for ptest in selection_metric_tests:  # this'll break if there's more than one selection metric ptest
                check_same_keys(ptest)
                ref_list, new_list = refpfo[ptest][metric], newpfo[ptest][metric]
                dp = 1 if metric=='cons-dist-aa' else 3
                if mfname=='len':
                    dp = 0
                print_comparison_str(mfcn(ref_list), mfcn(new_list), self.eps_vals.get(metric, 0.1), dp=dp, pm=metric=='cons-dist-aa')
        print('')
    if args.paired:
        ptest = utils.get_single_entry(selection_metric_tests)
        ref_abs, new_abs = refpfo[ptest]['chosen-abs'], newpfo[ptest]['chosen-abs']
        n_ref, n_new = len(ref_abs), len(new_abs)
        n_common = len(ref_abs & new_abs)
        n_only_ref, n_only_new = len(ref_abs - new_abs), len(new_abs - ref_abs)
        diffstr = ' ok'
        if n_ref != n_new or n_only_ref > 0 or n_only_new > 0:
            diffstr = ' %s in common, %s only in ref, %s only in new' % (utils.color(None if n_common==n_ref else 'red', str(n_common)), utils.color(None if n_only_ref==0 else 'red', str(n_only_ref)), utils.color(None if n_only_new==0 else 'red', str(n_only_new)))
        print(' chose %d abs %s%s' % (n_ref, '' if n_new==n_ref else utils.color('red', '--> %d'%n_new), diffstr))
745
+
746
+ # ----------------------------------------------------------------------------------------
747
def compare_production_results(self, ptests):
    """ Run 'diff -qbr' between the ref and new outputs of each test in <ptests>, and print a summary of any differences.

    With the module-level args.quick set, tests that aren't in self.quick_tests are skipped.
    """
    print('diffing production results')
    for ptest in ptests:
        if args.quick and ptest not in self.quick_tests:
            continue
        fname = self.opath(ptest)  # sometimes a dir rather than a file
        print(' %-30s' % fname, end=' ')
        cmd = 'diff -qbr ' + ' '.join(self.dirs(st) + '/' + fname for st in self.stypes)
        proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True)
        out, err = proc.communicate()
        if proc.returncode == 0:
            print(' ok')
            continue

        outlines = out.split('\n')
        differlines = [l for l in outlines if 'differ' in l]
        onlylines = [l for l in outlines if 'Only' in l]
        print('')
        if len(differlines) > 0:
            n_total_files = int(check_output('find ' + self.dirs('ref') + '/' + fname + ' -type f | wc -l', shell=True, universal_newlines=True))
            if n_total_files == 1:
                assert len(differlines) == 1
                print(utils.color('red', ' file differs'), end=' ')
            else:
                print(utils.color('red', ' %d / %d files differ' % (len(differlines), n_total_files)), end=' ')
        if len(onlylines) > 0:
            for st in self.stypes:
                theseonlylines = [l for l in onlylines if self.dirs(st) + '/' + fname in l]
                if len(theseonlylines) > 0:
                    print(utils.color('red', ' %d files only in %s' % (len(theseonlylines), st)), end=' ')
        # NOTE this used to compare the lists themselves to 0, which is never true, so this message was never printed
        if len(differlines) == 0 and len(onlylines) == 0:
            print(utils.color('red', ' not sure why, but diff returned %d' % proc.returncode), end=' ')
        print(' (%s)' % cmd)
        if err != '':
            print(err)
780
+
781
+ # ----------------------------------------------------------------------------------------
782
def write_run_times(self):
    """ Write the contents of self.run_times to run-times.csv in the new dir, with columns 'name' and 'seconds' (one decimal place). """
    outfname = self.dirs('new') + '/run-times.csv'
    with open(outfname, utils.csv_wmode()) as newfile:
        writer = csv.DictWriter(newfile, ('name', 'seconds'))
        writer.writeheader()
        writer.writerows({'name' : tname, 'seconds' : '%.1f'%tsec} for tname, tsec in self.run_times.items())
788
+
789
+ # ----------------------------------------------------------------------------------------
790
def compare_run_times(self):
    """ Print the reference run time for each test, along with the new run time if it differs by more than 10% (yellow) or 20% (red).

    Run times are read from run-times.csv in the dir for each of self.stypes. With the module-level
    args.quick set, tests that aren't in self.quick_tests are skipped. Tests with no reference or no new
    time are reported rather than crashing.
    """
    print('checking run times')

    def read_run_times(stype):
        """ Return a dict from test name to run time in seconds, read from the run time file for <stype>. """
        with open(self.dirs(stype) + '/run-times.csv') as timefile:
            return {line['name'] : float(line['seconds']) for line in csv.DictReader(timefile)}

    times = {stype : read_run_times(stype) for stype in self.stypes}

    for name in self.tests:
        if args.quick and name not in self.quick_tests:
            continue
        if name not in times['ref']:  # e.g. a test that was added since the ref results were made
            print(' %30s no ref time for %s' % (name, utils.color('red', name)))
            continue
        ref_time = times['ref'][name]
        print(' %30s %7.1f' % (name, ref_time), end=' ')
        if name not in times['new']:
            print(' no new time for %s' % utils.color('red', name))
            continue
        new_time = times['new'][name]
        if ref_time == 0.:  # times are written with one decimal place, so anything faster than 0.05 sec is read back as zero
            fractional_change = 0. if new_time == 0. else float('inf')
        else:
            fractional_change = (new_time - ref_time) / float(ref_time)
        if abs(fractional_change) > 0.2:
            print('--> %-5.1f %s' % (new_time, utils.color('red', '(%+.3f)' % fractional_change)), end=' ')
        elif abs(fractional_change) > 0.1:
            print('--> %-5.1f %s' % (new_time, utils.color('yellow', '(%+.3f)' % fractional_change)), end=' ')
        else:
            print(' ok ', end=' ')
        print('')
818
+
819
+ # ----------------------------------------------------------------------------------------
820
def compare_partition_cachefiles(self, input_stype, debug=False):
    """ Compare the naive sequences and log probs in the ref and new partition cache files for <input_stype>, and print a summary.

    Returns without doing anything if the module-level args.quick is set and the partition test isn't in
    self.quick_tests. Not implemented (assert False) if args.paired is set.
    """
    # ----------------------------------------------------------------------------------------
    def print_key_differences(vtype, refkeys, newkeys):
        """ Print how many keys are only in one of the two sets, and how many are in both. """
        print(' %s keys' % vtype)
        only_ref, only_new = refkeys - newkeys, newkeys - refkeys
        if len(only_ref) == 0 and len(only_new) == 0:
            print(' %d identical keys in new and ref cache' % len(refkeys))
            return
        if len(only_ref) > 0:
            print(utils.color('red', ' %d only in ref version' % len(only_ref)))
        if len(only_new) > 0:
            print(utils.color('red', ' %d only in new version' % len(only_new)))
        print(' %d in common' % len(refkeys & newkeys))
    # ----------------------------------------------------------------------------------------
    def readcache(fname):
        """ Return the non-empty naive seqs and log probs in <fname>, each as a dict keyed by the 'unique_ids' column. """
        if debug:
            print(' reading partition cache from %s' % fname)
        naive_seqs, logprobs = {}, {}
        with open(fname) as cachefile:
            for line in csv.DictReader(cachefile):
                if line['naive_seq'] != '':
                    naive_seqs[line['unique_ids']] = line['naive_seq']
                if line['logprob'] != '':
                    logprobs[line['unique_ids']] = float(line['logprob'])
        return {'naive_seqs' : naive_seqs, 'logprobs' : logprobs}
    # ----------------------------------------------------------------------------------------
    def compare_files(fname):
        """ Read <fname> from both the ref and new dirs and print the differences between them. """
        print(' %s input partition cache file' % input_stype)
        refcache = readcache(self.dirs('ref') + '/' + fname)
        newcache = readcache(self.dirs('new') + '/' + fname)

        # work out intersection and complement
        common_keys = {}
        for vtype in ['naive_seqs', 'logprobs']:
            refkeys, newkeys = set(refcache[vtype].keys()), set(newcache[vtype].keys())
            print_key_differences(vtype, refkeys, newkeys)
            common_keys[vtype] = refkeys & newkeys

        # naive seqs: hamming fraction between ref and new seq for each key that's in both
        hamming_eps = 0.
        hammings = []  # only the ones that are bigger than <hamming_eps>
        n_hammings, n_big_hammings, n_different_length = 0, 0, 0
        for uids in common_keys['naive_seqs']:
            refseq, newseq = refcache['naive_seqs'][uids], newcache['naive_seqs'][uids]
            n_hammings += 1
            if len(refseq) != len(newseq):
                n_different_length += 1
                continue
            hamming_fraction = utils.hamming_fraction(refseq, newseq)
            if hamming_fraction > hamming_eps:
                n_big_hammings += 1
                hammings.append(hamming_fraction)

        diff_hfracs_str = '%3d / %4d' % (n_big_hammings, n_hammings)
        mean_hfrac_str = '%.3f' % (numpy.average(hammings) if len(hammings) > 0 else 0.)
        if n_big_hammings > 0:
            diff_hfracs_str, mean_hfrac_str = [utils.color('red', s) for s in (diff_hfracs_str, mean_hfrac_str)]

        # log probs: abs difference between ref and new value for each key that's in both
        logprob_eps = 1e-5
        abs_delta_logprobs = []  # only the ones that are bigger than <logprob_eps>
        n_delta_logprobs, n_big_delta_logprobs = 0, 0
        for uids in common_keys['logprobs']:
            n_delta_logprobs += 1
            abs_delta_logprob = abs(refcache['logprobs'][uids] - newcache['logprobs'][uids])
            if abs_delta_logprob > logprob_eps:
                n_big_delta_logprobs += 1
                abs_delta_logprobs.append(abs_delta_logprob)

        diff_logprob_str = '%3d / %4d' % (n_big_delta_logprobs, n_delta_logprobs)
        mean_logprob_str = '%.3f' % (numpy.average(abs_delta_logprobs) if len(abs_delta_logprobs) > 0 else 0.)
        if n_big_delta_logprobs > 0:
            diff_logprob_str, mean_logprob_str = [utils.color('red', s) for s in (diff_logprob_str, mean_logprob_str)]

        print(' fraction different mean abs difference among differents')
        print(' naive seqs %s %s (hamming fraction)' % (diff_hfracs_str, mean_hfrac_str))
        print(' log probs %s %s' % (diff_logprob_str, mean_logprob_str))
        if n_different_length > 0:
            print(utils.color('red', ' %d different length' % n_different_length))

    # ----------------------------------------------------------------------------------------
    ptest = 'partition-' + input_stype + '-simu'
    if args.quick and ptest not in self.quick_tests:
        return

    if args.paired:
        assert False  # eh, probably not really any point
        # for locus in XXX:
        #     compare_files(self.ptn_cachefn(input_stype))
    else:
        compare_files(self.ptn_cachefn(input_stype))
917
+
918
+ # ----------------------------------------------------------------------------------------
919
# parse command line args, then either re-run this script for each slow/paired combination, or run the tests directly
parser = argparse.ArgumentParser()
parser.add_argument('--dont-run', action='store_true', help='don\'t actually run anything, just check the results')
parser.add_argument('--dry-run', action='store_true', help='do all preparations to run, but don\'t actually run the commands, and don\'t check results')
parser.add_argument('--quick', action='store_true', help='only run one command: cache-parameters on a small numbrer of simulation events')
parser.add_argument('--slow', action='store_true', help='by default, we run tests on a fairly small number of sequences, which is sufficient for checking that *nothing* has changed. But --slow is for cases where you\'ve made changes that you know will affect results, and you want to look at the details of how they\'re affected, for which you need to run on more sequences. Note that whether --slow is set or not (runs all tests with more or less sequences) is separate from --quick (which only runs one test).')
parser.add_argument('--run-coverage', action='store_true', help='instead of running the normal suite of tests (which compare results to make sure they haven\'t changed), instead run a series of commands that\'s designed to execute as many of the lines as possible (without comparing results).')
parser.add_argument('--prepend-coverage', action='store_true', help='run normal tests, but prepending coverage append commands')
parser.add_argument('--coverage-outdir', default='%s/partis/tmp/coverage' % os.getenv('fs', default=os.getenv('HOME')))
parser.add_argument('--bust-cache', action='store_true', help='overwrite current ref info, i.e. run this when things have changed, but you\'ve decided they\'re fine')
parser.add_argument('--only-bust-current', action='store_true', help='only bust cache for current command line args (as opposed to the default of busting caches for both slow and non-slow, paired and non-paired)')
parser.add_argument('--paired', action='store_true', help='run paired tests, i.e. with --paired-loci. Note that this doesn\'t test all the things (e.g. seed partitioning) that non-paired does.')
parser.add_argument('--run-all', action='store_true', help='run all four combinations of tests: paired/non-paired and slow/non-slow (by default only runs one). *Not* for use with --bust-cache, which runs all of them by default.')
parser.add_argument('--no-simu', action='store_true', help='don\'t run simulation, e.g. if using a minimal install')
parser.add_argument('--ig-or-tr', default='ig')
parser.add_argument('--print-width', type=int, default=300, help='set to 0 for infinite')

parser.add_argument('--glfo-dir', default='data/germlines/human')
parser.add_argument('--locus', default='igh')
args = parser.parse_args()
# NOTE these use parser.error() rather than assert so they still get checked when running with -O (and so you get a usage message rather than a traceback)
if args.quick and args.slow:  # it just doesn't make sense
    parser.error('--quick and --slow can\'t both be set')
if args.quick and args.paired:  # --quick ignores --paired, which is confusing
    parser.error('--quick and --paired can\'t both be set')
if args.run_all and args.bust_cache:  # this used to be an assert inside the loop below
    parser.error('--run-all can\'t be used with --bust-cache (which runs all combinations by default)')

random.seed(0)
numpy.random.seed(0)

if args.print_width == 0:
    args.print_width = 99999

if args.run_all or (args.bust_cache and not args.only_bust_current):  # run all four combos
    for slowval in [False, True]:
        for pairedval in [False, True]:
            # start from our own command line, swapping in the --slow/--paired values for this combination
            clist = list(sys.argv)
            utils.remove_from_arglist(clist, '--slow')
            utils.remove_from_arglist(clist, '--paired')
            if args.bust_cache:
                clist += ['--only-bust-current']  # so the subprocess doesn't also try to run all four
            else:
                utils.remove_from_arglist(clist, '--run-all')
            cmd_str = ' '.join(clist)
            if slowval:
                cmd_str += ' --slow'
            if pairedval:
                cmd_str += ' --paired'
            utils.simplerun(cmd_str, dryrun=args.dry_run)
    sys.exit(0)

tester = Tester()
if args.run_coverage:
    tester.run_coverage(args)
    sys.exit(0)
tester.test(args)
if args.bust_cache:
    tester.bust_cache()
973
+
974
+ # ----------------------------------------------------------------------------------------
975
def get_typical_variances():
    """ Placeholder that always raises: the code (kept, commented out, below) needs updating before it'll work as a function. """
    # NOTE don't delete this, since it was used (and might be needed again) to get the expected variances hardcoded above
    errmsg = 'needs updating to work as a function'
    raise Exception(errmsg)
978
+ # cp = ClusterPath(fname='tmp.csv')
979
+ # cp.print_partitions()
980
+ # sys.exit()
981
+ # cps = []
982
+ # adj_mis, ccf_unders, ccf_overs = [], [], []
983
+ # for iseed in range(6):
984
+ # # print 'seed %d' % iseed
985
+ # cp = ClusterPath(fname='%d.csv' % iseed)
986
+ # cp.print_partitions() #(cp.i_best) #, abbreviate=False)
987
+ # adj_mis.append(cp.adj_mis[cp.i_best])
988
+ # ccf_unders.append(cp.ccfs[cp.i_best][0])
989
+ # ccf_overs.append(cp.ccfs[cp.i_best][1])
990
+ # cps.append(cp)
991
+ # def print_mean_variance(vals):
992
+ # mean = numpy.average(vals)
993
+ # variance = numpy.average((vals - mean)**2) #, weights=wgts)
994
+ # print 'mean %.2f std dev %.3f (%.1f%%)' % (mean, math.sqrt(variance), 100. * math.sqrt(variance) / mean)
995
+
996
+ # # mean/var for six random seeds
997
+ # print_mean_variance(adj_mis) # mean 0.61 std dev 0.053 (8.7%)
998
+ # print_mean_variance(ccf_unders) # mean 0.74 std dev 0.026 (3.5%)
999
+ # print_mean_variance(ccf_overs) # mean 0.90 std dev 0.015 (1.7%)
1000
+ # # for iseed in range(len(cps)):
1001
+ # # icp = cps[iseed]
1002
+ # # for jseed in range(iseed, len(cps)):
1003
+ # # jcp = cps[jseed]
1004
+ # # print ' %d %d %.3f' % (iseed, jseed, utils.adjusted_mutual_information(icp.partitions[icp.i_best], jcp.partitions[jcp.i_best]))
1005
+