partis-bcr 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bin/FastTree +0 -0
- bin/add-chimeras.py +59 -0
- bin/add-seqs-to-outputs.py +81 -0
- bin/bcr-phylo-run.py +799 -0
- bin/build.sh +24 -0
- bin/cf-alleles.py +97 -0
- bin/cf-germlines.py +57 -0
- bin/cf-linearham.py +199 -0
- bin/chimera-plot.py +76 -0
- bin/choose-partially-paired.py +143 -0
- bin/circle-plots.py +30 -0
- bin/compare-plotdirs.py +298 -0
- bin/diff-parameters.py +133 -0
- bin/docker-hub-push.sh +6 -0
- bin/extract-pairing-info.py +55 -0
- bin/gcdyn-simu-run.py +223 -0
- bin/gctree-run.py +244 -0
- bin/get-naive-probabilities.py +126 -0
- bin/iqtree-1.6.12 +0 -0
- bin/lonr.r +1020 -0
- bin/makeHtml +52 -0
- bin/mds-run.py +46 -0
- bin/parse-output.py +277 -0
- bin/partis +1869 -0
- bin/partis-pip +116 -0
- bin/partis.py +1869 -0
- bin/plot-gl-set-trees.py +519 -0
- bin/plot-hmms.py +151 -0
- bin/plot-lb-tree.py +427 -0
- bin/raxml-ng +0 -0
- bin/read-bcr-phylo-trees.py +38 -0
- bin/read-gctree-output.py +166 -0
- bin/run-chimeras.sh +64 -0
- bin/run-dtr-scan.sh +25 -0
- bin/run-paired-loci.sh +100 -0
- bin/run-tree-metrics.sh +88 -0
- bin/smetric-run.py +62 -0
- bin/split-loci.py +317 -0
- bin/swarm-2.1.13-linux-x86_64 +0 -0
- bin/test-germline-inference.py +425 -0
- bin/tree-perf-run.py +194 -0
- bin/vsearch-2.4.3-linux-x86_64 +0 -0
- bin/vsearch-2.4.3-macos-x86_64 +0 -0
- bin/xvfb-run +194 -0
- partis_bcr-1.0.2.data/scripts/cf-alleles.py +97 -0
- partis_bcr-1.0.2.data/scripts/cf-germlines.py +57 -0
- partis_bcr-1.0.2.data/scripts/extract-pairing-info.py +55 -0
- partis_bcr-1.0.2.data/scripts/gctree-run.py +244 -0
- partis_bcr-1.0.2.data/scripts/parse-output.py +277 -0
- partis_bcr-1.0.2.data/scripts/split-loci.py +317 -0
- partis_bcr-1.0.2.data/scripts/test.py +1005 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/METADATA +1 -1
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/RECORD +101 -51
- partis_bcr-1.0.2.dist-info/top_level.txt +1 -0
- {partis → python}/glutils.py +1 -1
- python/main.py +30 -0
- {partis → python}/plotting.py +10 -1
- {partis → python}/treeutils.py +18 -16
- {partis → python}/utils.py +14 -7
- packages/ham/bcrham +0 -0
- partis/main.py +0 -59
- partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/WHEEL +0 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/entry_points.txt +0 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/licenses/COPYING +0 -0
- {partis → python}/__init__.py +0 -0
- {partis → python}/alleleclusterer.py +0 -0
- {partis → python}/allelefinder.py +0 -0
- {partis → python}/alleleremover.py +0 -0
- {partis → python}/annotationclustering.py +0 -0
- {partis → python}/baseutils.py +0 -0
- {partis → python}/cache/__init__.py +0 -0
- {partis → python}/cache/cached_uncertainties.py +0 -0
- {partis → python}/clusterpath.py +0 -0
- {partis → python}/coar.py +0 -0
- {partis → python}/corrcounter.py +0 -0
- {partis → python}/datautils.py +0 -0
- {partis → python}/event.py +0 -0
- {partis → python}/fraction_uncertainty.py +0 -0
- {partis → python}/gex.py +0 -0
- {partis → python}/glomerator.py +0 -0
- {partis → python}/hist.py +0 -0
- {partis → python}/hmmwriter.py +0 -0
- {partis → python}/hutils.py +0 -0
- {partis → python}/indelutils.py +0 -0
- {partis → python}/lbplotting.py +0 -0
- {partis → python}/mds.py +0 -0
- {partis → python}/mutefreqer.py +0 -0
- {partis → python}/paircluster.py +0 -0
- {partis → python}/parametercounter.py +0 -0
- {partis → python}/paramutils.py +0 -0
- {partis → python}/partitiondriver.py +0 -0
- {partis → python}/partitionplotter.py +0 -0
- {partis → python}/performanceplotter.py +0 -0
- {partis → python}/plotconfig.py +0 -0
- {partis → python}/processargs.py +0 -0
- {partis → python}/prutils.py +0 -0
- {partis → python}/recombinator.py +0 -0
- {partis → python}/scanplot.py +0 -0
- {partis → python}/seqfileopener.py +0 -0
- {partis → python}/treegenerator.py +0 -0
- {partis → python}/viterbicluster.py +0 -0
- {partis → python}/vrc01.py +0 -0
- {partis → python}/waterer.py +0 -0
@@ -0,0 +1,1005 @@
|
|
1
|
+
#!python
|
2
|
+
from __future__ import absolute_import, division, unicode_literals
|
3
|
+
from __future__ import print_function
|
4
|
+
import argparse
|
5
|
+
import random
|
6
|
+
import numpy
|
7
|
+
import os
|
8
|
+
import csv
|
9
|
+
import glob
|
10
|
+
import math
|
11
|
+
import shutil
|
12
|
+
import time
|
13
|
+
from collections import OrderedDict
|
14
|
+
from subprocess import Popen, PIPE, check_call, check_output, CalledProcessError
|
15
|
+
import copy
|
16
|
+
import colored_traceback.always
|
17
|
+
import sys
|
18
|
+
from io import open
|
19
|
+
from pathlib import Path
|
20
|
+
partis_dir = str(Path(__file__).parent.parent)
|
21
|
+
sys.path.insert(1, partis_dir) # + '/python')
|
22
|
+
import yaml
|
23
|
+
|
24
|
+
from python.baseutils import get_extra_str
|
25
|
+
import python.utils as utils
|
26
|
+
import python.glutils as glutils
|
27
|
+
from python.hist import Hist
|
28
|
+
from python.clusterpath import ClusterPath
|
29
|
+
import python.paircluster as paircluster
|
30
|
+
|
31
|
+
# ----------------------------------------------------------------------------------------
|
32
|
+
class Tester(object):
|
33
|
+
# ----------------------------------------------------------------------------------------
|
34
|
+
def dirs(self, tstr, force_paired=False):
    """Return the results directory for the 'ref' or 'new' results.

    The path gets a '/paired' component if --paired is set (or force_paired is True),
    and a '-slow' suffix if --slow is set.
    """
    assert tstr in ['ref', 'new']
    paired_str = '/paired' if (args.paired or force_paired) else ''
    slow_str = '-slow' if args.slow else ''
    return 'test%s/%s-results%s' % (paired_str, tstr, slow_str)
|
37
|
+
# ----------------------------------------------------------------------------------------
|
38
|
+
def nqr(self, act):
    """Return the number of queries to use for action `act` ('quick', 'simu', or 'data')."""
    if act == 'quick':  # bears no relation to the others, so makes sense to handle it differently
        return 10
    normal_counts = {'simu' : 50, 'data' : 100 if args.paired else 50}
    slow_counts = {'simu' : 1000, 'data' : -1}
    counts = slow_counts if args.slow else normal_counts
    return counts[act]
|
44
|
+
# ----------------------------------------------------------------------------------------
|
45
|
+
def get_stypes(self, ptest):
    """Work out the (input_stype, input_dtype) pair encoded in the test name `ptest`.

    'simulate' gives ('new', None), any 'cache-parameters' test gives ('new', last name component),
    and everything else takes its last two dash-separated components.
    """
    name_parts = ptest.split('-')
    if ptest == 'simulate':
        stype_and_dtype = ('new', None)
    elif 'cache-parameters' in ptest:
        stype_and_dtype = ('new', name_parts[-1])
    else:
        stype_and_dtype = name_parts[-2:]
    input_stype, input_dtype = stype_and_dtype

    assert input_stype in self.stypes + [None]
    assert input_dtype in self.dtypes + [None]
    return input_stype, input_dtype
|
59
|
+
# ----------------------------------------------------------------------------------------
|
60
|
+
def inpath(self, st, dt):
    """Return the input path for data type `dt`.

    For 'data' this is the fixed (paired or single chain) data file. Otherwise it is the simulation
    path, prefixed with the results dir for `st` unless `st` is None.
    """
    if dt == 'data':
        if args.paired:
            return self.paired_datafname
        return self.datafname
    simu_suffix = '' if args.paired else '.yaml'
    spath = self.label + '/simu' + simu_suffix
    if st is not None:
        return self.dirs(st) + '/' + spath
    return spath
|
68
|
+
# ----------------------------------------------------------------------------------------
|
69
|
+
def paramdir(self, st, dt):
    """Return the parameter directory for data type `dt`.

    If `st` is None only the subpath (without parent dir) is returned, e.g. when --dont-run/evaluating results.
    """
    subpath = self.label + '/parameters/' + dt
    if st is not None:
        return self.dirs(st) + '/' + subpath
    return subpath
|
74
|
+
# ----------------------------------------------------------------------------------------
|
75
|
+
def astr(self, inout, dt=None):
    """Return the command line argument name (without leading dashes) for the input or output location.

    Single chain runs, and the data input for paired runs, use a file ('infname'/'outfname');
    everything else for paired runs uses a directory ('paired-indir'/'paired-outdir').
    """
    uses_single_file = (not args.paired) or (inout == 'in' and dt == 'data')
    if uses_single_file:
        return '%sfname' % inout
    return 'paired-%sdir' % inout
|
79
|
+
# ----------------------------------------------------------------------------------------
|
80
|
+
# i'm adding this late (especially for the non-production tests) so there's probably some more places it could be used
|
81
|
+
def opath(self, ptest, st=None, force_paired=False):  # don't set `st` if you just want the basename-type stuff (no base/parent dirs)
    """Return the output path for test `ptest`.

    ptest: test name, e.g. 'partition-new-simu', 'cache-parameters-data', or 'simulate'.
    st: 'ref' or 'new' to prepend the corresponding results dir, or None for just the subpath.
        Ignored for 'cache-parameters' tests and for 'simulate', which always return the subpath.
    force_paired: treat the test as paired even if --paired is not set.
    """
    if 'cache-parameters' in ptest:
        # Use the last name component as the data type, same as get_stypes(). Taking index 2 gave
        # 'quick' for 'cache-parameters-quick-new-simu', which is not where its parameters are written.
        return self.paramdir(None, ptest.split('-')[-1])
    elif ptest == 'simulate':
        return self.inpath(None, 'simu')
    else:
        is_paired = args.paired or force_paired
        op = '%s%s' % (ptest, '' if is_paired else '.yaml')  # paired output is a dir, single chain is a yaml file
        if is_paired and 'get-selection-metrics' in ptest:
            op += '-chosen-abs.csv'
        if st is None:
            return op
        return '%s/%s' % (self.dirs(st, force_paired=force_paired), op)
|
93
|
+
# ----------------------------------------------------------------------------------------
|
94
|
+
def ptn_cachefn(self, st, for_cmd=False, lpair=None, locus=None):
    """Return the partition cache file name for `st` (which has to be 'new').

    With for_cmd set, a paired run just gets the string 'paired-outdir', while a single chain run
    gets the file name prefixed with the 'new' results dir. Paired runs (without for_cmd) need
    both `lpair` and `locus`.
    """
    assert st == 'new'  # i think?
    if for_cmd and args.paired:
        return 'paired-outdir'
    prefix = self.dirs('new') if for_cmd else ''
    if args.paired:
        assert lpair is not None and locus is not None
        return prefix + '%s/persistent-cache-%s.csv' % ('+'.join(lpair), locus)  # duplicates code in bin/partis getofn()
    separator = '' if prefix == '' else '/'
    return '%s%scache-%s-partition.csv' % (prefix, separator, st)
|
108
|
+
# ----------------------------------------------------------------------------------------
|
109
|
+
def all_ptn_cachefns(self):
    """Return all partition cache file names.

    At the moment that is just the ones for 'new' (there used to also be a 'ref' one, which may come back).
    For paired runs there is one per locus in each locus pair.
    """
    stypes = ['new']
    if not args.paired:
        return [self.ptn_cachefn(stype) for stype in stypes]
    cache_fnames = []
    for stype in stypes:
        for lpair in utils.locus_pairs[args.ig_or_tr]:
            for locus in lpair:
                cache_fnames.append(self.ptn_cachefn(stype, lpair=lpair, locus=locus))
    return cache_fnames
|
114
|
+
# ----------------------------------------------------------------------------------------
|
115
|
+
def is_prod_test(self, ptest):
    """Return True if `ptest` is a production test, i.e. 'simulate' or one of the 'cache-parameters' tests."""
    if ptest == 'simulate':
        return True
    return 'cache-parameters' in ptest
|
117
|
+
# ----------------------------------------------------------------------------------------
|
118
|
+
def sclust_sizes(self):  # NOTE depends on self.n_simu_leaves
    """Return the (first, second) cluster size arguments used when choosing a seed sequence; larger for --slow."""
    if args.slow:
        return (15, 20)
    return (5, 10)
|
120
|
+
# ----------------------------------------------------------------------------------------
|
121
|
+
def min_smetric_cluster_size(self):
    """Return the minimum cluster size passed to the selection metric options; larger for --slow."""
    if args.slow:
        return 10
    return 3
|
123
|
+
# ----------------------------------------------------------------------------------------
|
124
|
+
def cluster_size_args(self):
    """Return the command line arguments that set both minimum cluster sizes to min_smetric_cluster_size()."""
    arg_names = ['--min-selection-metric-cluster-size', '--min-paired-cluster-size-to-read']
    arg_list = []
    for arg_name in arg_names:
        arg_list += [arg_name, str(self.min_smetric_cluster_size())]
    return arg_list
|
127
|
+
# ----------------------------------------------------------------------------------------
|
128
|
+
def __init__(self):
    """Set up paths, tolerances, and the ordered dict of tests (self.tests) that run() loops over.

    Reads the module-level `args` namespace throughout, and creates the 'new' results dir if it doesn't exist.
    NOTE(review): the indentation of this block was rebuilt from a flattened diff view, so nesting
    that is marked below as inferred should be checked against the real file.
    """
    self.partis_path = 'partis' if shutil.which('partis') else '%s/bin/partis' % utils.get_partis_dir() # use version in PATH if it's there (pipx seems to leave two incompatible versions lying around)
    if args.prepend_coverage:
        self.partis_path = 'coverage3 run --append %s' % self.partis_path
    self.datafname = 'test/mishmash.fa' # some data from adaptive, chaim, and vollmers
    # generate paired data dir with: UPDATE i cat'd the ig?.fa files into all-seqs.fa (in the same dir) so extract-pair-info and split-loci get run
    # - ./bin/split-loci.py /fh/fast/matsen_e/data/10x-examples/data/sc5p_v2_hs_B_prevax_10k_5gex_B_vdj_b_filtered_contig.fasta --outdir test/paired-data --input-metafname /fh/fast/matsen_e/data/10x-examples/processed-data/v0/sc5p_v2_hs_B_prevax_10k_5gex_B_vdj_b_filtered_contig/meta.yaml --n-max-queries 100 >test/paired-data/split-loci.log
    # - ./bin/extract-pairing-info.py /fh/fast/matsen_e/data/10x-examples/data/sc5p_v2_hs_B_prevax_10k_5gex_B_vdj_b_filtered_contig.fasta test/paired-data/meta.yaml --n-max-queries 100 >test/paired-data/extract.log
    self.paired_datafname = 'test/paired-data/all-seqs.fa'
    self.input_metafname = 'test/input-meta.yaml'

    self.stypes = ['ref', 'new'] # I don't know what the 's' stands for
    self.dtypes = ['data', 'simu']
    if not os.path.exists(self.dirs('new')):
        os.makedirs(self.dirs('new'))
    self.common_extras = ['--random-seed', '1', '--n-procs', '10'] # would be nice to set --n-procs based on the machine, but for some reason the order of things in the parameter files gets shuffled a bit if you change the number of procs
    self.label = 'test' # i really don't think there's any reason to have this, but i don't feel like removing it atm since it's not really causing much complication

    self.tiny_eps = 1e-4
    self.run_times = {}
    self.eps_vals = {} # fractional difference which we allow for each test type (original note says these were generated with get_typical_variances(), which isn't visible in this part of the file)
    self.eps_vals['v_call'] = 0.02 # hm, actually, I think I just made the annotation ones up
    self.eps_vals['d_call'] = 0.02
    self.eps_vals['j_call'] = 0.02
    self.eps_vals['mean_hamming'] = 0.1
    self.eps_vals['v_hamming'] = 0.1
    self.eps_vals['d_hamming'] = 0.1
    self.eps_vals['j_hamming'] = 0.1
    self.eps_vals['cdr3_hamming'] = 0.1
    self.eps_vals['purity'] = 0.08
    self.eps_vals['completeness'] = 0.08

    self.n_simu_leaves = 5

    self.selection_metrics = ['lbi', 'lbr', 'cons-dist-aa', 'aa-lbi', 'aa-lbr'] # NOTE kind of duplicates treeutils.selection_metrics, but I want to be able to change the latter
    self.pair_clean_metrics = ['correct', 'unpaired', 'mispaired'] if args.paired else []
    self.expected_trees = ['tree', 'aa-tree']

    self.logfname = self.dirs('new') + '/test.log'

    self.tests = OrderedDict() # insertion order is the order in which the tests are looped over below (and in run())

    # ----------------------------------------------------------------------------------------
    def add_inference_tests(input_stype): # if input_stype is 'ref', infer on old simulation and parameters, if it's 'new' use the new ones
        # NOTE(review): which statements sit inside each args.paired check is inferred -- confirm against the real file
        if not args.paired:
            self.tests['annotate-%s-simu'%input_stype] = {'extras' : ['--plot-annotation-performance', ]}
            self.tests['multi-annotate-%s-simu'%input_stype] = {'extras' : ['--plot-annotation-performance', '--simultaneous-true-clonal-seqs']} # NOTE this is mostly different to the multi-seq annotations from the partition step because it uses the whole sample
        self.tests['partition-%s-simu'%input_stype] = {'extras' : ['--plot-annotation-performance', '--max-ccf-fail-frac', '0.10']} # '--biggest-logprob-cluster-to-calculate', '5', '--biggest-naive-seq-cluster-to-calculate', '5',
        if args.paired:
            self.tests['subset-partition-%s-simu'%input_stype] = {'extras' : ['--max-ccf-fail-frac', '0.15']} # '--biggest-logprob-cluster-to-calculate', '5', '--biggest-naive-seq-cluster-to-calculate', '5',
            # this runs ok, but I'd need to modify some things so its output is actually checked: self.tests['subset-annotate-%s-simu'%input_stype] = {'extras' : ['--max-ccf-fail-frac', '0.15']} # '--biggest-logprob-cluster-to-calculate', '5', '--biggest-naive-seq-cluster-to-calculate', '5',
        self.tests['seed-partition-%s-simu'%input_stype] = {'extras' : ['--max-ccf-fail-frac', '0.10']}
        if not args.paired:
            self.tests['vsearch-partition-%s-simu'%input_stype] = {'extras' : ['--naive-vsearch']}
        self.tests['get-selection-metrics-%s-simu'%input_stype] = {'extras' : ['--existing-output-run-cfg', 'paired'] + self.cluster_size_args()} # NOTE this runs on simulation, but it's checking the inferred selection metrics

    # ----------------------------------------------------------------------------------------
    if args.quick:
        self.tests['cache-parameters-quick-new-simu'] = {'extras' : ['--n-max-queries', str(self.nqr('quick'))]}
    else:
        pcache_data_args = {'extras' : ['--n-max-queries', str(self.nqr('data'))]}
        pcache_simu_args = {'extras' : []}
        n_events = int(self.nqr('simu') / float(self.n_simu_leaves))
        simulate_args = {'extras' : ['--n-sim-events', str(n_events), '--n-trees', str(n_events), '--n-leaf-distribution', 'geometric', '--n-leaves', str(self.n_simu_leaves)]}
        if args.paired:
            simulate_args['extras'] += ['--min-observations-per-gene', '5', '--mean-cells-per-droplet', '1.25', '--constant-cells-per-droplet', '--fraction-of-reads-to-remove', '0.15'] # it was crashing and this fixes it, i dunno if we should turn it on also for non-paired but whatever
        if args.bust_cache: # if we're cache busting, we need to run these *first*, so that the inference tests run on a simulation file in the new dir that was just made (i.e. *not* whatever simulation file in the new dir happens to be there)
            self.tests['cache-parameters-data'] = pcache_data_args
            self.tests['simulate'] = simulate_args
        self.tests['cache-parameters-simu'] = pcache_simu_args
        add_inference_tests('new')
        if not args.bust_cache: # normally (if we're not cache busting) we want to run these last, to make it super clear that the inference tests are running on the *reference* simulation file
            self.tests['cache-parameters-data'] = pcache_data_args
            if not args.no_simu:
                self.tests['simulate'] = simulate_args

    self.quick_tests = ['cache-parameters-quick-new-simu'] # this is kind of dumb to keep track of what the quick tests are in two different ways, but I only just started not adding the non-quick tests if --quick is set, and I don't want to mess with all the other places that use self.quick_tests

    self.perfdirs = {} # set in fiddle_with_arguments() NOTE these correspond only to annotation performance, whereas self.perf_info has also partition performance
    for ptest, argfo in self.tests.items():
        self.fiddle_with_arguments(ptest, argfo) # modifies argfo in place

    self.perf_info = {version_stype : {} for version_stype in self.stypes}
|
211
|
+
|
212
|
+
# ----------------------------------------------------------------------------------------
|
213
|
+
def test(self, args):
    """Run the tests (unless --dont-run), then compare the new results to the reference results.

    The comparison is skipped entirely for --dry-run, --bust-cache, and --quick.
    """
    if not args.dont_run:
        self.run(args)
    skip_comparison = args.dry_run or args.bust_cache or args.quick
    if not skip_comparison:
        self.compare_production_results(['cache-parameters-simu'])
        self.compare_stuff(input_stype='new')
        self.compare_production_results(['cache-parameters-data'])
        if not args.no_simu:
            self.compare_production_results(['simulate'])
        self.compare_run_times()
|
224
|
+
|
225
|
+
# ----------------------------------------------------------------------------------------
|
226
|
+
def run_coverage(self, args):
    """Run the tests plus a series of other scripts under coverage3, writing output to a fresh subdir of args.coverage_outdir.

    Removes any existing .coverage file in the current dir first, so results from previous runs aren't appended to.
    NOTE(review): the indentation of this block was rebuilt from a flattened diff view, so nesting
    that is marked below as inferred should be checked against the real file.
    """
    # ----------------------------------------------------------------------------------------
    def run_cmd(cmd, shell=False, logfname=None, dont_prepend=False):
        # runs cmd with the coverage command prepended, unless dont_prepend is set
        if logfname is not None:
            print(' writing log to %s' % logfname)
        cov_str = 'coverage3 run --append' # --data-file=%s/coverage/%d.cov (this doesn't seem to be supported in my version)
        utils.simplerun('%s%s' % ('' if dont_prepend else cov_str+' ', cmd), shell=shell, logfname=logfname)
    # ----------------------------------------------------------------------------------------
    # find the first version subdir that doesn't exist yet
    ivsn = 0
    while True:
        odir = '%s/vsn-%d' % (args.coverage_outdir, ivsn)
        if os.path.exists(odir):
            print(' coverage outdir %s exists, you may want to rm -r it by hand' % odir)
        else:
            break
        ivsn += 1
    cfn = '%s/.coverage' % os.getcwd()
    if os.path.exists(cfn):
        print(' removing existing coverage file %s' % cfn)
        os.remove(cfn)

    run_cmd('./test/test.py --prepend-coverage', dont_prepend=True) # NOTE tests may fail because of the coverage stuff, which is fine (at the least they'll be way too slow)
    run_cmd('./test/test.py --prepend-coverage --paired', dont_prepend=True) # also note that we have to put dont_prepend since recursive subprocs having coverage commands breaks things (at least before coverage 6.3)

    # cp output files so that working files (e.g. tree inference output files) don't get scattered around the normal test output dir
    if not os.path.exists(odir):
        os.makedirs(odir)
    ptnfn = '%s/%s' % (odir, utils.insert_before_suffix('-single', os.path.basename(self.opath('partition-new-simu', st='ref'))))
    utils.simplerun('cp %s %s' % (self.opath('partition-new-simu', st='ref'), ptnfn))
    pair_ptndir = '%s/%s-paired' % (odir, os.path.basename(self.opath('partition-new-simu', st='ref', force_paired=True)))
    utils.simplerun('cp -r %s %s' % (self.opath('partition-new-simu', st='ref', force_paired=True), pair_ptndir))

    for ft in ['csv', 'fa', 'yaml']:
        run_cmd('./bin/parse-output.py %s %s/parse-output.%s' % (ptnfn, odir, ft))
    # NOTE(review): assumed that the paired parse-output command runs once, after the loop -- confirm against the real file
    run_cmd('./bin/parse-output.py %s %s/parse-output-paired --paired' % (pair_ptndir, odir))
    run_cmd('./bin/cf-alleles.py --bases all', logfname='%s/cf-alleles.log'%odir)
    run_cmd('./bin/cf-alleles.py --bases 8-51-1', logfname='%s/cf-alleles-8-51.log'%odir)

    run_cmd('./bin/partis view-output --outfname %s' % ptnfn, logfname='%s/view-output.log'%odir)
    run_cmd('./bin/partis view-output --paired-loci --paired-outdir %s' % pair_ptndir, logfname='%s/view-output-paired.log'%odir)

    run_cmd('./bin/cf-germlines.py %s/hmm/germline-sets %s/hmm/germline-sets' % (self.paramdir('ref', 'simu'), self.paramdir('ref', 'data')), logfname='%s/cf-germlines.log'%odir)
    run_cmd('./bin/compare-plotdirs.py --outdir %s/compare-plotdirs --plotdirs %s/hmm/mutation:%s/sw/mutation --names hmm:sw' % (odir, self.opath('annotate-new-simu-annotation-performance', st='ref').replace('.yaml', ''), self.opath('annotate-new-simu-annotation-performance', st='ref').replace('.yaml', '')))

    ptn_plot_cmd = './bin/partis plot-partitions --partition-plot-cfg mds:trees --tree-inference-method iqtree --cluster-indices 0:2 %s' % ' '.join(self.cluster_size_args())
    run_cmd('%s --outfname %s --plotdir %s/plot-partitions' % (ptn_plot_cmd, ptnfn, odir))
    run_cmd('%s --paired-loci --paired-outdir %s --plotdir %s/plot-partitions-paired' % (ptn_plot_cmd, pair_ptndir, odir))

    run_cmd('./bin/plot-hmms.py --outdir %s/plot-hmms --infiles %s' % (odir, ':'.join(glob.glob('%s/hmm/hmms/IGHD1*.yaml'%self.paramdir('ref', 'data')))))

    gct_sm_cmd = './bin/partis get-selection-metrics --tree-inference-method gctree %s --cluster-indices 0:2' % ' '.join(self.cluster_size_args())
    run_cmd('%s --outfname %s --plotdir %s/gctree-smetric-plots' % (gct_sm_cmd, ptnfn, odir))
    run_cmd('./bin/read-gctree-output.py --locus igh --species human --gctreedir %s/gctree/iclust-0 --outdir %s/read-gctree-output' % (utils.getprefix(ptnfn), odir)) # NOTE this uses output from the previous line
    run_cmd('%s --paired-loci --paired-outdir %s --plotdir %s/paired-gctree-smetric-plots' % (gct_sm_cmd, pair_ptndir, odir))

    run_cmd('./bin/bcr-phylo-run.py --base-outdir %s/bcr-phylo-run' % odir) # don't use multiple gc rounds here, since we need the tree in the next line (and the tree isn't written for multiple gc rounds)
    run_cmd('./bin/smetric-run.py --infname %s/bcr-phylo-run/selection/simu/mutated-simu.yaml --base-plotdir %s/smetric-run --metric-method lbi' % (odir, odir)) # NOTE uses results from previous line
    run_cmd('./bin/bcr-phylo-run.py --base-outdir %s/bcr-phylo-run-paired --paired --n-gc-rounds 3 --obs-times 30:5,10:5' % odir)
    run_cmd('./test/cf-paired-loci.py --label coverage --version v0 --n-replicates 2 --n-sub-procs 10 --scratch-mute-freq-list 0.01:0.1 --simu-extra-args=\"--flat-mute-freq --same-mute-freq-for-all-seqs --mutate-stop-codons\" --final-plot-xvar scratch-mute-freq --n-leaves-list 3 --n-sim-events-list 100 --single-light-locus igk --base-outdir %s/cf-paired-loci --perf-metrics precision:sensitivity --actions simu:cache-parameters:partition:plot:combine-plots' % odir, shell=True)

    # it'd be nice to add germline inference to the normal (non-coverage) tests, but doing more than a trivial test like this requires lots of sequences, which is slower than I really want to add to testing
    run_cmd('./bin/test-germline-inference.py --prepend-coverage-command --n-sim-events 1000 --outdir %s/test-germline-inference --sim-v-genes=IGHV1-18*01 --inf-v-genes=IGHV1-18*01 --snp-positions 27,55,88 --mutation-multiplier 0.00001 --seed 1' % odir, dont_prepend=True)

    print('now run: coverage3 report --omit=python/__init__.py') # could automate this, but whatever
|
290
|
+
|
291
|
+
# ----------------------------------------------------------------------------------------
|
292
|
+
def fiddle_with_arguments(self, ptest, argfo):
    """Fill in the rest of the info for test `ptest` in the dict `argfo`, modifying it in place.

    Sets the 'input_stype', 'bin', and 'action' keys, 'inpath'/'parameter-dir'/'sw-cachefname' where they apply,
    and appends to the 'extras' list. Also adds an entry to self.perfdirs for tests that plot annotation performance.
    NOTE(review): the indentation of this block was rebuilt from a flattened diff view, so nesting
    that is marked below as inferred should be checked against the real file.
    """
    input_stype, input_dtype = self.get_stypes(ptest)
    argfo['input_stype'] = input_stype
    argfo['bin'] = self.partis_path

    if ptest == 'simulate':
        argfo['parameter-dir'] = self.paramdir(input_stype, 'data') # simulation uses the parameters that were inferred on data
    else:
        argfo['inpath'] = self.inpath('new' if args.bust_cache else 'ref', input_dtype)
        if ptest.find('subset-') != 0: # i.e. test name doesn't start with 'subset-'
            argfo['parameter-dir'] = self.paramdir(input_stype, input_dtype)
        # NOTE(review): assumed that this is a sibling of the subset check rather than nested inside it -- confirm against the real file
        if not args.paired:
            argfo['sw-cachefname'] = self.paramdir(input_stype, input_dtype) + '/sw-cache.yaml'
    if ptest != 'simulate' and input_dtype == 'simu' and not args.quick:
        argfo['extras'] += ['--is-simu']

    # work out the partis action from the test name
    if 'annotate' in ptest:
        argfo['action'] = 'annotate'
    elif 'partition' in ptest:
        argfo['action'] = 'partition'
        if not args.paired: # eh, i don't think there's really a reason to do this for paired (although I partially implemented -- i got the files in single-chain/, but then getting the igh+igk/ etc. was going to be more work)
            argfo['extras'] += ['--persistent-cachefname', self.ptn_cachefn(input_stype, for_cmd=True)]
    elif 'get-selection-metrics' in ptest:
        argfo['action'] = 'get-selection-metrics' # could really remove almost all of the arguments, mostly just need --outfname
    elif 'cache-parameters-' in ptest:
        argfo['action'] = 'cache-parameters'
    else:
        argfo['action'] = ptest
    if ptest.find('subset-') == 0: # test name starts with 'subset-', so the action set above gets the same prefix
        argfo['action'] = 'subset-%s' % argfo['action']
        argfo['extras'] += ['--n-subsets', '2']

    if '--plot-annotation-performance' in argfo['extras']:
        self.perfdirs[ptest] = ptest + '-annotation-performance'
        argfo['extras'] += ['--plotdir', self.dirs('new') + '/' + self.perfdirs[ptest]]

    if '--plotdir' in argfo['extras']:
        argfo['extras'] += ['--only-csv-plots']
        # NOTE(review): assumed that this is nested inside the --plotdir check -- confirm against the real file
        if 'partition' in ptest:
            argfo['extras'] += ['--no-partition-plots']

    if '-data' in ptest and not args.paired: # would be cleaner to check that inpath is self.datafname
        argfo['extras'] += ['--input-metafnames', self.input_metafname]
|
335
|
+
|
336
|
+
# ----------------------------------------------------------------------------------------
|
337
|
+
def compare_stuff(self, input_stype):
    """Read performance info for each code version, then compare it between versions.

    The partition cache files are also compared, but only for single chain (non-paired) runs.
    """
    print('%s input' % input_stype)
    for version_stype in self.stypes:  # the code version, i.e. 'ref' is the reference results, 'new' is the results we just made with the new code
        readers = (
            self.read_annotation_performance,
            self.read_partition_performance,  # NOTE also calls read_annotation_performance()
            self.read_selection_metric_performance,
        )
        for read_fcn in readers:
            read_fcn(version_stype, input_stype)
    self.compare_performance(input_stype)
    if args.paired:
        return
    self.compare_partition_cachefiles(input_stype)
|
346
|
+
|
347
|
+
# ----------------------------------------------------------------------------------------
|
348
|
+
def prepare_to_run(self, args, name, info):
    """ Pre-run stuff that you don't want to do until *right* before you actually run.

    Removes old partition cache files, tree inference working files, and seed/subset output dirs for
    the tests they apply to (with --dry-run it only prints or counts what would be removed), and
    chooses the seed uid for the seed partition test, appending it to info['extras'].
    NOTE(review): the indentation of this block was rebuilt from a flattened diff view, so nesting
    that is marked below as inferred should be checked against the real file.
    """
    # ----------------------------------------------------------------------------------------
    def clean_dir(sdir): # rm whole seed dir to make sure the dir for the previous seed id doesn't hang around
        if args.dry_run:
            print(' would rm %s' % sdir)
        else: # maybe i should just rm the output dir for every test before running? although it might get complicated since some of them i think share dirs
            print(' removing %s' % sdir)
            shutil.rmtree(sdir)
    # ----------------------------------------------------------------------------------------
    def rm_file(fn): # for search: remove
        if args.dry_run:
            files_to_rm.append(fn) # just keep track of it
        else:
            check_call(['rm', '-v', fn])
    # ----------------------------------------------------------------------------------------
    files_to_rm = [] # just for dbg
    # delete any old partition cache files
    if name == 'partition-' + info['input_stype'] + '-simu':
        cachefnames = ['%s/%s' % (self.dirs('new'), f) for f in self.all_ptn_cachefns()]
        for cfn in [f for f in cachefnames if os.path.exists(f)]:
            rm_file(cfn)
    # and any old tree inference files
    if name == 'get-selection-metrics-' + info['input_stype'] + '-simu':
        tfns = []
        for subd in ['', '/*+*/partition-*']:
            for tmeth in ['fasttree', 'iqtree']:
                for ftp in ['fasttree.out', 'log*', 'input-seqs.fa']:
                    tfns += glob.glob('%s%s/%s/iclust-*/%s' % (self.opath('partition-new-simu', st='new'), subd, tmeth, ftp))
        for ffn in [f for f in tfns if os.path.exists(f)]:
            rm_file(ffn)
        # NOTE(review): assumed that this is nested inside the selection metrics check, since the message is about tree inference files -- confirm against the real file
        if len(files_to_rm) > 0:
            print(' would rm %d tree inference working files' % len(files_to_rm))

    # choose a seed uid
    if name == 'seed-partition-' + info['input_stype'] + '-simu':
        ifn = info['inpath']
        seed_uid, _ = utils.choose_seed_unique_id(ifn, self.sclust_sizes()[0], self.sclust_sizes()[1], paired=args.paired) # , n_max_queries=self.nqr('partition')
        if args.paired:
            seed_uid, seed_loci = seed_uid # for paired, the first return value is unpacked into two lists (they get joined with ':' below)
            info['extras'] += ['--seed-unique-id', ':'.join(seed_uid), '--seed-loci', ':'.join(seed_loci)]
            sdir = '%s/seeds' % self.opath(name, st=info['input_stype'])
            if os.path.exists(sdir):
                clean_dir(sdir)
        else:
            info['extras'] += ['--seed-unique-id', seed_uid]

    if name.find('subset-') == 0: # test name starts with 'subset-'
        if os.path.exists(self.opath(name, st='new')):
            clean_dir(self.opath(name, st='new'))
|
398
|
+
|
399
|
+
# ----------------------------------------------------------------------------------------
|
400
|
+
def run(self, args):
    """Build and run the command line for each configured test.

    Loops over self.tests (restricted to self.quick_tests if args.quick is set), calls
    self.prepare_to_run() for each, assembles the command string from the test's info dict,
    prints it, and (unless args.dry_run is set) runs it through the shell with stdout and stderr
    appended to self.logfname. The wall clock time of each command is stored in self.run_times,
    and written to disk at the end unless --quick or --dry-run were set.

    If a command exits with nonzero status, the tail of the log file is printed and the process
    exits with status 1.
    """
    if not args.dry_run:
        with open(self.logfname, 'w'):  # truncate the log file (each command below appends to it)
            pass

    os.environ['PYTHONHASHSEED'] = '0'  # turn off hash seed randomization for repeatable results
    for name, info in self.tests.items():
        if args.quick and name not in self.quick_tests:
            continue

        self.prepare_to_run(args, name, info)

        _, idt = self.get_stypes(name)  # only the second returned value is needed here
        action = info['action']
        cmd_str = info['bin'] + ' ' + action + ' --dont-write-git-info'
        if args.prepend_coverage:
            cmd_str += ' --prepend-coverage-command'
        if args.paired:
            cmd_str += ' --paired-loci'
        for tkey in ['inpath', 'parameter-dir', 'sw-cachefname']:
            if tkey in info:
                cmd_str += ' --%s %s' % (tkey if tkey != 'inpath' else self.astr('in', dt=idt), info[tkey])
        cmd_str += ' %s' % ' '.join(info['extras'] + self.common_extras)

        if name == 'simulate':
            cmd_str += ' --%s %s' % (self.astr('out'), self.inpath('new', 'simu'))
            cmd_str += ' --indel-frequency 0.2'  # super high indel freq, partly because we want to make sure we have some even with the new smaller default number of seqs
        elif 'get-selection-metrics-' in name:
            # selection metrics read the corresponding partition output, so point the output arg at that
            cmd_str += ' --%s %s' % (self.astr('out'), self.opath(name.replace('get-selection-metrics-', 'partition-'), st='new'))
            cmd_str += ' --%s %s' % ('chosen-ab-fname' if args.paired else 'selection-metric-fname', self.opath(name, st='new'))
            if args.slow:
                cmd_str += ' --ab-choice-cfg %s/test/ab-choice-slow.yaml' % utils.get_partis_dir()
            # strip the input-related args that were added above
            clist = cmd_str.split()
            utils.remove_from_arglist(clist, '--%s'%self.astr('in', dt=idt), has_arg=True)
            utils.remove_from_arglist(clist, '--parameter-dir', has_arg=True)
            utils.remove_from_arglist(clist, '--sw-cachefname', has_arg=True)
            utils.remove_from_arglist(clist, '--is-simu')
            cmd_str = ' '.join(clist)
        elif 'cache-parameters-' not in name:
            cmd_str += ' --%s %s' % (self.astr('out'), self.opath(name, st='new'))

        logstr = '%s %s' % (utils.color('green', name, width=30, padside='right'), cmd_str)
        print(logstr if utils.len_excluding_colors(logstr) < args.print_width else logstr[:args.print_width] + '[...]')
        if args.dry_run:
            continue
        with open(self.logfname, 'a') as logfile:  # context manager so the handle is closed even if the write fails
            logfile.write(logstr + '\n')
        start = time.time()
        try:
            check_call(cmd_str + ' 1>>' + self.logfname + ' 2>>' + self.logfname, shell=True)
        except CalledProcessError:  # the exception itself just says it exited with code != 0, so show the log tail instead
            print(' log tail: %s' % self.logfname)
            print(utils.pad_lines(check_output(['tail', self.logfname], universal_newlines=True)))
            sys.exit(1)  # raise Exception('exited with error')
        else:
            if args.quick:
                print('\n %s' % 'ok')
        self.run_times[name] = time.time() - start  # seconds

    if not args.quick and not args.dry_run:
        self.write_run_times()
|
461
|
+
|
462
|
+
# ----------------------------------------------------------------------------------------
|
463
|
+
def remove_reference_results(self, expected_content):
    """Delete everything in the reference dir, but only if it contains exactly <expected_content>.

    expected_content: set of base names that should be present in self.dirs('ref').
    Raises an Exception (after printing the differences) if the dir has extra or missing entries.
    With --dry-run, only prints what would be removed.
    """
    print(' removing ref files')
    refdir = self.dirs('ref')
    dir_content = {os.path.basename(f) for f in glob.glob(refdir + '/*')}
    unexpected = dir_content - expected_content
    missing = expected_content - dir_content
    if unexpected or missing:  # refuse to touch anything unless the dir looks exactly like we expect
        if unexpected:
            print('in ref dir but not expected\n %s' % (utils.color('red', ' '.join(unexpected))))
        if missing:
            print('expected but not in ref dir\n %s' % (utils.color('red', ' '.join(missing))))
        raise Exception('unexpected or missing content in reference dir (see above)')
    for entry in expected_content:
        fname = refdir + '/' + entry
        print(' %srm %s' % ('(would) ' if args.dry_run else '', fname))
        if args.dry_run:
            continue
        if os.path.isdir(fname):
            shutil.rmtree(fname)
        else:
            os.remove(fname)
|
480
|
+
|
481
|
+
# ----------------------------------------------------------------------------------------
|
482
|
+
def bust_cache(self):
    """Replace the reference results with the results from the run that just finished.

    Works out the set of names that should be in the reference dir, removes them from
    self.dirs('ref') (via self.remove_reference_results(), which checks the dir content first),
    then moves the entries with the same names from self.dirs('new') to self.dirs('ref').
    With --dry-run, only prints what would be done.
    """
    non_prod_outputs = [self.opath(tname) for tname in self.tests if not self.is_prod_test(tname)]
    expected_content = set(non_prod_outputs)
    expected_content.update(self.perfdirs.values())
    expected_content.update([os.path.basename(self.logfname), self.label])
    if not args.paired:
        expected_content |= set(self.all_ptn_cachefns())  # they're in the partition outdir if --paired is set, so don't need to be moved
    expected_content.add('run-times.csv')

    # remove (very, very gingerly) whole reference dir
    self.remove_reference_results(expected_content)

    # copy over parameters, simulation, and plots
    print(' mv new files to ref')
    for fname in expected_content:
        print(' mv %s --> %s/' % (fname, self.dirs('ref')))
        if not args.dry_run:
            shutil.move(self.dirs('new') + '/' + fname, self.dirs('ref') + '/')
|
499
|
+
|
500
|
+
# ----------------------------------------------------------------------------------------
|
501
|
+
def read_annotation_performance(self, version_stype, input_stype, these_are_cluster_annotations=False):
    """Read annotation performance for both the 'single' and 'multi' sequence multiplicities (in that order)."""
    multiplicities = ('single', 'multi')
    for mult in multiplicities:
        self.read_each_annotation_performance(
            mult,
            version_stype,
            input_stype,
            these_are_cluster_annotations=these_are_cluster_annotations,
        )
|
504
|
+
|
505
|
+
# ----------------------------------------------------------------------------------------
|
506
|
+
def read_each_annotation_performance(self, sequence_multiplicity, version_stype, input_stype, these_are_cluster_annotations=False):
    """Read the mean of each annotation performance histogram into self.perf_info.

    version_stype is the code version, while input_stype is the input data version, i.e.
    'ref', 'new' is the reference code version (last commit) run on the then-new simulation and
    parameters. <these_are_cluster_annotations> means this fcn is being called from within
    read_partition_performance(). Results go in
    self.perf_info[version_stype][input_stype][<test name>][<method>][<metric>].
    Returns without doing anything if the corresponding test isn't being run.
    """
    if these_are_cluster_annotations:
        name_parts, methods = ['partition'], ['hmm']
    elif sequence_multiplicity == 'single':
        name_parts, methods = ['annotate'], ['sw', 'hmm']
    elif sequence_multiplicity == 'multi':
        name_parts, methods = ['multi', 'annotate'], ['hmm']
    else:
        assert False
    ptest = '-'.join(name_parts + [input_stype, 'simu'])
    if (args.quick and ptest not in self.quick_tests) or ptest not in self.tests:  # ok i think now i'm adding the second clause i don't need the first, but not quite sure
        return

    # make sure the nested dicts exist before filling them
    vinfo = self.perf_info[version_stype]
    if input_stype not in vinfo:
        vinfo[input_stype] = OrderedDict()
    if ptest not in vinfo[input_stype]:
        vinfo[input_stype][ptest] = OrderedDict()
    perfdir = self.dirs(version_stype) + '/' + self.perfdirs[ptest]
    perffo = vinfo[input_stype][ptest]

    for method in methods:
        mdir = perfdir + '/' + method
        mfo = OrderedDict()  # arg, this is deeper than I'd like
        perffo[method] = mfo
        mfo['mean_hamming'] = Hist(fname=mdir + '/mutation/hamming_to_true_naive.csv').get_mean()
        for region in utils.regions + ['cdr3']:
            mfo[region + '_hamming'] = Hist(fname=mdir + '/mutation/' + region + '_hamming_to_true_naive.csv').get_mean()
        for bound in utils.boundaries:
            mfo[bound + '_insertion'] = Hist(fname=mdir + '/boundaries/' + bound + '_insertion.csv').get_mean(absval=True)
        for erosion in utils.real_erosions:
            mfo[erosion + '_del'] = Hist(fname=mdir + '/boundaries/' + erosion + '_del.csv').get_mean(absval=True)
|
536
|
+
|
537
|
+
# ----------------------------------------------------------------------------------------
|
538
|
+
def do_this_test(self, tstr, input_stype, pt):
    """Return True if test name <pt> contains both <tstr> and <input_stype>, and isn't excluded by --quick."""
    if tstr not in pt or input_stype not in pt:
        return False
    excluded_by_quick = args.quick and pt not in self.quick_tests
    return not excluded_by_quick
|
546
|
+
|
547
|
+
# ----------------------------------------------------------------------------------------
|
548
|
+
def read_partition_performance(self, version_stype, input_stype, debug=False):
    """Read partition performance for each partition test into self.perf_info.

    For each test whose name contains 'partition' and <input_stype>, reads the purity and
    completeness of the best partition from the test's yaml output in the <version_stype> dir.
    With --paired, values are averaged over loci, and (for non-seed tests) the pair cleaning
    performance fractions are also read. Without --paired, the cluster annotation performance
    is also read if the test has a performance dir.

    Raises an Exception if any purity/completeness value is None or if there is more than one
    seed subdir, and a ValueError if no per-locus partition files were found.
    """
    # ----------------------------------------------------------------------------------------
    def read_cpath(fname):
        """Return the (purity, completeness) pair for the best partition in <fname>."""
        _, _, cpath = utils.read_yaml_output(fname=fname, skip_annotations=True)
        ccfs = cpath.ccfs[cpath.i_best]
        if None in ccfs:
            raise Exception('none type ccf read from %s' % fname)
        if debug:
            print(' %5.2f %5.2f %-28s to true partition' % (ccfs[0], ccfs[1], fname)) #os.path.basename(fname))
        return ccfs
    # ----------------------------------------------------------------------------------------
    ptest_list = [k for k in self.tests.keys() if self.do_this_test('partition', input_stype, k)]
    if len(ptest_list) == 0:
        return
    if input_stype not in self.perf_info[version_stype]:
        self.perf_info[version_stype][input_stype] = OrderedDict()
    pinfo = self.perf_info[version_stype][input_stype]
    if debug:
        print(' version %s input %s partitioning' % (version_stype, input_stype))
        print(' purity completeness test description')
    for ptest in ptest_list:
        if ptest not in pinfo:
            pinfo[ptest] = OrderedDict()
        if args.paired:
            l_ccfs = []
            for locus in utils.sub_loci(args.ig_or_tr):
                ofn = '%s/partition-%s.yaml' % (self.opath(ptest, st=version_stype), locus)
                if 'seed-' in ptest:
                    sfns = glob.glob(ofn.replace('/partition-', '/seeds/*/partition-'))
                    if len(sfns) == 0:  # non-seed light chain
                        continue
                    if len(sfns) > 1:
                        # NOTE the '/seeds/' suffix is already in the format string, so only interpolate the dir name
                        raise Exception('multiple seed subdirs in %s/seeds/, you probably changed the seed seq and need to delete the old dir (yes --bust-cache should do this, but doesn\'t atm)' % os.path.dirname(ofn))
                    ofn = sfns[0]
                l_ccfs.append(read_cpath(ofn))
            if len(l_ccfs) == 0:  # otherwise the unpacking below fails with an uninformative message
                raise ValueError('no partition output files found for %s in %s' % (ptest, self.opath(ptest, st=version_stype)))
            pinfo[ptest]['purity'], pinfo[ptest]['completeness'] = [numpy.mean(lcfs) for lcfs in zip(*l_ccfs)]  # average over loci
            if 'seed-' not in ptest:
                htmp = Hist(fname='%s/true-pair-clean-performance.csv' % self.opath(ptest, st=version_stype))
                ttot = htmp.integral(False)
                for pcat in self.pair_clean_metrics:  # convert each bin to a fraction of the total
                    pinfo[ptest][pcat] = htmp.bin_contents[htmp.find_bin(None, label=pcat)] / float(ttot)
        else:
            ccfs = read_cpath(self.opath(ptest, st=version_stype))  # self.dirs(version_stype) + '/' + ptest + '.yaml')
            pinfo[ptest]['purity'], pinfo[ptest]['completeness'] = ccfs
            if ptest in self.perfdirs:
                self.read_each_annotation_performance('single', version_stype, input_stype, these_are_cluster_annotations=True)
|
595
|
+
|
596
|
+
# ----------------------------------------------------------------------------------------
|
597
|
+
def read_selection_metric_performance(self, version_stype, input_stype, debug=False):
    """Read selection metric values (and, with --paired, the chosen abs) into self.perf_info.

    For each test whose name contains 'get-selection' and <input_stype>, collects every value of
    each metric in self.selection_metrics from the selection metric yaml file(s) into a list in
    self.perf_info[version_stype][input_stype][<test name>][<metric>]. With --paired, also
    stores the set of chosen (h_id, l_id) pairs under the key 'chosen-abs'.
    """
    # ----------------------------------------------------------------------------------------
    def read_smfile(fname, smfo):
        """Add all values for each selection metric in <fname> to the lists in <smfo>."""
        if not os.path.exists(fname):  # probably e.g. igh+igl for a sample with only igh+igk
            print(' %s selection metric output file doesn\'t exist: %s' % (utils.wrnstr(), fname))
            return
        with open(fname) as yfile:
            lbfos = yaml.load(yfile, Loader=yaml.CLoader)  # NOTE(review): non-safe yaml loader, fine only as long as these are files we wrote ourselves
        for metric in self.selection_metrics:
            for cluster_fo in lbfos:  # one entry for each cluster
                smfo[metric].extend(cluster_fo['lb'][metric].values())
        if debug:
            print(' read lbfos for %d cluster%s from %s' % (len(lbfos), utils.plural(len(lbfos)), fname))
    # ----------------------------------------------------------------------------------------
    def read_chosen_abs(fname):
        """Return the set of (h_id, l_id) pairs in the chosen ab csv file <fname>."""
        with open(fname) as chfile:
            return {(row['h_id'], row['l_id']) for row in csv.DictReader(chfile)}
    # ----------------------------------------------------------------------------------------
    pinfo = self.perf_info[version_stype][input_stype]
    if debug:
        print(' version %s input %s selection metrics' % (version_stype, input_stype))
    for ptest in [k for k in self.tests.keys() if self.do_this_test('get-selection', input_stype, k)]:
        if ptest not in pinfo:  # perf_info should already have all the parent keys cause we run read_partition_performance() first
            pinfo[ptest] = OrderedDict([(m, []) for m in self.selection_metrics])
        if not args.paired:
            read_smfile(self.opath(ptest, st=version_stype), pinfo[ptest])
            continue
        # paired: selection metrics are written next to the partition output, one file per locus in each locus pair subdir
        partition_dir = self.opath(ptest.replace('get-selection-metrics', 'partition'), st=version_stype)
        for lpair in utils.locus_pairs[args.ig_or_tr]:
            for locus in lpair:
                smfname = '%s/%s/partition-%s-selection-metrics.yaml' % (partition_dir, '+'.join(lpair), locus)
                read_smfile(smfname, pinfo[ptest])
        if debug:
            print(' total values read: %s' % ' '.join('%s %d'%(m, len(pinfo[ptest][m])) for m in self.selection_metrics))
        pinfo[ptest]['chosen-abs'] = read_chosen_abs(self.opath(ptest, st=version_stype))
|
633
|
+
|
634
|
+
# ----------------------------------------------------------------------------------------
|
635
|
+
def compare_performance(self, input_stype):
    """Print a comparison of reference and new performance values for <input_stype> inputs.

    Prints three tables from the values in self.perf_info (which must already have been filled
    for both 'ref' and 'new'): annotation metrics, partition metrics, and selection metrics (plus
    a summary of chosen abs with --paired). Values whose fractional change is larger than the
    metric's entry in self.eps_vals (default 0.1) are colored red, those larger than
    self.tiny_eps yellow, and unchanged values are printed without the new value.

    Raises an Exception if the ref and new info for a test have different keys.
    """
    # ----------------------------------------------------------------------------------------
    def print_comparison_str(ref_val, new_val, epsval, fw=7, dp=3, pm=False):
        """Print <ref_val>, followed by an arrow and <new_val> (in color) if it has changed."""
        if ref_val == 0.:  # can't divide by zero, but a change away from zero is still a change
            fractional_change = 0. if new_val == 0. else float('inf')
        else:
            fractional_change = (new_val - ref_val) / float(ref_val)  # NOTE not the abs value yet
        if abs(fractional_change) > epsval:
            color = 'red'
        elif abs(fractional_change) > self.tiny_eps:
            color = 'yellow'
        else:
            color = None
        def floatstr(v):
            fmstr = '%%-%d.%df' % (fw, dp)
            if pm:  # always show the sign
                fmstr = fmstr.replace('%', '%+')
            return fmstr % v
        print(' %s%s ' % (floatstr(ref_val), (fw+4)*' ' if color is None else utils.color(color, '--> %s'%floatstr(new_val))), end=' ')

    # ----------------------------------------------------------------------------------------
    def check_same_keys(ptest):
        """Raise an exception if the ref and new info for <ptest> don't have the same keys."""
        if set(refpfo[ptest]) != set(newpfo[ptest]):
            raise Exception('different metrics in ref vs new:\n %s\n %s' % (sorted(refpfo[ptest]), sorted(newpfo[ptest])))

    # ----------------------------------------------------------------------------------------
    print(' performance with %s simulation and parameters (smaller is better for all annotation metrics)' % input_stype)
    all_annotation_ptests = ['annotate-' + input_stype + '-simu', 'multi-annotate-' + input_stype + '-simu', 'partition-' + input_stype + '-simu']  # hard code for order
    all_partition_ptests = [flavor + 'partition-' + input_stype + '-simu' for flavor in ['', 'vsearch-', 'seed-', 'subset-']]
    annotation_ptests = [pt for pt in all_annotation_ptests if pt in self.perf_info['ref'][input_stype]]
    partition_ptests = [pt for pt in all_partition_ptests if pt in self.perf_info['ref'][input_stype]]
    selection_metric_tests = ['get-selection-metrics-'+input_stype+'-simu']
    metricstrs = {
        'mean_hamming' : 'hamming',
        'v_hamming' : 'v ',
        'd_hamming' : 'd ',
        'j_hamming' : 'j ',
        'cdr3_hamming' : 'cdr3 ',
        'vd_insertion' : 'vd insert',
        'dj_insertion' : 'dj insert',
        'd_call' : 'd ',
        'j_call' : 'j ',
        'completeness' : 'compl.',
        'cons-dist-aa' : 'aa-cdist',
        'correct' : 'pair clean correct',
        'mispaired' : ' mispaired',
        'unpaired' : ' unpaired',
    }
    refpfo, newpfo = [self.perf_info[st][input_stype] for st in ['ref', 'new']]


    # print annotation header
    print('%8s %9s' % ('', ''), end=' ')
    for ptest in annotation_ptests:
        for method in [m for m in refpfo[ptest] if m in ['sw', 'hmm']]:  # 'if' is just to skip purity and completeness
            printstr = method
            if 'multi-annotate' in ptest:
                printstr = 'multi %s' % method
            if 'partition' in ptest:
                printstr = 'partition %s' % method
            print(' %-15s' % printstr, end=' ')
    print('')

    # print values
    if len(annotation_ptests) > 0 and 'hmm' in refpfo[annotation_ptests[0]]:  # it's not in there for paired partition test, since (at least atm) we don't do annotation tests for it
        allmetrics = list(refpfo[annotation_ptests[0]]['hmm'])
        for metric in allmetrics:
            alignstr = '' if len(metricstrs.get(metric, metric).strip()) < 5 else '-'
            print(('%8s %' + alignstr + '9s') % ('', metricstrs.get(metric, metric)), end=' ')
            for ptest in annotation_ptests:
                for method in [m for m in refpfo[ptest] if m in ['sw', 'hmm']]:  # 'if' is just to skip purity and completeness
                    check_same_keys(ptest)
                    print_comparison_str(refpfo[ptest][method][metric], newpfo[ptest][method][metric], self.eps_vals.get(metric, 0.1))
            print('')

    # print partition header
    print('%8s %5s' % ('', ''), end=' ')
    for ptest in partition_ptests:
        print(' %-18s' % ptest.split('-')[0], end=' ')
    print('')
    for metric in ['purity', 'completeness'] + self.pair_clean_metrics:
        alignstr = '' if len(metricstrs.get(metric, metric).strip()) < 5 else '-'
        print(('%8s %' + alignstr + '9s') % ('', metricstrs.get(metric, metric)), end=' ')
        for ptest in partition_ptests:
            if 'seed-' in ptest and metric in self.pair_clean_metrics:  # ick
                continue
            check_same_keys(ptest)
            print_comparison_str(refpfo[ptest][metric], newpfo[ptest][metric], self.eps_vals.get(metric, 0.1))
        print('')

    # selection metrics
    print(' %s' % ''.join(['%-23s'%metricstrs.get(m, m) for m in self.selection_metrics]))
    for mfname, mfcn in [('mean', numpy.mean), ('min', min), ('max', max), ('len', len)]:
        print(' %5s' % mfname, end=' ')
        for metric in self.selection_metrics:
            for ptest in selection_metric_tests:  # this'll break if there's more than one selection metric ptest
                check_same_keys(ptest)
                ref_list, new_list = [self.perf_info[rn][input_stype][ptest][metric] for rn in ['ref', 'new']]
                dp = 1 if metric=='cons-dist-aa' else 3
                if mfname=='len':  # lengths are integers
                    dp = 0
                print_comparison_str(mfcn(ref_list), mfcn(new_list), self.eps_vals.get(metric, 0.1), dp=dp, pm=metric=='cons-dist-aa')
        print('')
    if args.paired:
        ptest = utils.get_single_entry(selection_metric_tests)
        ref_abs, new_abs = refpfo[ptest]['chosen-abs'], newpfo[ptest]['chosen-abs']
        n_ref, n_new = len(ref_abs), len(new_abs)
        n_common = len(ref_abs & new_abs)
        n_only_ref, n_only_new = len(ref_abs - new_abs), len(new_abs - ref_abs)
        diffstr = ' ok'
        if n_ref != n_new or n_only_ref > 0 or n_only_new > 0:
            diffstr = ' %s in common, %s only in ref, %s only in new' % (utils.color(None if n_common==n_ref else 'red', str(n_common)), utils.color(None if n_only_ref==0 else 'red', str(n_only_ref)), utils.color(None if n_only_new==0 else 'red', str(n_only_new)))
        print(' chose %d abs %s%s' % (n_ref, '' if n_new==n_ref else utils.color('red', '--> %d'%n_new), diffstr))
|
745
|
+
|
746
|
+
# ----------------------------------------------------------------------------------------
|
747
|
+
def compare_production_results(self, ptests):
    """Recursively diff the output of each test in <ptests> between the dirs for each of self.stypes.

    Prints 'ok' for each test whose outputs are identical (as judged by `diff -qbr`), and
    otherwise a summary of how many files differ, or are present in only one of the dirs,
    followed by the diff command. Anything that diff writes to stderr is also printed.
    """
    print('diffing production results')
    for ptest in ptests:
        if args.quick and ptest not in self.quick_tests:
            continue
        fname = self.opath(ptest)  # sometimes a dir rather than a file
        print(' %-30s' % fname, end=' ')
        cmd = 'diff -qbr ' + ' '.join(self.dirs(st) + '/' + fname for st in self.stypes)
        proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True)
        out, err = proc.communicate()
        if proc.returncode == 0:
            print(' ok')
        else:
            outlines = out.split('\n')
            differlines = [l for l in outlines if 'differ' in l]
            onlylines = [l for l in outlines if 'Only' in l]
            print('')
            if len(differlines) > 0:
                n_total_files = int(check_output('find ' + self.dirs('ref') + '/' + fname + ' -type f | wc -l', shell=True, universal_newlines=True))
                if n_total_files == 1:
                    assert len(differlines) == 1
                    print(utils.color('red', ' file differs'), end=' ')
                else:
                    print(utils.color('red', ' %d / %d files differ' % (len(differlines), n_total_files)), end=' ')
            if len(onlylines) > 0:
                for st in self.stypes:
                    theseonlylines = [l for l in onlylines if self.dirs(st) + '/' + fname in l]
                    if len(theseonlylines) > 0:
                        print(utils.color('red', ' %d files only in %s' % (len(theseonlylines), st)), end=' ')
            if len(differlines) == 0 and len(onlylines) == 0:  # have to check the lengths, since comparing the lists themselves to zero is always false
                print(utils.color('red', ' not sure why, but diff returned %d' % proc.returncode), end=' ')
            print(' (%s)' % cmd)
        if err != '':
            print(err)
|
780
|
+
|
781
|
+
# ----------------------------------------------------------------------------------------
|
782
|
+
def write_run_times(self):
    """Write the run time of each test in self.run_times to run-times.csv in the new results dir."""
    outfname = self.dirs('new') + '/run-times.csv'
    with open(outfname, utils.csv_wmode()) as newfile:
        writer = csv.DictWriter(newfile, ('name', 'seconds'))
        writer.writeheader()
        writer.writerows({'name' : tname, 'seconds' : '%.1f'%tsecs} for tname, tsecs in self.run_times.items())  # one row per test, rounded to a tenth of a second
|
788
|
+
|
789
|
+
# ----------------------------------------------------------------------------------------
|
790
|
+
def compare_run_times(self):
    """Print the reference run time for each test, and how much the new run time differs.

    Reads run-times.csv from the dir for each of self.stypes. Fractional changes larger than 0.2
    are colored red, and those larger than 0.1 yellow. Tests with no time in either the ref or
    new file are reported (in red) rather than crashing.
    """
    print('checking run times')

    def read_run_times(stype):
        """Return a dict from test name to run time in seconds for <stype>."""
        with open(self.dirs(stype) + '/run-times.csv') as timefile:
            return {line['name'] : float(line['seconds']) for line in csv.DictReader(timefile)}

    times = {stype : read_run_times(stype) for stype in self.stypes}

    for name in self.tests:
        if args.quick and name not in self.quick_tests:
            continue
        if name not in times['ref']:  # e.g. a test that was added since the reference results were made
            print(' %30s %7s' % (name, ''), end=' ')
            print(' no ref time for %s' % utils.color('red', name))
            continue
        ref_time = times['ref'][name]
        print(' %30s %7.1f' % (name, ref_time), end=' ')
        if name not in times['new']:
            print(' no new time for %s' % utils.color('red', name))
            continue
        new_time = times['new'][name]
        if ref_time == 0.:  # times are written with one decimal place, so very fast tests can have a time of zero
            fractional_change = 0. if new_time == 0. else float('inf')
        else:
            fractional_change = (new_time - ref_time) / float(ref_time)
        if abs(fractional_change) > 0.2:
            print('--> %-5.1f %s' % (new_time, utils.color('red', '(%+.3f)' % fractional_change)), end=' ')
        elif abs(fractional_change) > 0.1:
            print('--> %-5.1f %s' % (new_time, utils.color('yellow', '(%+.3f)' % fractional_change)), end=' ')
        else:
            print(' ok ', end=' ')
        print('')
|
818
|
+
|
819
|
+
# ----------------------------------------------------------------------------------------
|
820
|
+
def compare_partition_cachefiles(self, input_stype, debug=False):
    """Compare the naive sequences and log probs in the ref and new partition cache files.

    Prints how many keys are in only one of the two files, and for the keys they have in common,
    how many naive sequences and log probs differ and the mean size of those differences.
    Not implemented for --paired.
    """
    # ----------------------------------------------------------------------------------------
    def print_key_differences(vtype, refkeys, newkeys):
        """Print the number of keys that are in only one, or in both, of the two sets."""
        print(' %s keys' % vtype)
        only_ref, only_new = refkeys - newkeys, newkeys - refkeys
        if len(only_ref) == 0 and len(only_new) == 0:
            print(' %d identical keys in new and ref cache' % len(refkeys))
            return
        if len(only_ref) > 0:
            print(utils.color('red', ' %d only in ref version' % len(only_ref)))
        if len(only_new) > 0:
            print(utils.color('red', ' %d only in new version' % len(only_new)))
        print(' %d in common' % len(refkeys & newkeys))
    # ----------------------------------------------------------------------------------------
    def readcache(fname):
        """Return the non-empty naive seqs and log probs in cache file <fname>, keyed by the 'unique_ids' column."""
        if debug: print(' reading partition cache from %s' % fname)
        naive_seqs, logprobs = {}, {}
        with open(fname) as cachefile:
            for line in csv.DictReader(cachefile):
                if line['naive_seq'] != '':
                    naive_seqs[line['unique_ids']] = line['naive_seq']
                if line['logprob'] != '':
                    logprobs[line['unique_ids']] = float(line['logprob'])
        return {'naive_seqs' : naive_seqs, 'logprobs' : logprobs}
    # ----------------------------------------------------------------------------------------
    def summary_strs(n_big, n_total, diffvals):
        """Return strings for the fraction of differing values and their mean (both red if anything differs)."""
        frac_str = '%3d / %4d' % (n_big, n_total)
        mean_str = '%.3f' % (numpy.average(diffvals) if len(diffvals) > 0 else 0.)
        if n_big > 0:
            frac_str, mean_str = utils.color('red', frac_str), utils.color('red', mean_str)
        return frac_str, mean_str
    # ----------------------------------------------------------------------------------------
    def compare_files(fname):
        print(' %s input partition cache file' % input_stype)
        refcache = readcache(self.dirs('ref') + '/' + fname)
        newcache = readcache(self.dirs('new') + '/' + fname)

        # work out intersection and complement
        refkeys, newkeys = {}, {}
        for vtype in ['naive_seqs', 'logprobs']:
            refkeys[vtype] = set(refcache[vtype].keys())
            newkeys[vtype] = set(newcache[vtype].keys())
            print_key_differences(vtype, refkeys[vtype], newkeys[vtype])

        # naive seqs: hamming fraction between ref and new for keys that are in both
        hamming_eps = 0.
        hammings = []  # only the ones that are larger than the threshold
        n_hammings, n_different_length = 0, 0
        for uids in refkeys['naive_seqs'] & newkeys['naive_seqs']:
            refseq, newseq = refcache['naive_seqs'][uids], newcache['naive_seqs'][uids]
            n_hammings += 1
            if len(refseq) != len(newseq):
                n_different_length += 1
                continue
            hfrac = utils.hamming_fraction(refseq, newseq)
            if hfrac > hamming_eps:
                hammings.append(hfrac)
        diff_hfracs_str, mean_hfrac_str = summary_strs(len(hammings), n_hammings, hammings)

        # log probs: absolute difference between ref and new for keys that are in both
        logprob_eps = 1e-5
        abs_delta_logprobs = []  # only the ones that are larger than the threshold
        n_delta_logprobs = 0
        for uids in refkeys['logprobs'] & newkeys['logprobs']:
            n_delta_logprobs += 1
            abs_delta = abs(refcache['logprobs'][uids] - newcache['logprobs'][uids])
            if abs_delta > logprob_eps:
                abs_delta_logprobs.append(abs_delta)
        diff_logprob_str, mean_logprob_str = summary_strs(len(abs_delta_logprobs), n_delta_logprobs, abs_delta_logprobs)

        print(' fraction different mean abs difference among differents')
        print(' naive seqs %s %s (hamming fraction)' % (diff_hfracs_str, mean_hfrac_str))
        print(' log probs %s %s' % (diff_logprob_str, mean_logprob_str))
        if n_different_length > 0:
            print(utils.color('red', ' %d different length' % n_different_length))

    # ----------------------------------------------------------------------------------------
    ptest = 'partition-' + input_stype + '-simu'
    if args.quick and ptest not in self.quick_tests:
        return

    if args.paired:
        assert False  # eh, probably not really any point
        # for locus in XXX:
        #     compare_files(self.ptn_cachefn(input_stype))
    else:
        compare_files(self.ptn_cachefn(input_stype))
|
917
|
+
|
918
|
+
# ----------------------------------------------------------------------------------------
|
919
|
+
# Command line interface and driver: parse the args, re-invoke this script for each slow/paired
# combination if requested, and otherwise run the tests once (and optionally update the reference results).
parser = argparse.ArgumentParser()
parser.add_argument('--dont-run', action='store_true', help='don\'t actually run anything, just check the results')
parser.add_argument('--dry-run', action='store_true', help='do all preparations to run, but don\'t actually run the commands, and don\'t check results')
parser.add_argument('--quick', action='store_true', help='only run one command: cache-parameters on a small number of simulation events')
parser.add_argument('--slow', action='store_true', help='by default, we run tests on a fairly small number of sequences, which is sufficient for checking that *nothing* has changed. But --slow is for cases where you\'ve made changes that you know will affect results, and you want to look at the details of how they\'re affected, for which you need to run on more sequences. Note that whether --slow is set or not (runs all tests with more or less sequences) is separate from --quick (which only runs one test).')
parser.add_argument('--run-coverage', action='store_true', help='instead of running the normal suite of tests (which compare results to make sure they haven\'t changed), instead run a series of commands that\'s designed to execute as many of the lines as possible (without comparing results).')
parser.add_argument('--prepend-coverage', action='store_true', help='run normal tests, but prepending coverage append commands')
parser.add_argument('--coverage-outdir', default='%s/partis/tmp/coverage' % os.getenv('fs', default=os.getenv('HOME')))
parser.add_argument('--bust-cache', action='store_true', help='overwrite current ref info, i.e. run this when things have changed, but you\'ve decided they\'re fine')
parser.add_argument('--only-bust-current', action='store_true', help='only bust cache for current command line args (as opposed to the default of busting caches for both slow and non-slow, paired and non-paired)')
parser.add_argument('--paired', action='store_true', help='run paired tests, i.e. with --paired-loci. Note that this doesn\'t test all the things (e.g. seed partitioning) that non-paired does.')
parser.add_argument('--run-all', action='store_true', help='run all four combinations of tests: paired/non-paired and slow/non-slow (by default only runs one). *Not* for use with --bust-cache, which runs all of them by default.')
parser.add_argument('--no-simu', action='store_true', help='don\'t run simulation, e.g. if using a minimal install')
parser.add_argument('--ig-or-tr', default='ig')
parser.add_argument('--print-width', type=int, default=300, help='set to 0 for infinite')

parser.add_argument('--glfo-dir', default='data/germlines/human')
parser.add_argument('--locus', default='igh')
args = parser.parse_args()
assert not (args.quick and args.slow)  # it just doesn't make sense
assert not (args.quick and args.paired)  # --quick ignores --paired, which is confusing

# fixed seeds, so results are repeatable
random.seed(0)
numpy.random.seed(0)

if args.print_width == 0:  # i.e. "infinite"
    args.print_width = 99999

if args.run_all or (args.bust_cache and not args.only_bust_current):  # run all four combos
    for slowval in [False, True]:
        for pairedval in [False, True]:
            # rerun this script with the original command line, except with --slow and --paired set for this combination
            clist = copy.deepcopy(sys.argv)
            utils.remove_from_arglist(clist, '--slow')
            utils.remove_from_arglist(clist, '--paired')
            if args.bust_cache:
                assert not args.run_all
                clist += ['--only-bust-current']  # so the subprocess doesn't also loop over all combinations
            else:
                utils.remove_from_arglist(clist, '--run-all')
            cmd_str = ' '.join(clist)
            if slowval:
                cmd_str += ' --slow'
            if pairedval:
                cmd_str += ' --paired'
            utils.simplerun(cmd_str, dryrun=args.dry_run)
    sys.exit(0)

tester = Tester()
if args.run_coverage:
    tester.run_coverage(args)
    sys.exit(0)
tester.test(args)
if args.bust_cache:
    tester.bust_cache()
|
973
|
+
|
974
|
+
# ----------------------------------------------------------------------------------------
|
975
|
+
def get_typical_variances():
    """Placeholder for the code that was used to get the expected variances hardcoded above.

    Always raises an Exception, since the archived code below needs updating to work as a function.
    NOTE don't delete this, since it was used (and might be needed again) to get those variances.
    """
    msg = 'needs updating to work as a function'
    raise Exception(msg)
    # cp = ClusterPath(fname='tmp.csv')
    # cp.print_partitions()
    # sys.exit()
    # cps = []
    # adj_mis, ccf_unders, ccf_overs = [], [], []
    # for iseed in range(6):
    #     # print 'seed %d' % iseed
    #     cp = ClusterPath(fname='%d.csv' % iseed)
    #     cp.print_partitions() #(cp.i_best) #, abbreviate=False)
    #     adj_mis.append(cp.adj_mis[cp.i_best])
    #     ccf_unders.append(cp.ccfs[cp.i_best][0])
    #     ccf_overs.append(cp.ccfs[cp.i_best][1])
    #     cps.append(cp)
    # def print_mean_variance(vals):
    #     mean = numpy.average(vals)
    #     variance = numpy.average((vals - mean)**2) #, weights=wgts)
    #     print 'mean %.2f std dev %.3f (%.1f%%)' % (mean, math.sqrt(variance), 100. * math.sqrt(variance) / mean)

    # # mean/var for six random seeds
    # print_mean_variance(adj_mis) # mean 0.61 std dev 0.053 (8.7%)
    # print_mean_variance(ccf_unders) # mean 0.74 std dev 0.026 (3.5%)
    # print_mean_variance(ccf_overs) # mean 0.90 std dev 0.015 (1.7%)
    # # for iseed in range(len(cps)):
    # #     icp = cps[iseed]
    # #     for jseed in range(iseed, len(cps)):
    # #         jcp = cps[jseed]
    # #         print ' %d %d %.3f' % (iseed, jseed, utils.adjusted_mutual_information(icp.partitions[icp.i_best], jcp.partitions[jcp.i_best]))
|
1005
|
+
|