partis-bcr 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. bin/FastTree +0 -0
  2. bin/add-chimeras.py +59 -0
  3. bin/add-seqs-to-outputs.py +81 -0
  4. bin/bcr-phylo-run.py +799 -0
  5. bin/build.sh +24 -0
  6. bin/cf-alleles.py +97 -0
  7. bin/cf-germlines.py +57 -0
  8. bin/cf-linearham.py +199 -0
  9. bin/chimera-plot.py +76 -0
  10. bin/choose-partially-paired.py +143 -0
  11. bin/circle-plots.py +30 -0
  12. bin/compare-plotdirs.py +298 -0
  13. bin/diff-parameters.py +133 -0
  14. bin/docker-hub-push.sh +6 -0
  15. bin/extract-pairing-info.py +55 -0
  16. bin/gcdyn-simu-run.py +223 -0
  17. bin/gctree-run.py +244 -0
  18. bin/get-naive-probabilities.py +126 -0
  19. bin/iqtree-1.6.12 +0 -0
  20. bin/lonr.r +1020 -0
  21. bin/makeHtml +52 -0
  22. bin/mds-run.py +46 -0
  23. bin/parse-output.py +277 -0
  24. bin/partis +1869 -0
  25. bin/partis-pip +116 -0
  26. bin/partis.py +1869 -0
  27. bin/plot-gl-set-trees.py +519 -0
  28. bin/plot-hmms.py +151 -0
  29. bin/plot-lb-tree.py +427 -0
  30. bin/raxml-ng +0 -0
  31. bin/read-bcr-phylo-trees.py +38 -0
  32. bin/read-gctree-output.py +166 -0
  33. bin/run-chimeras.sh +64 -0
  34. bin/run-dtr-scan.sh +25 -0
  35. bin/run-paired-loci.sh +100 -0
  36. bin/run-tree-metrics.sh +88 -0
  37. bin/smetric-run.py +62 -0
  38. bin/split-loci.py +317 -0
  39. bin/swarm-2.1.13-linux-x86_64 +0 -0
  40. bin/test-germline-inference.py +425 -0
  41. bin/tree-perf-run.py +194 -0
  42. bin/vsearch-2.4.3-linux-x86_64 +0 -0
  43. bin/vsearch-2.4.3-macos-x86_64 +0 -0
  44. bin/xvfb-run +194 -0
  45. partis_bcr-1.0.2.data/scripts/cf-alleles.py +97 -0
  46. partis_bcr-1.0.2.data/scripts/cf-germlines.py +57 -0
  47. partis_bcr-1.0.2.data/scripts/extract-pairing-info.py +55 -0
  48. partis_bcr-1.0.2.data/scripts/gctree-run.py +244 -0
  49. partis_bcr-1.0.2.data/scripts/parse-output.py +277 -0
  50. partis_bcr-1.0.2.data/scripts/split-loci.py +317 -0
  51. partis_bcr-1.0.2.data/scripts/test.py +1005 -0
  52. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/METADATA +1 -1
  53. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/RECORD +101 -51
  54. partis_bcr-1.0.2.dist-info/top_level.txt +1 -0
  55. {partis → python}/glutils.py +1 -1
  56. python/main.py +30 -0
  57. {partis → python}/plotting.py +10 -1
  58. {partis → python}/treeutils.py +18 -16
  59. {partis → python}/utils.py +14 -7
  60. packages/ham/bcrham +0 -0
  61. partis/main.py +0 -59
  62. partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
  63. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/WHEEL +0 -0
  64. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/entry_points.txt +0 -0
  65. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/licenses/COPYING +0 -0
  66. {partis → python}/__init__.py +0 -0
  67. {partis → python}/alleleclusterer.py +0 -0
  68. {partis → python}/allelefinder.py +0 -0
  69. {partis → python}/alleleremover.py +0 -0
  70. {partis → python}/annotationclustering.py +0 -0
  71. {partis → python}/baseutils.py +0 -0
  72. {partis → python}/cache/__init__.py +0 -0
  73. {partis → python}/cache/cached_uncertainties.py +0 -0
  74. {partis → python}/clusterpath.py +0 -0
  75. {partis → python}/coar.py +0 -0
  76. {partis → python}/corrcounter.py +0 -0
  77. {partis → python}/datautils.py +0 -0
  78. {partis → python}/event.py +0 -0
  79. {partis → python}/fraction_uncertainty.py +0 -0
  80. {partis → python}/gex.py +0 -0
  81. {partis → python}/glomerator.py +0 -0
  82. {partis → python}/hist.py +0 -0
  83. {partis → python}/hmmwriter.py +0 -0
  84. {partis → python}/hutils.py +0 -0
  85. {partis → python}/indelutils.py +0 -0
  86. {partis → python}/lbplotting.py +0 -0
  87. {partis → python}/mds.py +0 -0
  88. {partis → python}/mutefreqer.py +0 -0
  89. {partis → python}/paircluster.py +0 -0
  90. {partis → python}/parametercounter.py +0 -0
  91. {partis → python}/paramutils.py +0 -0
  92. {partis → python}/partitiondriver.py +0 -0
  93. {partis → python}/partitionplotter.py +0 -0
  94. {partis → python}/performanceplotter.py +0 -0
  95. {partis → python}/plotconfig.py +0 -0
  96. {partis → python}/processargs.py +0 -0
  97. {partis → python}/prutils.py +0 -0
  98. {partis → python}/recombinator.py +0 -0
  99. {partis → python}/scanplot.py +0 -0
  100. {partis → python}/seqfileopener.py +0 -0
  101. {partis → python}/treegenerator.py +0 -0
  102. {partis → python}/viterbicluster.py +0 -0
  103. {partis → python}/vrc01.py +0 -0
  104. {partis → python}/waterer.py +0 -0
bin/bcr-phylo-run.py ADDED
@@ -0,0 +1,799 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import argparse
5
+ import csv
6
+ import colored_traceback.always
7
+ import collections
8
+ import copy
9
+ import os
10
+ import sys
11
+ import numpy
12
+ import math
13
+ import time
14
+ import traceback
15
+ import itertools
16
+ import yaml
17
+ from io import open
18
+
19
+ from pathlib import Path
20
+ partis_dir = str(Path(__file__).parent.parent)
21
+ sys.path.insert(1, partis_dir) # + '/python')
22
+ import python.utils as utils
23
+ import python.indelutils as indelutils
24
+ import python.treeutils as treeutils
25
+ from python.event import RecombinationEvent
26
+ import python.paircluster as paircluster
27
+
28
+ bcr_phylo_path = os.getenv('PWD') + '/packages/bcr-phylo-benchmark'
29
+ ig_or_tr = 'ig'
30
+
31
+ # ----------------------------------------------------------------------------------------
32
+ def simdir():
33
+ return '%s/selection/simu' % args.base_outdir
34
+ def infdir():
35
+ return '%s/selection/partis' % args.base_outdir
36
+ def evtdir(i, igcr=None):
37
+ return '%s/event-%d%s' % (simdir(), i, '' if igcr is None else '/round-%d'%igcr)
38
+ def spath(tstr, unsampled=False): # use spath() for building command line args, whereas get_simfn() gets an individual file, e.g. for utils.output_exists(), as well as for passing to fcns in paircluster.py
39
+ if args.mutated_outpath and tstr == 'mutated':
40
+ opth = args.base_outdir
41
+ else:
42
+ opth = '%s/%s-simu' % (simdir(), tstr)
43
+ return '%s%s%s' % (opth, '-unsampled' if (args.tpsample and unsampled) else '', '' if args.paired_loci else '.yaml')
44
+ def sfname(tstr, ltmp, lpair=None, unsampled=False):
45
+ if ltmp is None: assert not args.paired_loci
46
+ return paircluster.paired_fn(spath(tstr, unsampled=unsampled), ltmp, lpair=lpair, suffix='.yaml') if args.paired_loci else spath(tstr, unsampled=unsampled)
47
+ def naive_fname(ltmp, lpair=None): # args are only used for paired loci (but we pass the whole fcn to another fcn, so we need the signature like this)
48
+ return sfname('naive', ltmp, lpair=lpair) #paircluster.paired_fn(spath('naive'), ltmp, lpair=lpair, suffix='.yaml') if args.paired_loci else spath('naive')
49
+ def bcr_phylo_fasta_fname(outdir):
50
+ return '%s/%s.fasta' % (outdir, args.extrastr)
51
+ def get_simfn(ltmp, lpair=None, joint=False): # NOTE joint has no effect, but is needed for passing to paircluster.write_concatd_output_files()
52
+ return sfname('mutated', ltmp, lpair=lpair)
53
+ def get_unsampled_simfn(ltmp, lpair=None, joint=False): # ugly to have this, but signature has to be this so we can pass it to fcns in paircluster
54
+ return sfname('mutated', ltmp, lpair=lpair, unsampled=True)
55
+ # ----------------------------------------------------------------------------------------
56
+ def ipath(stype): # path/file name for building command line args
57
+ rpath = infdir()
58
+ if args.paired_loci:
59
+ return rpath
60
+ assert stype in ['params', 'partition', 'plots']
61
+ rpath = '%s/%s' % (rpath, stype)
62
+ if stype == 'partition':
63
+ rpath += '.yaml'
64
+ return rpath
65
+ # ----------------------------------------------------------------------------------------
66
+ def ifname(stype, ltmp='igh'): # path/files for utils.output_exists()
67
+ rpath = ipath(stype)
68
+ if args.paired_loci:
69
+ if stype == 'partition':
70
+ rpath = paircluster.paired_fn(rpath, ltmp, suffix='.yaml', actstr=stype)
71
+ else:
72
+ rpath += '/parameters/%s' % ltmp
73
+ if stype == 'params':
74
+ rpath += '/hmm/hmms'
75
+ return rpath
76
+ # ----------------------------------------------------------------------------------------
77
+ def lpairs():
78
+ return utils.locus_pairs[ig_or_tr]
79
+ # ----------------------------------------------------------------------------------------
80
+ def rearrange():
81
+ if utils.output_exists(args, naive_fname('igh'), outlabel='naive simu', offset=4): # just look for the merged igh file, since it's about the last to be written (and both paired subdirs may not be there)
82
+ return
83
+ cmd = './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves' # tends to get into an infinite loop if you actually pass 0. (yes, I should fix this)
84
+ cmd += ' --debug %d --random-seed %d --n-sim-events %d' % (int(args.debug), args.seed, args.n_sim_events if not args.restrict_to_single_naive_seq else 1)
85
+ if args.paired_loci:
86
+ cmd += ' --paired-loci --paired-outdir %s' % spath('naive')
87
+ else:
88
+ cmd += ' --outfname %s' % spath('naive')
89
+ if args.restrict_available_genes:
90
+ assert not args.paired_loci
91
+ cmd += ' --only-genes IGHV1-18*01:IGHJ1*01'
92
+ if args.rearr_extra_args is not None:
93
+ cmd += ' %s' % args.rearr_extra_args
94
+ if args.single_light_locus is not None:
95
+ cmd += ' --single-light-locus %s' % args.single_light_locus
96
+ if args.n_procs > 1:
97
+ cmd += ' --n-procs %d' % args.n_procs
98
+ if args.slurm:
99
+ cmd += ' --batch-system slurm'
100
+ utils.simplerun(cmd, dryrun=args.dry_run, debug=True)
101
+
102
+ # ----------------------------------------------------------------------------------------
103
+ def get_vpar_val(parg, pval, debug=False): # get value of parameter/command line arg that is allowed to (but may not at the moment) be drawn from a variable distribution (note we have to pass in <pval> for args that are lists)
104
+ if args.parameter_variances is None or parg not in args.parameter_variances: # default: just use the single, fixed value from the command line
105
+ return pval
106
+ if args.n_gc_rounds is not None and parg in ['obs-times', 'n-sim-seqs-per-generation']:
107
+ raise Exception('shouldn\'t get here (see exception elsewhere)')
108
+ def sfcn(x): # just for dbg/exceptions
109
+ return str(int(x)) if parg != 'selection-strength' else ('%.2f' % x)
110
+ pvar = args.parameter_variances[parg]
111
+ if '..' in pvar: # list of allowed values NOTE pval is *not* used if we're choosing from several choices (ick, but not sure what else to do)
112
+ dbgstr = '[%s]' % pvar.replace('..', ', ')
113
+ return_val = numpy.random.choice([float(pv) for pv in pvar.split('..')])
114
+ else: # actual parameter variance (i know, this is ugly)
115
+ parg_bounds = {'min' : {'n-sim-seqs-per-generation' : 1}, 'max' : {}}
116
+ pmean = pval
117
+ pvar = float(pvar)
118
+ pmin, pmax = pmean - 0.5 * pvar, pmean + 0.5 * pvar
119
+ if pmin < 0:
120
+ raise Exception('min parameter value for %s less than 0 (from mean %s and half width %s)' % (parg, sfcn(pmean), sfcn(pvar)))
121
+ if parg == 'selection-strength' and pmax > 1:
122
+ raise Exception('max parameter value for %s greater than 1 (from mean %s and half width %s)' % (parg, sfcn(pmean), sfcn(pvar)))
123
+ if parg in parg_bounds['min'] and pmin < parg_bounds['min'][parg]:
124
+ raise Exception('min value too small for %s: %f < %f' % (parg, pmin, parg_bounds['min'][parg]))
125
+ if parg in parg_bounds['max'] and pmax > parg_bounds['max'][parg]:
126
+ raise Exception('max value too large for %s: %f > %f' % (parg, pmax, parg_bounds['max'][parg]))
127
+ dbgstr = '[%6s, %6s]' % (sfcn(pmin), sfcn(pmax))
128
+ return_val = numpy.random.uniform(pmin, pmax)
129
+ if parg != 'selection-strength':
130
+ return_val = int(return_val)
131
+ if debug:
132
+ print(' %30s --> %-7s %s' % (dbgstr, sfcn(return_val), parg))
133
+ return return_val
134
+
135
+ # ----------------------------------------------------------------------------------------
136
+ def run_bcr_phylo(naive_seq, outdir, ievent, uid_str_len=None, igcr=None):
137
+ if utils.output_exists(args, bcr_phylo_fasta_fname(outdir), outlabel='bcr-phylo', offset=4):
138
+ return None
139
+
140
+ cmd = '%s/bin/simulator.py' % bcr_phylo_path
141
+ if args.run_help:
142
+ cmd += ' --help'
143
+ else:
144
+ cmd += ' --lambda0 %f' % args.base_mutation_rate
145
+ if args.no_selection:
146
+ cmd += ' --no_selection'
147
+ else:
148
+ cmd += ' --selection_strength %f' % get_vpar_val('selection-strength', args.selection_strength)
149
+ for astr in ['obs-times', 'n-sim-seqs-per-generation']: # for search: obs_times n_sim_seqs_per_generation
150
+ aval = getattr(args, astr.replace('-', '_'))
151
+ tstr = ' '.join('%d' % get_vpar_val(astr, t) for t in (aval if args.n_gc_rounds is None else aval[igcr]))
152
+ cmd += ' --%s %s' % (astr.replace('n-sim-seqs-per-generation', 'n-to-sample').replace('-', '_'), tstr) # ick
153
+ cmd += ' --metric_for_target_dist %s' % args.metric_for_target_distance
154
+ if args.multifurcating_tree:
155
+ cmd += ' --multifurcating_tree'
156
+ if args.aa_paratope_positions is not None:
157
+ cmd += ' --aa_paratope_positions %s' % args.aa_paratope_positions
158
+ if args.aa_struct_positions is not None:
159
+ cmd += ' --aa_struct_positions %s' % args.aa_struct_positions
160
+ if args.dont_mutate_struct_positions:
161
+ cmd += ' --dont_mutate_struct_positions'
162
+ if args.skip_stops:
163
+ cmd += ' --skip_stops_when_mutating'
164
+ if args.allow_stops:
165
+ cmd += ' --allow_stops_in_functional_seqs'
166
+ cmd += ' --target_dist %d' % args.target_distance
167
+ cmd += ' --target_count %d' % args.target_count
168
+ cmd += ' --carry_cap %d' % get_vpar_val('carry-cap', args.carry_cap)
169
+ if not args.dont_observe_common_ancestors:
170
+ cmd += ' --observe_common_ancestors'
171
+ if args.leaf_sampling_scheme is not None:
172
+ cmd += ' --leaf_sampling_scheme %s' % args.leaf_sampling_scheme
173
+ if args.n_target_clusters is not None:
174
+ cmd += ' --n_target_clusters %d' % args.n_target_clusters
175
+ # cmd += ' --target_cluster_distance 1'
176
+ if args.tdist_scale is not None:
177
+ cmd += ' --tdist_scale %d' % args.tdist_scale
178
+ if args.tdist_weights is not None:
179
+ cmd += ' --tdist_weights %s' % args.tdist_weights
180
+ if args.min_target_distance is not None:
181
+ cmd += ' --min_target_distance %d' % args.min_target_distance
182
+ if args.min_effective_kd is not None:
183
+ cmd += ' --min_effective_kd %d' % args.min_effective_kd
184
+ if args.n_naive_seq_copies is not None:
185
+ cmd += ' --n_naive_seq_copies %d' % args.n_naive_seq_copies
186
+ if args.n_gc_rounds is not None and igcr > 0:
187
+ init_fn = '%s/init-seqs.fa' % outdir
188
+ if not args.dry_run:
189
+ isfos = utils.read_fastx(bcr_phylo_fasta_fname(evtdir(ievent, igcr=igcr - 1)))
190
+ if args.n_reentry_seqs is not None:
191
+ if args.n_reentry_seqs > len(isfos):
192
+ print(' %s --n-reentry-seqs %d greater than number of observed seqs %d in %s' % (utils.wrnstr(), args.n_reentry_seqs, len(isfos), bcr_phylo_fasta_fname(evtdir(ievent, igcr=igcr - 1))))
193
+ isfos = numpy.random.choice(isfos, size=args.n_reentry_seqs, replace=False)
194
+ utils.write_fasta(init_fn, isfos)
195
+ cmd += ' --initial_seq_file %s' % init_fn
196
+
197
+ cmd += ' --debug %d' % args.debug
198
+ cmd += ' --n_tries 1000'
199
+ if args.context_depend == 0:
200
+ cmd += ' --no_context'
201
+ cmd += ' --no_plot'
202
+ if args.only_csv_plots:
203
+ cmd += ' --dont_write_hists'
204
+ cmd += ' --outbase %s/%s' % (outdir, args.extrastr)
205
+ cmd += ' --random_seed %d' % (args.seed + ievent) # NOTE if args.n_gc_rounds is set, it's *really* important that this is the same for each round since it ensures we have the same target sequence
206
+ if uid_str_len is not None:
207
+ cmd += ' --uid_str_len %d' % uid_str_len
208
+ cmd += ' --naive_seq %s' % naive_seq
209
+
210
+ if not os.path.exists(outdir):
211
+ os.makedirs(outdir)
212
+
213
+ cfo = None
214
+ if args.n_procs == 1:
215
+ utils.run_ete_script(cmd, dryrun=args.dry_run)
216
+ else:
217
+ cmd = utils.run_ete_script(cmd, return_for_cmdfos=True, dryrun=args.dry_run)
218
+ cfo = {'cmd_str' : cmd, 'workdir' : outdir, 'outfname' : bcr_phylo_fasta_fname(outdir)}
219
+ sys.stdout.flush()
220
+ return cfo
221
+
222
+ # ----------------------------------------------------------------------------------------
223
+ def parse_bcr_phylo_output(glfos, naive_events, outdir, ievent, uid_info):
224
+ # ----------------------------------------------------------------------------------------
225
+ def split_seqfos(seqfos):
226
+ hline, lline = naive_events[ievent]
227
+ hseqfos, lseqfos = [], []
228
+ for sfo in seqfos:
229
+ padseq = utils.pad_nuc_seq(hline['naive_seq'])
230
+ assert len(sfo['seq']) == len(padseq) + len(lline['naive_seq'])
231
+ hseqfos.append({'name' : sfo['name'], 'seq' : sfo['seq'][ : len(hline['naive_seq'])]})
232
+ lseqfos.append({'name' : sfo['name'], 'seq' : sfo['seq'][len(padseq) : ]})
233
+ return hseqfos, lseqfos
234
+ # ----------------------------------------------------------------------------------------
235
+ def read_kdvals(kdfname):
236
+ nodefo = {}
237
+ with open(kdfname) as kdfile:
238
+ reader = csv.DictReader(kdfile)
239
+ for line in reader:
240
+ nodefo[line['uid']] = {
241
+ 'kd' : float(line['kd']),
242
+ 'relative_kd' : float(line['relative_kd']),
243
+ 'lambda' : float(line['lambda']) if line['lambda'] != '' else None, # bcr-phylo used to not run the lambda update fcn after the last iteration, which resulted in empty lambda values; it shouldn't happen any more, but this is left here for backwards compatibility
244
+ 'target_index' : int(line['target_index']),
245
+ 'target_distance' : float(line['target_distance']),
246
+ 'time' : int(line['time']),
247
+ }
248
+ return nodefo
249
+ # ----------------------------------------------------------------------------------------
250
+ def get_mature_line(sfos, naive_line, glfo, nodefo, dtree, target_sfos, dup_translations=None, locus=None):
251
+ assert len(naive_line['unique_ids']) == 1 # enforces that we ran naive-only, 1-leaf partis simulation above
252
+ assert not indelutils.has_indels(naive_line['indelfos'][0]) # would have to handle this below
253
+ if args.debug:
254
+ utils.print_reco_event(naive_line)
255
+ reco_info = collections.OrderedDict()
256
+ for sfo in sfos:
257
+ mline = utils.get_non_implicit_copy(naive_line)
258
+ del mline['tree']
259
+ mline['unique_ids'] = [sfo['name']]
260
+ mline['seqs'] = [sfo['seq']]
261
+ mline['input_seqs'] = [sfo['seq']] # it's really important to set both the seqs (since they're both already in there from the naive line)
262
+ mline['duplicates'] = [[]]
263
+ reco_info[sfo['name']] = mline
264
+ try:
265
+ utils.add_implicit_info(glfo, mline)
266
+ except Exception: # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it crashes on unequal naive/mature sequence lengths, and it's needed to track down which event crashed) UPDATE: it was just because something crashed in the middle of writing a .fa file
267
+ print('implicit info adding failed for ievent %d in %s' % (ievent, outdir))
268
+ lines = traceback.format_exception(*sys.exc_info())
269
+ print(utils.pad_lines(''.join(lines))) # NOTE this will still crash on the next line if implicit info adding failed
270
+ final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in sfos], reco_info)
271
+
272
+ ftree = copy.deepcopy(dtree)
273
+ if locus is not None:
274
+ def ltr(u): return u + '-' + locus
275
+ new_nodefo = {}
276
+ for u_old in nodefo:
277
+ new_nodefo[ltr(u_old)] = nodefo[u_old]
278
+ nodefo = new_nodefo
279
+ treeutils.translate_labels(ftree, [(u, ltr(u)) for u in final_line['unique_ids']])
280
+ final_line['unique_ids'] = [ltr(u) for u in final_line['unique_ids']]
281
+ assert len(sfos) == len(final_line['unique_ids'])
282
+ for iseq, sfo in enumerate(sfos):
283
+ naive_id = naive_line['unique_ids'][0]
284
+ assert naive_id.count('-') == 1
285
+ bstr = naive_id.replace('-'+locus, '')
286
+ pids = final_line['paired-uids'][iseq]
287
+ assert len(pids) == 1 and pids[0].find(bstr) == 0 and pids[0].count('-') == 1 and pids[0].split('-')[1] in utils.loci # if uid is xxx-igh, paired id should be e.g. xxx-igk
288
+ final_line['paired-uids'][iseq] = [p.replace(bstr, sfo['name']) for p in pids]
289
+
290
+ tmp_trns = {}
291
+ for iu, old_id in enumerate(final_line['unique_ids']): # NOTE this only translates the uids, for paired h/l we still need to go back through and translate paired uids
292
+ if old_id in uid_info['all_uids']:
293
+ new_id, uid_info['n_duplicate_uids'] = utils.choose_non_dup_id(old_id, uid_info['n_duplicate_uids'], uid_info['all_uids'])
294
+ tmp_trns[old_id] = new_id
295
+ final_line['unique_ids'][iu] = new_id
296
+ uid_info['all_uids'].add(final_line['unique_ids'][iu])
297
+ if len(tmp_trns) > 0:
298
+ for old_id, new_id in tmp_trns.items():
299
+ nodefo[new_id] = nodefo[old_id]
300
+ del nodefo[old_id]
301
+ treeutils.translate_labels(ftree, [(o, n) for o, n in tmp_trns.items()], expect_missing=True)
302
+ if dup_translations is not None:
303
+ dup_translations.update(tmp_trns)
304
+
305
+ if len(set(nodefo) - set(final_line['unique_ids'])) > 0: # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
306
+ print(' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids'])))
307
+ if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
308
+ print(' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo)))
309
+ final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
310
+ if args.affinity_measurement_error is not None:
311
+ # before = final_line['affinities']
312
+ final_line['affinities'] = [numpy.random.normal(a, args.affinity_measurement_error * a) for a in final_line['affinities']]
313
+ # print ' '.join('%.4f'%v for v in before)
314
+ # print ' '.join('%.4f'%v for v in final_line['affinities'])
315
+ final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
316
+ final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
317
+ final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
318
+ final_line['min_target_distances'] = [nodefo[u]['target_distance'] for u in final_line['unique_ids']]
319
+ final_line['generation-times'] = [nodefo[u]['time'] for u in final_line['unique_ids']]
320
+ ftree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']])) # note that if --paired-loci is set then most edges will still be the wrong length (compared to the mutations in the single-locus sequences), i.e. best not to use this much until treeutils.combine_selection_metrics(), where we rescale to the full h+l length
321
+ # treeutils.compare_tree_distance_to_shm(ftree, final_line, debug=True)
322
+ if args.debug:
323
+ print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree), padwidth=12))
324
+ final_line['tree'] = ftree.as_string(schema='newick')
325
+ if args.debug:
326
+ utils.print_reco_event(final_line) #, extra_print_keys=['lambdas'])
327
+
328
+ tmp_event = RecombinationEvent(glfo) # I don't want to move the function out of event.py right now
329
+ tmp_event.set_reco_id(final_line, irandom=ievent) # not sure that setting <irandom> here actually does anything
330
+ final_line['target_seqs'] = [[tfo['seq'] for tfo in target_sfos] for _ in final_line['unique_ids']] # NOTE it would be nice to put this in a per-family key, but it ends up we want it to behave like an input meta info key, and input meta keys need to be per-seq since they're inherently a property of each sequence. So instead we duplicate this info across all seqs to which it applies
331
+ return final_line
332
+ # ----------------------------------------------------------------------------------------
333
+ def translate_duplicate_pids(mpair, dup_translations):
334
+ if len(dup_translations) == 0:
335
+ return
336
+ assert len(set(len(l['unique_ids']) for l in mpair)) == 1 # make sure h and l annotations have the same length
337
+ for atn1, atn2 in itertools.permutations(mpair, 2):
338
+ # print ':'.join([utils.color('red' if atn1['paired-uids'][i]!=[u] else None, u) for i, u in enumerate(atn2['unique_ids'])])
339
+ for pids, uid in zip(atn1['paired-uids'], atn2['unique_ids']): # this is just to double check things, so could be removed
340
+ assert len(pids) == 1
341
+ if pids[0] != uid:
342
+ assert pids[0] in dup_translations and dup_translations[pids[0]] == uid
343
+ del dup_translations[pids[0]]
344
+ atn1['paired-uids'] = [[u] for u in atn2['unique_ids']] # seems a bit hacky to reset all of them, not just the translated one, but whatever
345
+
346
+ # ----------------------------------------------------------------------------------------
347
+ # extract kd values from pickle file (used to need a separate script since ete3 needed python 3, but now probably doesn't)
348
+ kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
349
+ if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4): # eh, don't really need to check for both kd and nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
350
+ cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
351
+ utils.run_ete_script(cmd, debug=args.n_procs==1)
352
+ nodefo = read_kdvals(kdfname)
353
+ dtree = treeutils.get_dendro_tree(treefname=nwkfname)
354
+ seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir)) # output mutated sequences from bcr-phylo
355
+ target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
356
+ if args.paired_loci:
357
+ mpair = []
358
+ dup_translations = {}
359
+ for tline, sfos, tsfos in zip(naive_events[ievent], split_seqfos(seqfos), split_seqfos(target_seqfos)):
360
+ mpair.append(get_mature_line(sfos, tline, glfos[tline['loci'][0]], nodefo, dtree, target_seqfos, dup_translations=dup_translations, locus=tline['loci'][0]))
361
+ translate_duplicate_pids(mpair, dup_translations)
362
+ return mpair
363
+ else:
364
+ return get_mature_line(seqfos, naive_events[ievent], glfos[0], nodefo, dtree, target_seqfos)
365
+
366
+ # ----------------------------------------------------------------------------------------
367
+ def read_rearrangements():
368
+ if args.paired_loci:
369
+ lp_infos = paircluster.read_lpair_output_files(lpairs(), naive_fname, dbgstr='naive simulation')
370
+ naive_events = paircluster.get_all_antn_pairs(lp_infos)
371
+ glfos, _, _ = paircluster.concat_heavy_chain(lpairs(), lp_infos) # per-locus glfos with concat'd heavy chain
372
+ else:
373
+ glfo, naive_events, _ = utils.read_output(naive_fname(None))
374
+ glfos = [glfo]
375
+ return glfos, naive_events
376
+
377
+ # ----------------------------------------------------------------------------------------
378
+ def write_simulation(glfos, mutated_events, unsampled=False):
379
+ opath = spath('mutated', unsampled=unsampled)
380
+ print(' writing%s annotations to %s' % ('' if not args.tpsample else (' unsampled' if unsampled else ' timepoint sampled'), opath))
381
+ mheads = []
382
+ if args.n_gc_rounds is not None:
383
+ mheads += ['gc-rounds', 'generation-times']
384
+ if args.tpsample:
385
+ mheads += ['timepoints']
386
+ headers = utils.simulation_headers + mheads
387
+ if args.paired_loci:
388
+ lp_infos = {}
389
+ for lpair in lpairs():
390
+ lpfos = {k : {} for k in ['glfos', 'antn_lists', 'cpaths']} # cpaths i think don't get used
391
+ mevents = [(hl, ll) for hl, ll in mutated_events if [hl['loci'][0], ll['loci'][0]] == lpair] # grab the events for this h/l pair
392
+ for ltmp, levents in zip(lpair, zip(*mevents)):
393
+ lpfos['antn_lists'][ltmp] = levents
394
+ lpfos['glfos'][ltmp] = glfos[ltmp]
395
+ lp_infos[tuple(lpair)] = lpfos
396
+ sfcn = get_unsampled_simfn if (args.tpsample and unsampled) else get_simfn
397
+ paircluster.write_lpair_output_files(lpairs(), lp_infos, sfcn, headers)
398
+ glfos, antn_lists, _ = paircluster.concat_heavy_chain(lpairs(), lp_infos) # per-locus glfos with concat'd heavy chain
399
+ paircluster.write_concatd_output_files(glfos, antn_lists, sfcn, headers)
400
+ outfos, metafos = paircluster.get_combined_outmetafos(antn_lists, extra_meta_headers=mheads)
401
+ paircluster.write_combined_fasta_and_meta(opath+'/all-seqs.fa', opath+'/meta.yaml', outfos, metafos)
402
+ else:
403
+ utils.write_annotations(opath, glfos[0], mutated_events, headers)
404
+
405
+ # ----------------------------------------------------------------------------------------
406
+ def combine_gc_rounds(glfos, mevt_lists):
407
+ # ----------------------------------------------------------------------------------------
408
+ def fix_evt(igcr, sum_time, evt, all_gtimes, ltmp=None):
409
+ evt['generation-times'] = [t + sum_time for t in evt['generation-times']]
410
+ evt['gc-rounds'] = [igcr for _ in evt['unique_ids']]
411
+ if ltmp is None:
412
+ def tfcn(u, t): return '%s-%s'%(u, t)
413
+ else:
414
+ def tfcn(u, t): lstr = u.split('-')[-1]; assert lstr in utils.loci; return u.replace('-'+lstr, '-%s-%s'%(t, lstr))
415
+ trns = {u : tfcn(u, t) for u, t in zip(evt['unique_ids'], evt['generation-times'])}
416
+ if ltmp is not None:
417
+ trns.update({p : tfcn(p, t) for pids, t in zip(evt['paired-uids'], evt['generation-times']) for p in pids})
418
+ utils.translate_uids([evt], trns=trns, translate_pids=args.paired_loci) # kind of annoying to add the timepoint to the uid, but otherwise we get duplicate uids in different rounds (and we can't change the random seed atm, or else the target seqs will be out of whack)
419
+ all_gtimes |= set(evt['generation-times'])
420
+ # ----------------------------------------------------------------------------------------
421
+ if utils.output_exists(args, get_simfn('igh'), outlabel='mutated simu', offset=4):
422
+ return None
423
+ assert len(mevt_lists) == args.n_gc_rounds
424
+ assert len(set(len(l) for l in mevt_lists)) == 1 # all rounds should have the same number of events
425
+ sum_time = 0
426
+ assert len(args.obs_times) == args.n_gc_rounds # also checked elsewhere
427
+ all_gtimes, stlist = set(), []
428
+ for igcr in range(args.n_gc_rounds):
429
+ if args.paired_loci:
430
+ for epair in mevt_lists[igcr]:
431
+ for evt in epair:
432
+ fix_evt(igcr, sum_time, evt, all_gtimes, ltmp=evt['loci'][0])
433
+ else:
434
+ for evt in mevt_lists[igcr]:
435
+ fix_evt(igcr, sum_time, evt, all_gtimes)
436
+ sum_time += args.obs_times[igcr][-1]
437
+ stlist.append(sum_time)
438
+ print(' merging %d events over %d gc rounds with final generation times%s: %s' % (len(mevt_lists[0]), args.n_gc_rounds, '' if args.dont_observe_common_ancestors else ' (including observed common ancestors)', ' '.join(utils.color('blue' if t in stlist else None, str(t)) for t in sorted(all_gtimes))))
439
+ merged_events = []
440
+ for ievt in range(len(mevt_lists[0])):
441
+ if args.paired_loci:
442
+ mpair = []
443
+ lpair = [l['loci'][0] for l in mevt_lists[0][0]]
444
+ for ilocus, ltmp in enumerate(lpair):
445
+ mgevt = utils.combine_events(glfos[ltmp], [evts[ievt][ilocus] for evts in mevt_lists], meta_keys=['gc-rounds', 'generation-times'])
446
+ mpair.append(mgevt)
447
+ merged_events.append(mpair)
448
+ else:
449
+ mgevt = utils.combine_events(glfos[0], [evts[ievt] for evts in mevt_lists], meta_keys=['gc-rounds', 'generation-times'])
450
+ merged_events.append(mgevt)
451
+
452
+ write_simulation(glfos, merged_events, unsampled=args.tpsample)
453
+
454
+ return merged_events
455
+
456
+ # ----------------------------------------------------------------------------------------
457
+ def sample_tp_seqs(glfos, evt_list, l_evts=None, ltmp=None):
458
+ id_list = [u for l in evt_list for u in l['unique_ids']]
459
+ if len(set(id_list)) != len(id_list):
460
+ raise Exception('duplicate ids in final events') # shouldn't be able to get to here, but if it does it'll break stuff below
461
+ for fevt in evt_list:
462
+ if 'timepoints' in fevt: # shouldn't be in there
463
+ print(' %s \'timepoints\' already in event (overwriting)' % utils.wrnstr())
464
+ fevt['timepoints'] = [None for _ in fevt['unique_ids']]
465
+ all_gtimes = set(t for l in evt_list for t in l['generation-times'])
466
+ gt_ids = {t : [] for t in all_gtimes} # map from each generation time to list of all remaining uids with that time
467
+ for tline in evt_list:
468
+ for tid, gtime in zip(tline['unique_ids'], tline['generation-times']):
469
+ gt_ids[gtime].append(tid)
470
+ print(' N generation N N')
471
+ print(' timepoint total time chosen remaining')
472
+ for tpfo in args.sequence_sample_times:
473
+ if any(t not in all_gtimes for t in tpfo['times']):
474
+ raise Exception('generation time %s not among actual final times: %s' % (' '.join(str(t) for t in tpfo['times'] if t not in all_gtimes), ' '.join(str(t) for t in sorted(all_gtimes))))
475
+ allowed_uids = [u for gt in tpfo['times'] for u in gt_ids[gt]]
476
+ if tpfo['n'] > len(allowed_uids):
477
+ print(' %s not enough allowed seqs remain (%d > %d, probably didn\'t sample enough sequences at allowed times %s)' % (utils.wrnstr(), tpfo['n'], len(allowed_uids), ' '.join(str(t) for t in tpfo['times'])))
478
+ chosen_ids = numpy.random.choice(allowed_uids, size=tpfo['n'], replace=False)
479
+ if len(chosen_ids) != tpfo['n']:
480
+ print(' %s couldn\'t choose enough seqs (only got %d)' % (utils.wrnstr(), len(chosen_ids)))
481
+ n_chosen = {}
482
+ for gtime in tpfo['times']:
483
+ n_before = len(gt_ids[gtime])
484
+ gt_ids[gtime] = [u for u in gt_ids[gtime] if u not in chosen_ids]
485
+ n_chosen[gtime] = n_before - len(gt_ids[gtime])
486
+ for igt, gtime in enumerate(tpfo['times']):
487
+ print(' %12s %3s %3d %4d %4d' % (tpfo['name'] if igt==0 else '', '%d'%tpfo['n'] if igt==0 else '', gtime, n_chosen[gtime], len(gt_ids[gtime])))
488
+ for fevt in evt_list:
489
+ fevt['timepoints'] = [tpfo['name'] if u in chosen_ids else t for u, t in zip(fevt['unique_ids'], fevt['timepoints'])]
490
+ for ievt, fevt in enumerate(evt_list):
491
+ iseqs_to_keep = [i for i, t in enumerate(fevt['timepoints']) if t is not None]
492
+ if l_evts is not None:
493
+ hevt, levt = fevt, l_evts[ievt]
494
+ hloc, lloc = [e['loci'][0] for e in [hevt, levt]]
495
+ def htrans(u, hloc, lloc): return u.replace('-'+hloc, '-'+lloc)
496
+ assert [htrans(u, hloc, lloc) for u in hevt['unique_ids']] == levt['unique_ids']
497
+ levt['timepoints'] = [t for t in hevt['timepoints']] # NOTE *has* to happen before restrict_to_iseqs() (duh)
498
+ utils.restrict_to_iseqs(levt, iseqs_to_keep, glfos[levt['loci'][0]], remove_tree=True)
499
+ utils.restrict_to_iseqs(fevt, iseqs_to_keep, glfos[0 if ltmp is None else ltmp], remove_tree=True)
500
+ # utils.print_reco_event(levt, extra_print_keys=['timepoints', 'gc-rounds', 'generation-times'])
501
+
502
+ # ----------------------------------------------------------------------------------------
503
+ def write_timepoint_sampled_sequences(glfos, final_events):
504
+ if utils.output_exists(args, get_simfn('igh'), outlabel='mutated simu', offset=4):
505
+ return None
506
+ if args.paired_loci:
507
+ h_evts, l_evts = list(zip(*final_events))
508
+ sample_tp_seqs(glfos, h_evts, l_evts=l_evts, ltmp=h_evts[0]['loci'][0])
509
+ else:
510
+ sample_tp_seqs(glfos, final_events)
511
+
512
+ write_simulation(glfos, final_events)
513
+
514
+ # ----------------------------------------------------------------------------------------
515
+ def simulate(igcr=None):
516
+
517
+ if igcr in [None, 0]:
518
+ rearrange()
519
+
520
+ glfos, naive_events = read_rearrangements()
521
+ if args.dry_run:
522
+ for ievent in range(args.n_sim_events):
523
+ _ = run_bcr_phylo('<NAIVE_SEQ>', evtdir(ievent, igcr=igcr), ievent, igcr=igcr)
524
+ return None, None
525
+ if args.restrict_to_single_naive_seq:
526
+ print(' --restrict-to-single-naive-seq: using same naive event for all mutation simulations')
527
+ assert len(naive_events) == 1
528
+ naive_events = [naive_events[0] for _ in range(args.n_sim_events)]
529
+ else:
530
+ assert len(naive_events) == args.n_sim_events
531
+
532
+ outdirs = [evtdir(i, igcr=igcr) for i in range(len(naive_events))]
533
+
534
+ start = time.time()
535
+ cmdfos = []
536
+ if args.n_procs > 1:
537
+ print(' starting %d events%s' % (len(naive_events), '' if args.n_procs==1 else ' with N max simultaneous procs %d'%args.n_procs))
538
+ uid_str_len = args.min_ustr_len # UPDATE: no longer need to increase this (it used to add int(math.log(len(naive_events), 7))) since duplicate uids are handled when parsing output; if the final sample is going to contain many trees, longer uids mean fewer collisions/duplicates (note that this starts getting pretty slow if it's bigger than 7 or so)
539
+ for ievent, outdir in enumerate(outdirs):
540
+ if args.n_sim_events > 1 and args.n_procs == 1:
541
+ print(' %s %d' % (utils.color('blue', 'ievent'), ievent))
542
+ if args.paired_loci:
543
+ hnseq, lnseq = [l['naive_seq'] for l in naive_events[ievent]]
544
+ naive_seq = utils.pad_nuc_seq(hnseq) + lnseq
545
+ else:
546
+ naive_seq = naive_events[ievent]['naive_seq']
547
+ cfo = run_bcr_phylo(naive_seq, outdir, ievent, uid_str_len=uid_str_len, igcr=igcr) # if n_procs > 1, doesn't run, just returns cfo
548
+ if cfo is not None:
549
+ print(' %s %s' % (utils.color('red', 'run'), cfo['cmd_str']))
550
+ cmdfos.append(cfo)
551
+ if args.n_procs > 1 and len(cmdfos) > 0:
552
+ utils.run_cmds(cmdfos, shell=True, n_max_procs=args.n_procs, batch_system='slurm' if args.slurm else None, allow_failure=True, debug='print')
553
+ print(' bcr-phylo run time: %.1fs' % (time.time() - start))
554
+
555
+ if utils.output_exists(args, get_simfn('igh'), outlabel='mutated simu', offset=4): # i guess if it crashes during the plotting just below, this'll get confused
556
+ return None, None
557
+
558
+ start = time.time()
559
+ uid_info = {'all_uids' : set(), 'n_duplicate_uids' : 0} # stuff just for dealing with duplicate uids
560
+ mutated_events = []
561
+ for ievent, outdir in enumerate(outdirs):
562
+ mutated_events.append(parse_bcr_phylo_output(glfos, naive_events, outdir, ievent, uid_info))
563
+ if uid_info['n_duplicate_uids'] > 0:
564
+ print(' %s renamed %d duplicate uids from %d bcr-phylo events' % (utils.color('yellow', 'warning'), uid_info['n_duplicate_uids'], len(mutated_events)))
565
+ print(' parsing time: %.1fs' % (time.time() - start))
566
+
567
+ if igcr is None:
568
+ write_simulation(glfos, mutated_events, unsampled=args.tpsample)
569
+
570
+ if not args.only_csv_plots:
571
+ import python.lbplotting as lbplotting
572
+ for ievent, outdir in enumerate(outdirs):
573
+ if args.paired_loci:
574
+ lpair = [l['loci'][0] for l in mutated_events[ievent]]
575
+ evtlist = mutated_events[ievent]
576
+ else:
577
+ lpair = None
578
+ evtlist = [mutated_events[ievent]]
579
+ lbplotting.plot_bcr_phylo_simulation(outdir + '/plots', outdir, evtlist, args.extrastr, lbplotting.metric_for_target_distance_labels[args.metric_for_target_distance], lpair=lpair)
580
+ # utils.simplerun('cp -v %s/simu_collapsed_runstat_color_tree.svg %s/plots/' % (outdir, outdir))
581
+
582
+ return glfos, mutated_events
583
+
584
+ # ----------------------------------------------------------------------------------------
585
+ def simulseq_args():
586
+ cstr = ''
587
+ if args.restrict_to_single_naive_seq:
588
+ print(' note: using --all-seqs-simultaneous because --restrict-to-single-naive-seq was set')
589
+ cstr += ' --all-seqs-simultaneous'
590
+ if args.n_gc_rounds is None and not args.tpsample:
591
+ cstr += ' --is-simu'
592
+ if '--all-seqs-simultaneous' not in cstr:
593
+ cstr += ' --simultaneous-true-clonal-seqs'
594
+ elif args.n_sim_events == 1:
595
+ print(' %s not using --is-simu since --n-gc-rounds or --sequence-sample-time-fname are set, so e.g. plots won\'t use true info, and true tree won\'t be set' % utils.wrnstr())
596
+ if '--all-seqs-simultaneous' not in cstr:
597
+ cstr += ' --all-seqs-simultaneous'
598
+ else:
599
+ print(' %s not using any of --is-simu or --simultaneous-true-clonal-seqs since either --n-gc-rounds or --sequence-sample-time-fname are set with more than one event, so e.g. plots won\'t use true info, and true tree won\'t be set' % utils.wrnstr())
600
+ return cstr
601
+
602
+ # ----------------------------------------------------------------------------------------
603
+ def cache_parameters():
604
+ if utils.output_exists(args, ifname('params'), outlabel='parameters', offset=4):
605
+ return
606
+ cmd = './bin/partis cache-parameters --random-seed %d --no-indels' % args.seed # forbid indels because in the very rare cases when we call them, they're always wrong, and then they screw up the simultaneous true clonal seqs option
607
+ cmd += simulseq_args()
608
+ fstr = ' --paired-loci --paired-indir %s --paired-outdir %s' if args.paired_loci else ' --infname %s --parameter-dir %s'
609
+ cmd += fstr % (spath('mutated'), ipath('params'))
610
+ if args.all_inference_plots:
611
+ cmd += ' --plotdir %s' % ('paired-outdir' if args.paired_loci else ipath('plots'))
612
+ if args.meta_info_key_to_color is not None:
613
+ cmd += ' --meta-info-key-to-color %s' % args.meta_info_key_to_color
614
+ if args.inf_extra_args is not None:
615
+ cmd += ' %s' % args.inf_extra_args
616
+ if args.n_procs > 1:
617
+ cmd += ' --n-procs %d' % args.n_procs
618
+ if args.slurm:
619
+ cmd += ' --batch-system slurm'
620
+ if args.n_max_queries is not None:
621
+ cmd += ' --n-max-queries %d' % args.n_max_queries
622
+ utils.simplerun(cmd, debug=True, dryrun=args.dry_run)
623
+ sys.stdout.flush()
624
+
625
+ # ----------------------------------------------------------------------------------------
626
+ def partition():
627
+ if utils.output_exists(args, ifname('partition'), outlabel='partition', offset=4):
628
+ return
629
+ cmd = './bin/partis partition --random-seed %d' % args.seed
630
+ cmd += simulseq_args()
631
+ fstr = ' --paired-loci --paired-indir %s --paired-outdir %s' if args.paired_loci else (' --infname %%s --parameter-dir %s --outfname %%s' % ipath('params'))
632
+ cmd += fstr % (spath('mutated'), ipath('partition'))
633
+ # --write-additional-cluster-annotations 0:5 # I don't think there was really a good reason for having this
634
+ if not args.dont_get_tree_metrics:
635
+ cmd += ' --get-selection-metrics'
636
+ if args.tree_inference_method is not None:
637
+ cmd += ' --tree-inference-method %s' % args.tree_inference_method
638
+ if not args.dont_get_tree_metrics or args.all_inference_plots:
639
+ cmd += ' --plotdir %s' % ('paired-outdir' if args.paired_loci else ipath('plots'))
640
+ if not args.all_inference_plots:
641
+ cmd += ' --no-partition-plots'
642
+ if args.meta_info_key_to_color is not None:
643
+ cmd += ' --meta-info-key-to-color %s' % args.meta_info_key_to_color
644
+ if args.inf_extra_args is not None:
645
+ cmd += ' %s' % args.inf_extra_args
646
+ if args.lb_tau is not None:
647
+ cmd += ' --lb-tau %f' % args.lb_tau
648
+ if args.n_procs > 1:
649
+ cmd += ' --n-procs %d' % args.n_procs
650
+ if args.slurm:
651
+ cmd += ' --batch-system slurm'
652
+ if args.n_max_queries is not None:
653
+ cmd += ' --n-max-queries %d' % args.n_max_queries
654
+ if args.extra_smetric_plots is not None:
655
+ cmd += ' --selection-metric-plot-cfg %s' % ':'.join(treeutils.default_plot_cfg + args.extra_smetric_plots + ['distr'])
656
+ utils.simplerun(cmd, debug=True, dryrun=args.dry_run)
657
+ # cmd = './bin/partis get-selection-metrics --outfname %s/partition.yaml' % infdir()
658
+ # utils.simplerun(cmd, debug=True) #, dryrun=True)
659
+ sys.stdout.flush()
660
+
661
+ # ----------------------------------------------------------------------------------------
662
+ all_actions = ('simu', 'cache-parameters', 'partition')
663
+ class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
664
+ pass
665
+ formatter_class = MultiplyInheritedFormatter
666
+ parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter)
667
+ parser.add_argument('--actions', default=':'.join(all_actions), help='which actions to run')
668
+ parser.add_argument('--base-outdir', default='%s/partis/bcr-phylo/test' % os.getenv('fs', default=os.getenv('HOME')), help='base output dir')
669
+ parser.add_argument('--debug', type=int, default=0, choices=[0, 1, 2])
670
+ parser.add_argument('--run-help', action='store_true')
671
+ parser.add_argument('--overwrite', action='store_true')
672
+ parser.add_argument('--only-csv-plots', action='store_true')
673
+ parser.add_argument('--dont-get-tree-metrics', action='store_true', help='Partition without getting tree metrics, presumably because you want to run them yourself later')
674
+ parser.add_argument('--tree-inference-method')
675
+ parser.add_argument('--seed', type=int, default=1, help='random seed (note that bcr-phylo doesn\'t seem to support setting its random seed)')
676
+ parser.add_argument('--n-procs', type=int, default=1)
677
+ parser.add_argument('--extrastr', default='simu', help='prefix for bcr-phylo output file names. Not actually required by bcr-phylo, but the output file names start with \'_\' (and look odd) if it\'s empty')
678
+ parser.add_argument('--n-sim-seqs-per-generation', default='100', help='Number of sequences to sample at each time in --obs-times.')
679
+ parser.add_argument('--n-sim-events', type=int, default=1, help='number of simulated rearrangement events')
680
+ parser.add_argument('--n-max-queries', type=int, help='during parameter caching and partitioning, stop after reading this many queries from simulation file (useful for dtr training samples where we need massive samples to actually train the dtr, but for testing various metrics need far smaller samples).')
681
+ parser.add_argument('--obs-times', default='100:120', help='Times (generations) at which to select sequences for observation. Note that this is the time that the sequences existed in/exited the gc, not necessarily the time at which we "sequenced" them. If --n-gc-rounds is set, this must be a colon-separated list of comma-separated lists (see help under that arg).')
682
+ parser.add_argument('--sequence-sample-time-fname', help='Times at which we "sample" for sequencing, i.e. at which we draw blood (as opposed to --obs-times, which is the generation time at which a cell leaves the gc). Specified in a yaml file as a list of key : val pairs, with key the timepoint label and values a dict with keys \'n\' (total number of sequences) and \'times\' (dict keyed by gc round time (if --n-gc-rounds is set, otherwise just a list) of generation times from which to sample those \'n\' sequences uniformly at random). If set, a new output file/dir is created by inserting \'timepoint-sampled\' at end of regular output file. See example in data/sample-seqs.yaml.')
683
+ parser.add_argument('--n-naive-seq-copies', type=int, help='see bcr-phylo docs')
684
+ parser.add_argument('--n-gc-rounds', type=int, help='number of rounds of gc entry, i.e. if set, upon gc completion we choose --n-reentry-seqs sampled seqs with which to seed a new (otherwise identical) gc reaction. Results for each gc round N are written to subdirs round-N/ for each event, then all sampled sequences from all reactions are collected into the normal output file locations, with input meta info key \'gc-rounds\' specifying their gc round. If this arg is set, then --obs-times must be a colon-separated list (of length --n-gc-rounds) of comma-separated lists, where each sample time is relative to the *start* of that round.')
685
+ parser.add_argument('--n-reentry-seqs', type=int, help='number of sampled seqs from previous round (chosen randomly) to inject into the next gc round (if not set, we take all of them).')
686
+ parser.add_argument('--carry-cap', type=int, default=1000, help='carrying capacity of germinal center')
687
+ parser.add_argument('--target-distance', type=int, default=15, help='Desired distance (number of non-synonymous mutations) between the naive sequence and the target sequences.')
688
+ parser.add_argument('--tdist-scale', type=int, help='see bcr-phylo docs')
689
+ parser.add_argument('--tdist-weights', help='see bcr-phylo docs')
690
+ parser.add_argument('--metric-for-target-distance', default='aa', choices=['aa', 'nuc', 'aa-sim-ascii', 'aa-sim-blosum'], help='see bcr-phylo docs')
691
+ parser.add_argument('--target-count', type=int, default=1, help='Number of target sequences to generate.')
692
+ parser.add_argument('--n-target-clusters', type=int, help='number of cluster into which to divide the --target-count target seqs (see bcr-phylo docs)')
693
+ parser.add_argument('--min-target-distance', type=int, help='see bcr-phylo docs')
694
+ parser.add_argument('--min-effective-kd', type=float, help='see bcr-phylo docs')
695
+ parser.add_argument('--affinity-measurement-error', type=float, help='Fractional measurement error on affinity: if set, replace \'affinities\' in final partis line with new values smeared with a normal distribution with this fractional width, i.e. <a> is replaced with a value drawn from a normal distribution with mean <a> and width <f>*<a> for this fraction <f>.')
696
+ parser.add_argument('--base-mutation-rate', type=float, default=0.365, help='see bcr-phylo docs')
697
+ parser.add_argument('--selection-strength', type=float, default=1., help='see bcr-phylo docs')
698
+ parser.add_argument('--context-depend', type=int, default=0, choices=[0, 1]) # i wish this could be a boolean, but having it int makes it much much easier to interface with the scan infrastructure in cf-tree-metrics.py
699
+ parser.add_argument('--aa-paratope-positions', help='see bcr-phylo docs')
700
+ parser.add_argument('--aa-struct-positions', help='see bcr-phylo docs')
701
+ parser.add_argument('--dont-mutate-struct-positions', action='store_true', help='see bcr-phylo docs')
702
+ parser.add_argument('--skip-stops', action='store_true', help='see bcr-phylo docs')
703
+ parser.add_argument('--allow-stops', action='store_true', help='see bcr-phylo docs')
704
+ parser.add_argument('--no-selection', action='store_true', help='see bcr-phylo docs')
705
+ parser.add_argument('--multifurcating-tree', action='store_true', help='see bcr-phylo docs')
706
+ parser.add_argument('--restrict-available-genes', action='store_true', help='restrict v and j gene choice to one each (so context dependence is easier to plot)')
707
+ parser.add_argument('--restrict-to-single-naive-seq', action='store_true', help='restrict all events to use the same naive sequence')
708
+ parser.add_argument('--lb-tau', type=float, help='')
709
+ parser.add_argument('--dont-observe-common-ancestors', action='store_true')
710
+ parser.add_argument('--leaf-sampling-scheme', help='see bcr-phylo help')
711
+ parser.add_argument('--parameter-variances', help='if set, parameters vary from family to family in one of two ways 1) the specified parameters are drawn from a uniform distribution of the specified width (with mean from the regular argument) for each family. Format example: n-sim-seqs-per-generation,10:carry-cap,150 would give --n-sim-seqs-per-generation +/-5 and --carry-cap +/-75, or 2) parameters for each family are chosen from a \'..\'-separated list, e.g. obs-times,75..100..150')
712
+ parser.add_argument('--slurm', action='store_true')
713
+ parser.add_argument('--paired-loci', action='store_true')
714
+ parser.add_argument('--parameter-plots', action='store_true', help='DEPRECATED')
715
+ parser.add_argument('--all-inference-plots', action='store_true')
716
+ parser.add_argument('--meta-info-key-to-color')
717
+ parser.add_argument('--single-light-locus', help='set to igk or igl if you want only that one; otherwise each event is chosen at random (see partis help)')
718
+ parser.add_argument('--rearr-extra-args', help='')
719
+ parser.add_argument('--inf-extra-args', help='')
720
+ parser.add_argument('--dry-run', action='store_true')
721
+ parser.add_argument('--mutated-outpath', action='store_true', help='write final (mutated) output file[s] to --base-outdir, rather than the default of burying them in subdirs with intermediate files')
722
+ parser.add_argument('--extra-smetric-plots', default=':'.join(treeutils.default_plot_cfg))
723
+ parser.add_argument('--min-ustr-len', type=int, default=5, help='min length of hashed uid strs (longer makes collisions less likely, but it\'s hard to avoid them entirely since independent bcr-phylo procs choose the uids for each family)')
724
+
725
+ args = parser.parse_args()
726
+
727
+ if args.parameter_plots:
728
+ print(' %s transferring deprecated arg --parameter-plots to --all-inference-plots' % utils.wrnstr())
729
+ args.all_inference_plots = True
730
+ delattr(args, 'parameter_plots')
731
+ if args.seed is not None:
732
+ numpy.random.seed(args.seed)
733
+ args.obs_times = utils.get_arg_list(args.obs_times, intify=True, list_of_lists=args.n_gc_rounds is not None)
734
+ args.n_sim_seqs_per_generation = utils.get_arg_list(args.n_sim_seqs_per_generation, intify=True, list_of_lists=args.n_gc_rounds is not None)
735
+ args.actions = utils.get_arg_list(args.actions, choices=all_actions)
736
+ args.parameter_variances = utils.get_arg_list(args.parameter_variances, key_val_pairs=True, choices=['selection-strength', 'obs-times', 'n-sim-seqs-per-generation', 'carry-cap', 'metric-for-target-distance']) # if you add more, make sure the bounds enforcement and conversion stuff in get_vpar_val() are still ok
737
+ args.extra_smetric_plots = utils.get_arg_list(args.extra_smetric_plots, choices=treeutils.all_plot_cfg)
738
+ if args.rearr_extra_args is not None:
739
+ args.rearr_extra_args = args.rearr_extra_args.replace('@', ' ') # ick this sucks
740
+ if args.inf_extra_args is not None:
741
+ args.inf_extra_args = args.inf_extra_args.replace('@', ' ') # ick this sucks
742
+ if args.affinity_measurement_error is not None:
743
+ assert args.affinity_measurement_error >= 0
744
+ if args.affinity_measurement_error > 1:
745
+ print(' note: --affinity-measurement-error %.2f is greater than 1 -- this is fine as long as it\'s on purpose, but will result in smearing by a normal with width larger than each affinity value (and probably result in some negative values).' % args.affinity_measurement_error)
746
+ if args.n_gc_rounds is not None:
747
+ assert len(args.obs_times) == args.n_gc_rounds
748
+ for otlist in args.obs_times:
749
+ if otlist != sorted(otlist): # various things assume it's sorted
750
+ raise Exception('obs times within each gc round must be sorted')
751
+ otstrs = ['%s' % ' '.join(str(t) for t in otlist) for otlist in args.obs_times]
752
+ def fgt(i, t): return t + sum(args.obs_times[j][-1] for j in range(i))
753
+ fgstrs = ['%s' % ' '.join(str(fgt(i, t)) for t in otlist) for i, otlist in enumerate(args.obs_times)]
754
+ print(' --obs-times at each of %d gc rounds: %s (final generation times: %s)' % (args.n_gc_rounds, ', '.join(otstrs), ', '.join(fgstrs)))
755
+ if len(args.n_sim_seqs_per_generation) != args.n_gc_rounds and len(args.n_sim_seqs_per_generation) == 1:
756
+ args.n_sim_seqs_per_generation = [args.n_sim_seqs_per_generation[0] for _ in range(args.n_gc_rounds)]
757
+ assert len(args.n_sim_seqs_per_generation) == args.n_gc_rounds
758
+ if args.parameter_variances is not None: # don't feel like implementing this atm
759
+ if any(a in args.parameter_variances for a in ['obs-times', 'n-sim-seqs-per-generation']):
760
+ raise Exception('haven\'t implemented parameter variances for --obs-times/--n-sim-seqs-per-generation with multiple gc rounds')
761
+ setattr(args, 'sequence_sample_times', None)
762
+ setattr(args, 'tpsample', False)
763
+ if args.sequence_sample_time_fname is not None:
764
+ print(' reading timepoint sample times from %s' % args.sequence_sample_time_fname)
765
+ with open(args.sequence_sample_time_fname) as sfile:
766
+ yamlfo = yaml.load(sfile, Loader=yaml.CLoader)
767
+ if args.n_gc_rounds is not None: # have to translate the "local" gc round times to final times, then also concatenate them into ones list for all gc rounds
768
+ sum_time = 0
769
+ for igcr in range(args.n_gc_rounds):
770
+ for tpfo in yamlfo:
771
+ if igcr not in tpfo['times']:
772
+ continue
773
+ tpfo['times'][igcr] = [t + sum_time for t in tpfo['times'][igcr]]
774
+ sum_time += args.obs_times[igcr][-1]
775
+ for tpfo in yamlfo:
776
+ tpfo['times'] = sorted(t for tlist in tpfo['times'].values() for t in tlist) # should already be sorted, but whatever
777
+ args.sequence_sample_times = yamlfo
778
+ args.tpsample = True # just a shorthand
779
+ delattr(args, 'sequence_sample_time_fname')
780
+
781
+ assert args.extrastr == 'simu' # I think at this point this actually can't be changed without changing some other things
782
+
783
+ # ----------------------------------------------------------------------------------------
784
+ if 'simu' in args.actions:
785
+ if args.n_gc_rounds is None:
786
+ glfos, final_events = simulate()
787
+ else:
788
+ mevt_lists = [] # list (for each gc round) of [sub]lists, where each sublist is the sampled seqs from that round for each event
789
+ for igcr in range(args.n_gc_rounds):
790
+ glfos, mevts = simulate(igcr=igcr)
791
+ mevt_lists.append(mevts)
792
+ if not args.dry_run:
793
+ final_events = combine_gc_rounds(glfos, mevt_lists)
794
+ if not args.dry_run and args.tpsample:
795
+ write_timepoint_sampled_sequences(glfos, final_events)
796
+ if 'cache-parameters' in args.actions:
797
+ cache_parameters()
798
+ if 'partition' in args.actions:
799
+ partition()
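
For reference, here is a minimal invocation sketch for the newly added bin/bcr-phylo-run.py, based on the default action list and the argparse options shown in the diff above; the output directory and numeric values are illustrative only, not taken from the package:

    # run the default action list (simu:cache-parameters:partition) on a small illustrative sample
    ./bin/bcr-phylo-run.py --actions simu:cache-parameters:partition \
        --base-outdir /tmp/bcr-phylo-test \
        --n-sim-events 2 --obs-times 100:120 --n-sim-seqs-per-generation 50 \
        --carry-cap 1000 --n-procs 4

Per-family parameter variation can be layered on with, e.g., --parameter-variances n-sim-seqs-per-generation,10:carry-cap,150 (the format example given in that option's help text above).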