partis_bcr-1.0.0-py3-none-any.whl → partis_bcr-1.0.2-py3-none-any.whl
This diff shows the content changes between publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- bin/FastTree +0 -0
- bin/add-chimeras.py +59 -0
- bin/add-seqs-to-outputs.py +81 -0
- bin/bcr-phylo-run.py +799 -0
- bin/build.sh +24 -0
- bin/cf-alleles.py +97 -0
- bin/cf-germlines.py +57 -0
- bin/cf-linearham.py +199 -0
- bin/chimera-plot.py +76 -0
- bin/choose-partially-paired.py +143 -0
- bin/circle-plots.py +30 -0
- bin/compare-plotdirs.py +298 -0
- bin/diff-parameters.py +133 -0
- bin/docker-hub-push.sh +6 -0
- bin/extract-pairing-info.py +55 -0
- bin/gcdyn-simu-run.py +223 -0
- bin/gctree-run.py +244 -0
- bin/get-naive-probabilities.py +126 -0
- bin/iqtree-1.6.12 +0 -0
- bin/lonr.r +1020 -0
- bin/makeHtml +52 -0
- bin/mds-run.py +46 -0
- bin/parse-output.py +277 -0
- bin/partis +1869 -0
- bin/partis-pip +116 -0
- bin/partis.py +1869 -0
- bin/plot-gl-set-trees.py +519 -0
- bin/plot-hmms.py +151 -0
- bin/plot-lb-tree.py +427 -0
- bin/raxml-ng +0 -0
- bin/read-bcr-phylo-trees.py +38 -0
- bin/read-gctree-output.py +166 -0
- bin/run-chimeras.sh +64 -0
- bin/run-dtr-scan.sh +25 -0
- bin/run-paired-loci.sh +100 -0
- bin/run-tree-metrics.sh +88 -0
- bin/smetric-run.py +62 -0
- bin/split-loci.py +317 -0
- bin/swarm-2.1.13-linux-x86_64 +0 -0
- bin/test-germline-inference.py +425 -0
- bin/tree-perf-run.py +194 -0
- bin/vsearch-2.4.3-linux-x86_64 +0 -0
- bin/vsearch-2.4.3-macos-x86_64 +0 -0
- bin/xvfb-run +194 -0
- partis_bcr-1.0.2.data/scripts/cf-alleles.py +97 -0
- partis_bcr-1.0.2.data/scripts/cf-germlines.py +57 -0
- partis_bcr-1.0.2.data/scripts/extract-pairing-info.py +55 -0
- partis_bcr-1.0.2.data/scripts/gctree-run.py +244 -0
- partis_bcr-1.0.2.data/scripts/parse-output.py +277 -0
- partis_bcr-1.0.2.data/scripts/split-loci.py +317 -0
- partis_bcr-1.0.2.data/scripts/test.py +1005 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/METADATA +1 -1
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/RECORD +101 -51
- partis_bcr-1.0.2.dist-info/top_level.txt +1 -0
- {partis → python}/glutils.py +1 -1
- python/main.py +30 -0
- {partis → python}/plotting.py +10 -1
- {partis → python}/treeutils.py +18 -16
- {partis → python}/utils.py +14 -7
- packages/ham/bcrham +0 -0
- partis/main.py +0 -59
- partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/WHEEL +0 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/entry_points.txt +0 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/licenses/COPYING +0 -0
- {partis → python}/__init__.py +0 -0
- {partis → python}/alleleclusterer.py +0 -0
- {partis → python}/allelefinder.py +0 -0
- {partis → python}/alleleremover.py +0 -0
- {partis → python}/annotationclustering.py +0 -0
- {partis → python}/baseutils.py +0 -0
- {partis → python}/cache/__init__.py +0 -0
- {partis → python}/cache/cached_uncertainties.py +0 -0
- {partis → python}/clusterpath.py +0 -0
- {partis → python}/coar.py +0 -0
- {partis → python}/corrcounter.py +0 -0
- {partis → python}/datautils.py +0 -0
- {partis → python}/event.py +0 -0
- {partis → python}/fraction_uncertainty.py +0 -0
- {partis → python}/gex.py +0 -0
- {partis → python}/glomerator.py +0 -0
- {partis → python}/hist.py +0 -0
- {partis → python}/hmmwriter.py +0 -0
- {partis → python}/hutils.py +0 -0
- {partis → python}/indelutils.py +0 -0
- {partis → python}/lbplotting.py +0 -0
- {partis → python}/mds.py +0 -0
- {partis → python}/mutefreqer.py +0 -0
- {partis → python}/paircluster.py +0 -0
- {partis → python}/parametercounter.py +0 -0
- {partis → python}/paramutils.py +0 -0
- {partis → python}/partitiondriver.py +0 -0
- {partis → python}/partitionplotter.py +0 -0
- {partis → python}/performanceplotter.py +0 -0
- {partis → python}/plotconfig.py +0 -0
- {partis → python}/processargs.py +0 -0
- {partis → python}/prutils.py +0 -0
- {partis → python}/recombinator.py +0 -0
- {partis → python}/scanplot.py +0 -0
- {partis → python}/seqfileopener.py +0 -0
- {partis → python}/treegenerator.py +0 -0
- {partis → python}/viterbicluster.py +0 -0
- {partis → python}/vrc01.py +0 -0
- {partis → python}/waterer.py +0 -0
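
The file moves above are the substance of this release: the importable top-level package is renamed from partis to python (compare the two top_level.txt entries), a new python/main.py is added, and several bin/ scripts are additionally installed via partis_bcr-1.0.2.data/scripts/. As a minimal illustrative sketch (not part of the package itself), downstream imports against 1.0.2 look like this, with module names taken from the moved files above:

    import python.utils as utils                # installed as partis/utils.py in 1.0.0
    import python.glutils as glutils            # renamed from partis/glutils.py
    from python.clusterpath import ClusterPath  # renamed from partis/clusterpath.py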
bin/split-loci.py
ADDED
@@ -0,0 +1,317 @@
#!/usr/bin/env python3
from __future__ import absolute_import, division, unicode_literals
from __future__ import print_function
import json
import csv
import os
import sys
import argparse
import operator
import colored_traceback.always
import collections
import copy
from collections import defaultdict
import random
import numpy
from io import open
import time
from pathlib import Path

# if you move this script, you'll need to change this method of getting the imports
partis_dir = str(Path(__file__).parent.parent)
sys.path.insert(1, partis_dir)  # + '/python')

import python.utils as utils
import python.paircluster as paircluster
import python.glutils as glutils
from python.clusterpath import ClusterPath
import python.seqfileopener as seqfileopener

# ----------------------------------------------------------------------------------------
dstr = """
Uses vsearch (or the \'locus\' key in --input-metafname) to split the sequences in <fname> according to their loci, writing each locus to its own fasta file <locus>.fa.
If \'paired-uids\' are available in --input-metafname, also splits the heavy sequences according to the light chain locus with which they\'re paired, resulting in subdirectories e.g. igh+igk/ and igh+igl/.
Use --reverse-negative-strands to check both senses for each input sequence.
"""
parser = argparse.ArgumentParser(description=dstr,
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)  # why tf isn't this printing the defaults?
parser.add_argument('fname', help='fasta input file')
parser.add_argument('--outdir', help='directory to which to write output files (if not set, output is written to directory of <fname>)')
parser.add_argument('--reverse-negative-strands', action='store_true', help='align every sequence both forwards and revcomp\'d, then for each sequence keep the sense with better alignment.')
parser.add_argument('--species', default='human', choices=('human', 'macaque', 'mouse'), help='Which species?')
parser.add_argument('--germline-dir', default=utils.get_partis_dir() + '/data/germlines', help='doesn\'t need to be the germlines corresponding to this sample since it\'s just so it can figure out which is igh vs igk vs igl, so the default is probably fine')
parser.add_argument('--workdir', default=utils.choose_random_subdir('/tmp/%s/partis' % os.getenv('USER', default='partis-work')), help='working directory for vsearch')
parser.add_argument('--vsearch-binary', help='Path to vsearch binary (vsearch binaries for linux and darwin are included in partis/bin/, so leaving this unset should work, but for other systems you need to get your own)')
parser.add_argument('--vsearch-threshold', type=float, default=0.4, help='default identity threshold for vsearch')
parser.add_argument('--debug', type=int, default=1)
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('--random-seed', type=int, default=1)
parser.add_argument('--guess-pairing-info', action='store_true', help=utils.did_help['guess'])
parser.add_argument('--droplet-id-separators', help=utils.did_help['seps'])
parser.add_argument('--droplet-id-indices', help=utils.did_help['indices'])
parser.add_argument('--fasta-info-index', type=int, help='zero-based index in fasta info/meta string of sequence name/uid (e.g. if name line is \'>stuff more-stuff NAME extra-stuff\' the index should be 2)')
parser.add_argument('--allowed-contigs-per-droplet', help='if set, discard sequences from droplets that contain any number of contigs not in this colon-separated list')
parser.add_argument('--allowed-meta-keys-values', help='if set, require that the kept contigs from --allowed-contigs-per-droplet have these key:value pairs (colon-separated list of comma-separated key:value pairs)')
parser.add_argument('--input-metafname', help='yaml file with meta information keyed by sequence id. See same argument in main partis help, and https://github.com/psathyrella/partis/blob/master/docs/subcommands.md#input-meta-info for an example.')
parser.add_argument('--for-testing-n-max-queries', type=int, default=-1, help='only for testing, applied when reading initial fasta file, just in case it\'s huge and you want to run quickly without having to read the whole file')
parser.add_argument('--n-max-queries', type=int, default=-1, help='see partis help (although here it applies to droplets, not individual seqs)')
parser.add_argument('--n-random-queries', type=int, help='see partis help (although here it applies to droplets, not individual seqs)')
parser.add_argument('--ig-or-tr', default='ig', choices=list(utils.locus_pairs.keys()), help='antibodies or TCRs?')

# ----------------------------------------------------------------------------------------
def use_rev_comp(pline, rline):  # decide whether positive sense <pline> or negative sense <rline> has better alignment
    assert pline['unique_ids'][0] == rline['unique_ids'][0]
    if rline.get('invalid', False):
        return False
    elif pline.get('invalid', False):
        return True
    elif rline['score'] > pline['score']:
        return True
    else:
        return False

# ----------------------------------------------------------------------------------------
def run_vsearch(seqfos):  # run vsearch to see if you can get a match for each locus for every sequence
    print('  running vsearch on %d sequences:' % len(seqfos))
    n_rev_compd, n_total = 0, 0
    for locus in utils.sub_loci(args.ig_or_tr):
        lglfo = glutils.read_glfo(args.germline_dir, locus)
        annotations = utils.run_vsearch_with_duplicate_uids('search', seqfos, args.workdir + '/vsearch', args.vsearch_threshold, glfo=lglfo, print_time=True, vsearch_binary=args.vsearch_binary, get_annotations=True, expect_failure=True, extra_str='   %s fwd:'%utils.color('blue', locus) if args.reverse_negative_strands else '   %s: '%locus)  #, debug=args.debug>1)
        assert len(annotations) == len(seqfos)
        if args.reverse_negative_strands:  # it might be nicer to use vsearch options to run on both senses at once, but otoh this might be nicer
            revnotations = utils.run_vsearch_with_duplicate_uids('search', revfos, args.workdir + '/vsearch', args.vsearch_threshold, glfo=lglfo, print_time=True, vsearch_binary=args.vsearch_binary, get_annotations=True, expect_failure=True, extra_str='       rev:')  #, debug=args.debug>1)
            assert len(revnotations) == len(seqfos)
        for il, (sfo, line) in enumerate(zip(seqfos, annotations)):
            assert sfo['name'] == line['unique_ids'][0]  # note that they're not full annotations, they just have a couple keys
            if args.reverse_negative_strands and use_rev_comp(line, revnotations[il]):
                sfo['seq'] = revfos[il]['seq']
                line = revnotations[il]
                n_rev_compd += 1
            sfo[locus] = line  # add info for each locus to the input seqfos
            n_total += 1
    if args.reverse_negative_strands:
        print('    used rev comp for %d/%d locus results (for %d seqs)' % (n_rev_compd, n_total, len(seqfos)))

# ----------------------------------------------------------------------------------------
def write_locus_file(locus, ofos, lpair=None, extra_str='  ', totstr=''):
    ofn = paircluster.paired_fn(args.outdir, locus=locus, lpair=lpair)
    if utils.output_exists(args, ofn, leave_zero_len=len(ofos)==0, offset=4):  # NOTE not really sure this does anything (or if i want it) now that I'm cleaning/looking for the whole dir at the start of this script
        return
    if not os.path.exists(os.path.dirname(ofn)):
        os.makedirs(os.path.dirname(ofn))
    if len(ofos) == 0:
        # print '%s%s: nothing to write' % (extra_str, locus)
        open(ofn, 'w').close()
        return
    print('%s%s: %d%s to %s/%s' % (extra_str, locus, len(ofos), totstr, os.path.basename(os.path.dirname(ofn)), os.path.basename(ofn)))
    with open(ofn, 'w') as lfile:
        for sfo in ofos:
            lfile.write('>%s\n%s\n' % (sfo['name'], sfo['seq']))

# ----------------------------------------------------------------------------------------
def read_meta_info(seqfos):  # read all input meta info, and add pairing info (if present) to <paired_uids>
    dummy_annotation_list = [{'unique_ids' : [sfo['name']]} for sfo in seqfos]
    seqfileopener.read_input_metafo([args.input_metafname], dummy_annotation_list)  # , required_keys=['paired-uids'])
    for line in dummy_annotation_list:
        uid = utils.get_single_entry(line['unique_ids'])
        if 'loci' in line:
            meta_loci[uid] = line['loci'][0]
        if 'paired-uids' in line:
            paired_uids[uid] = line['paired-uids'][0]
    if len(paired_uids) > 0:
        print('    read pairing info for %d seqs from input meta file' % len(paired_uids))
        if len(paired_uids) < len(seqfos):
            print('      %s only read pairing info for %d/%d seqfos' % (utils.color('yellow', 'warning'), len(paired_uids), len(seqfos)))
    if len(meta_loci) > 0:
        print('    read loci for %d sequences from input meta file (so not running vsearch)' % len(meta_loci))
        if len(meta_loci) < len(seqfos):
            print('      %s only read locus info for %d/%d seqfos' % (utils.color('yellow', 'warning'), len(meta_loci), len(seqfos)))
    input_metafos = utils.read_json_yaml(args.input_metafname)
    for uid in input_metafos:  # we want to copy over any additional meta info (not paired uids or loci) to the output meta info file (since if we're guessing pair info, the uid names will change, so the original one is no good)
        additional_mfo = {k : v for k, v in input_metafos[uid].items() if k not in ['loci', 'paired-uids']}
        if len(additional_mfo) > 0:
            input_metafos[uid] = additional_mfo
    return input_metafos

# ----------------------------------------------------------------------------------------
def print_pairing_info(outfos, paired_uids):
    loci_by_uid = {sfo['name'] : l for l in outfos for sfo in outfos[l]}  # locus of each sequence, just for counting below
    print_cutoff = 0.01
    n_missing = 0
    print('        count  frac   paired with')
    for locus in utils.sub_loci(args.ig_or_tr):
        plocicounts = {}
        for sfo in outfos[locus]:
            if sfo['name'] not in paired_uids:
                n_missing += 1
                continue
            plstr = ' '.join(utils.locstr(l) for l in sorted([loci_by_uid.get(pid, '?') for pid in paired_uids[sfo['name']]]))
            if plstr not in plocicounts:
                plocicounts[plstr] = 0
            plocicounts[plstr] += 1
        total = sum(plocicounts.values())
        n_skipped = 0
        for ipl, (plstr, counts) in enumerate(sorted(list(plocicounts.items()), key=operator.itemgetter(1), reverse=True)):
            if counts / float(total) < print_cutoff:
                n_skipped += counts
                continue
            print('    %s  %6d  %5.2f   %s' % (utils.locstr(locus) if ipl==0 else ' ', counts, counts / float(total), plstr))
        if n_skipped > 0:
            print('      +%d counts skipped with <%.3f each' % (n_skipped, print_cutoff))
    if n_missing > 0:
        print('  %s %d uids missing from paired uids' % (utils.wrnstr(), n_missing))

# ----------------------------------------------------------------------------------------
args = parser.parse_args()
random.seed(args.random_seed)
numpy.random.seed(args.random_seed)
if os.path.dirname(args.fname) == '':
    args.fname = '%s/%s' % (os.getcwd(), args.fname)
if args.outdir is None:
    args.outdir = utils.getprefix(args.fname)
args.droplet_id_indices = utils.get_arg_list(args.droplet_id_indices, intify=True)
args.allowed_contigs_per_droplet = utils.get_arg_list(args.allowed_contigs_per_droplet, intify=True)
args.allowed_meta_keys_values = utils.get_arg_list(args.allowed_meta_keys_values, key_val_pairs=True)
args.input_metafname = utils.fpath(args.input_metafname)

if any(os.path.exists(ofn) for ofn in paircluster.paired_dir_fnames(args.outdir)):
    if args.overwrite:
        paircluster.clean_paired_dir(args.outdir)
    else:
        print('  split-loci.py output exists and --overwrite was not set, so not doing anything: %s' % args.outdir)
        sys.exit(0)

seqfos = utils.read_fastx(args.fname, n_max_queries=args.for_testing_n_max_queries)
if args.n_max_queries != -1 or args.n_random_queries is not None:
    seqfos = utils.subset_paired_queries(seqfos, args.droplet_id_separators, args.droplet_id_indices, n_max_queries=args.n_max_queries, n_random_queries=args.n_random_queries)
if args.fasta_info_index is not None:
    for sfo in seqfos:
        sfo['name'] = sfo['infostrs'][args.fasta_info_index]
if args.reverse_negative_strands:
    revfos = [{'name' : s['name'], 'seq' : utils.revcomp(s['seq'])} for s in seqfos]  # NOTE this is not on an equal footing with <seqfos>, since we add all the vsearch info to <seqfos>, then use it to decide on locus, and then to write output

if os.path.exists(args.germline_dir + '/' + args.species):  # ick that is hacky
    args.germline_dir += '/' + args.species

# read input meta file and/or run vsearch
paired_uids, meta_loci, input_metafos = {}, {}, {}
if args.input_metafname is not None:
    input_metafos = read_meta_info(seqfos)
if len(meta_loci) == 0:  # default: no input locus info
    run_vsearch(seqfos)

# then, for each sequence, choose the locus with the best-scoring match (in practice i doubt you ever really get multiple loci with matches)
outfos = collections.OrderedDict(((l, []) for l in utils.sub_loci(args.ig_or_tr)))
failed_seqs = []
if args.debug > 1:
    print('    printing scores for locus determination:')
n_skipped = 0
for sfo in seqfos:
    if len(meta_loci) == 0:  # default: use vsearch match scores
        lscores = {l : sfo[l]['score'] if 'invalid' not in sfo[l] else 0 for l in utils.sub_loci(args.ig_or_tr)}
        locus, max_score = sorted(list(lscores.items()), key=operator.itemgetter(1), reverse=True)[0]
        if max_score == 0:
            failed_seqs.append(sfo)
            continue
    else:  # if we were passed input locus info
        locus = meta_loci[sfo['name']]
    outfos[locus].append(sfo)
    if args.debug > 1:
        def lpstr(spair):
            l, s = spair
            return '%s %s' % (utils.locstr(l) if l==locus else l.replace('ig', ''), utils.color('red' if s!=0 else None, '%3d'%s))
        if list(lscores.values()).count(0) == 2:
            n_skipped += 1
        else:
            print('     %s  %s' % (' '.join(lpstr(s) for s in sorted(list(lscores.items()), key=operator.itemgetter(1), reverse=True)), sfo['name']))
if args.debug > 1 and n_skipped > 0:
    print('      skipped %d seqs with non-zero scores from only one locus' % n_skipped)

print('totals: %s%s' % (' '.join(('%s %d'%(l, len(sfos))) for l, sfos in outfos.items()), '' if len(failed_seqs) == 0 else ' (%s: %d)'%(utils.color('yellow', 'failed'), len(failed_seqs))))
assert sum(len(ofo) for ofo in outfos.values()) + len(failed_seqs) == len(seqfos)

if args.guess_pairing_info:
    if len(paired_uids) > 0:
        raise Exception('can\'t/shouldn\'t guess pairing info if we already have it from elsewhere')
    for locus in outfos:
        for ofo in outfos[locus]:
            new_name = ofo['name']
            if '-' not in new_name or ofo['name'].split('-')[-1] != locus:  # add locus (e.g. '-igh') to name, unless it's already there
                new_name = ofo['name'] + '-' + locus
            if ofo['name'] in input_metafos:
                input_metafos[new_name] = input_metafos[ofo['name']]
                del input_metafos[ofo['name']]
            ofo['name'] = new_name
    guessed_metafos = utils.extract_pairing_info(seqfos, droplet_id_separators=args.droplet_id_separators, droplet_id_indices=args.droplet_id_indices, debug=max(1, args.debug))
    for uid in set(guessed_metafos) & set(input_metafos):
        guessed_metafos[uid].update(input_metafos[uid])
    for uid, mfo in guessed_metafos.items():
        paired_uids[uid] = mfo['paired-uids']

removed_uids = set()
if args.allowed_contigs_per_droplet is not None:
    new_outfos = collections.OrderedDict(((l, []) for l in utils.sub_loci(args.ig_or_tr)))
    for locus in outfos:
        n_ctg_removed, n_meta_removed, n_meta_added = defaultdict(int), 0, 0
        for ofo in outfos[locus]:
            n_contigs = len(paired_uids[ofo['name']]) + 1  # total n contigs in the droplet
            if args.allowed_meta_keys_values is not None:
                mv_uids = [ofo['name']] + copy.copy(paired_uids[ofo['name']])  # uids in this droplet that have the required meta info values
                for mkey, mval in args.allowed_meta_keys_values.items():
                    mv_uids = [u for u in mv_uids if input_metafos[u][mkey] == mval]  # reduce mv_uids to the uids that have the required meta value
                if len(mv_uids) != n_contigs:
                    # print('    reducing n_contigs with %s=%s: %d %d (%s --> %s)' % (mkey, mval, n_contigs, len(mv_uids), [guessed_metafos[u][mkey] for u in [ofo['name']] + paired_uids[ofo['name']]], [guessed_metafos[u][mkey] for u in mv_uids]))
                    if n_contigs in args.allowed_contigs_per_droplet and len(mv_uids) not in args.allowed_contigs_per_droplet:
                        n_meta_removed += 1  # keep track of how many were removed only because of the meta info requirements
                    if n_contigs not in args.allowed_contigs_per_droplet and len(mv_uids) in args.allowed_contigs_per_droplet:
                        n_meta_added += 1  # and how many were added only because of the meta info requirements
                n_contigs = len(mv_uids)  # n contigs that have the required meta values
            if n_contigs in args.allowed_contigs_per_droplet:
                new_outfos[locus].append(ofo)
            else:
                n_ctg_removed[n_contigs] += 1
                removed_uids.add(ofo['name'])
        if sum(n_ctg_removed.values()) > 0:
            print('    %s --allowed-contigs-per-droplet: removed %d / %d contigs that were in droplets that didn\'t have an allowed number of contigs (%s): %s' % (utils.locstr(locus), sum(n_ctg_removed.values()), len(outfos[locus]), ' '.join(str(n) for n in args.allowed_contigs_per_droplet), '  '.join('%s: %d'%(k, v) for k, v in n_ctg_removed.items())))
        if args.allowed_meta_keys_values is not None and n_meta_removed > 0:
            print('        --allowed-meta-keys-values: %d were removed (and %d were kept) because of the meta info requirements: %s' % (n_meta_removed, n_meta_added, args.allowed_meta_keys_values))
    outfos = new_outfos

removed_uids |= set(s['name'] for s in failed_seqs)
if len(removed_uids) > 0:
    start = time.time()
    n_removed = 0
    for fid in removed_uids:
        if fid in paired_uids:
            del paired_uids[fid]
            n_removed += 1
    paired_uids = {uid : sorted(set(paired_uids[uid]) - removed_uids) for uid in paired_uids}  # also remove them from everyone else's lists of paired uids
    print('    removed %d uids from paired_uids (%d failed, %d removed b/c of disallowed N contigs) in %.1fs' % (n_removed, len(failed_seqs), len(removed_uids) - len(failed_seqs), time.time() - start))

if args.debug and len(paired_uids) > 0:
    print_pairing_info(outfos, paired_uids)

print('writing to %s/' % args.outdir)
if len(failed_seqs) > 0:
    write_locus_file('failed', failed_seqs)

for locus in outfos:  # first write the single files with all seqs for each locus
    write_locus_file(locus, outfos[locus])

omfname = '%s/meta.yaml' % args.outdir
if args.guess_pairing_info:
    utils.jsdump(omfname, guessed_metafos)  # NOTE file name duplicates code in bin/partis
elif args.input_metafname is not None and not os.path.exists(omfname):
    utils.makelink(os.path.dirname(omfname), args.input_metafname, omfname)

if len(paired_uids) == 0:
    print('no pairing info')
else:
    print('writing to paired subdirs')
    for lpair in utils.locus_pairs[args.ig_or_tr]:
        print('  %s:' % '+'.join(lpair))
        for l_other, ltmp in [lpair, reversed(lpair)]:
            all_paired_uids = set(pid for s in outfos[ltmp] if s['name'] in paired_uids for pid in paired_uids[s['name']])  # all uids that are paired with any <ltmp> uid (not necessarily *correctly* paired, at this stage it likely just means they're in the same droplet)
            other_outfo = [sfo for sfo in outfos[l_other] if sfo['name'] in all_paired_uids]  # any <l_other> locus seq that was in a droplet with an <ltmp> uid
            write_locus_file(l_other, other_outfo, lpair=lpair, extra_str='    ', totstr=' / %s'%len(outfos[l_other]))
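
For orientation, a hedged usage sketch of the script above: the flags come from its argparse block and the output names from its description string and write calls, while the file paths are made up for illustration.

    # illustrative invocation of bin/split-loci.py (paths are hypothetical)
    import subprocess
    subprocess.check_call([
        './bin/split-loci.py', 'input-seqs.fa',
        '--outdir', 'split-output',
        '--reverse-negative-strands',  # align each seq in both senses, keep the better one
        '--guess-pairing-info',        # infer pairing info from droplet ids in seq names
    ])
    # expected layout under split-output/: igh.fa, igk.fa, igl.fa (one fasta per locus),
    # failed.fa (seqs with no vsearch match), meta.yaml (pairing/meta info), and, when
    # pairing info is present, paired subdirs such as igh+igk/ and igh+igl/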