partis-bcr 1.0.0-py3-none-any.whl → 1.0.1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (103)
  1. bin/FastTree +0 -0
  2. bin/add-chimeras.py +59 -0
  3. bin/add-seqs-to-outputs.py +81 -0
  4. bin/bcr-phylo-run.py +799 -0
  5. bin/build.sh +24 -0
  6. bin/cf-alleles.py +97 -0
  7. bin/cf-germlines.py +57 -0
  8. bin/cf-linearham.py +199 -0
  9. bin/chimera-plot.py +76 -0
  10. bin/choose-partially-paired.py +143 -0
  11. bin/circle-plots.py +30 -0
  12. bin/compare-plotdirs.py +298 -0
  13. bin/diff-parameters.py +133 -0
  14. bin/docker-hub-push.sh +6 -0
  15. bin/extract-pairing-info.py +55 -0
  16. bin/gcdyn-simu-run.py +223 -0
  17. bin/gctree-run.py +244 -0
  18. bin/get-naive-probabilities.py +126 -0
  19. bin/iqtree-1.6.12 +0 -0
  20. bin/lonr.r +1020 -0
  21. bin/makeHtml +52 -0
  22. bin/mds-run.py +46 -0
  23. bin/parse-output.py +277 -0
  24. bin/partis +1869 -0
  25. bin/partis-pip +116 -0
  26. bin/partis.py +1869 -0
  27. bin/plot-gl-set-trees.py +519 -0
  28. bin/plot-hmms.py +151 -0
  29. bin/plot-lb-tree.py +427 -0
  30. bin/raxml-ng +0 -0
  31. bin/read-bcr-phylo-trees.py +38 -0
  32. bin/read-gctree-output.py +166 -0
  33. bin/run-chimeras.sh +64 -0
  34. bin/run-dtr-scan.sh +25 -0
  35. bin/run-paired-loci.sh +100 -0
  36. bin/run-tree-metrics.sh +88 -0
  37. bin/smetric-run.py +62 -0
  38. bin/split-loci.py +317 -0
  39. bin/swarm-2.1.13-linux-x86_64 +0 -0
  40. bin/test-germline-inference.py +425 -0
  41. bin/tree-perf-run.py +194 -0
  42. bin/vsearch-2.4.3-linux-x86_64 +0 -0
  43. bin/vsearch-2.4.3-macos-x86_64 +0 -0
  44. bin/xvfb-run +194 -0
  45. partis_bcr-1.0.1.data/scripts/cf-alleles.py +97 -0
  46. partis_bcr-1.0.1.data/scripts/cf-germlines.py +57 -0
  47. partis_bcr-1.0.1.data/scripts/extract-pairing-info.py +55 -0
  48. partis_bcr-1.0.1.data/scripts/gctree-run.py +244 -0
  49. partis_bcr-1.0.1.data/scripts/parse-output.py +277 -0
  50. partis_bcr-1.0.1.data/scripts/split-loci.py +317 -0
  51. partis_bcr-1.0.1.data/scripts/test.py +1005 -0
  52. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/METADATA +1 -1
  53. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/RECORD +101 -50
  54. partis_bcr-1.0.1.dist-info/top_level.txt +1 -0
  55. {partis → python}/glutils.py +1 -1
  56. python/main.py +30 -0
  57. {partis → python}/plotting.py +10 -1
  58. {partis → python}/treeutils.py +18 -16
  59. {partis → python}/utils.py +14 -7
  60. partis/main.py +0 -59
  61. partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
  62. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/WHEEL +0 -0
  63. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/entry_points.txt +0 -0
  64. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/licenses/COPYING +0 -0
  65. {partis → python}/__init__.py +0 -0
  66. {partis → python}/alleleclusterer.py +0 -0
  67. {partis → python}/allelefinder.py +0 -0
  68. {partis → python}/alleleremover.py +0 -0
  69. {partis → python}/annotationclustering.py +0 -0
  70. {partis → python}/baseutils.py +0 -0
  71. {partis → python}/cache/__init__.py +0 -0
  72. {partis → python}/cache/cached_uncertainties.py +0 -0
  73. {partis → python}/clusterpath.py +0 -0
  74. {partis → python}/coar.py +0 -0
  75. {partis → python}/corrcounter.py +0 -0
  76. {partis → python}/datautils.py +0 -0
  77. {partis → python}/event.py +0 -0
  78. {partis → python}/fraction_uncertainty.py +0 -0
  79. {partis → python}/gex.py +0 -0
  80. {partis → python}/glomerator.py +0 -0
  81. {partis → python}/hist.py +0 -0
  82. {partis → python}/hmmwriter.py +0 -0
  83. {partis → python}/hutils.py +0 -0
  84. {partis → python}/indelutils.py +0 -0
  85. {partis → python}/lbplotting.py +0 -0
  86. {partis → python}/mds.py +0 -0
  87. {partis → python}/mutefreqer.py +0 -0
  88. {partis → python}/paircluster.py +0 -0
  89. {partis → python}/parametercounter.py +0 -0
  90. {partis → python}/paramutils.py +0 -0
  91. {partis → python}/partitiondriver.py +0 -0
  92. {partis → python}/partitionplotter.py +0 -0
  93. {partis → python}/performanceplotter.py +0 -0
  94. {partis → python}/plotconfig.py +0 -0
  95. {partis → python}/processargs.py +0 -0
  96. {partis → python}/prutils.py +0 -0
  97. {partis → python}/recombinator.py +0 -0
  98. {partis → python}/scanplot.py +0 -0
  99. {partis → python}/seqfileopener.py +0 -0
  100. {partis → python}/treegenerator.py +0 -0
  101. {partis → python}/viterbicluster.py +0 -0
  102. {partis → python}/vrc01.py +0 -0
  103. {partis → python}/waterer.py +0 -0
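
Note: the biggest structural change in this release is the rename of the wheel's top-level package from partis to python (entries 55-103 above, plus the swapped top_level.txt files). The modules themselves move unchanged, so downstream code only needs new import paths. A minimal sketch of the before/after spellings, mirroring how the bundled scripts below locate their modules (the 1.0.0 spellings are an assumption based on the old partis/ layout):

# sketch: import paths before and after the 1.0.0 -> 1.0.1 package rename
import sys
from pathlib import Path

# the bundled scripts put the install root on sys.path, then import through
# the new top-level package name 'python'
sys.path.insert(1, str(Path(__file__).parent.parent))

import python.utils as utils      # 1.0.0 (assumed): import partis.utils as utils
import python.glutils as glutils  # 1.0.0 (assumed): import partis.glutils as glutils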
parse-output.py
@@ -0,0 +1,277 @@
+ #!python
+ from __future__ import absolute_import, division, unicode_literals
+ from __future__ import print_function
+ import sys
+ import csv
+ from io import open
+ csv.field_size_limit(sys.maxsize) # make sure we can write very large csv fields
+ import os
+ import argparse
+ import colored_traceback.always
+ from pathlib import Path
+
+ # if you move this script, you'll need to change this method of getting the imports
+ partis_dir = str(Path(__file__).parent.parent)
+ sys.path.insert(1, partis_dir) # + '/python')
+
+ import python.utils as utils
+ import python.glutils as glutils
+ from python.clusterpath import ClusterPath
+ import python.paircluster as paircluster
+
+ # ----------------------------------------------------------------------------------------
+ def count_plot(tglfo, tlist, plotdir, paired_loci=None):
+     if len(tlist) == 0:
+         return
+     if args.plot_tree_mut_stats:
+         import python.plotting as plotting
+         plotting.plot_tree_mut_stats(plotdir, tlist, args.is_simu, only_leaves=args.only_plot_leaves, treefname=args.treefname)
+         plotting.make_html(plotdir)
+         return
+     if args.only_count_correlations:
+         from python.corrcounter import CorrCounter
+         ccounter = CorrCounter(paired_loci=paired_loci)
+         for line in tlist:
+             l_info = None
+             if paired_loci is not None:
+                 line, l_info = line
+             ccounter.increment(line, l_info=l_info)
+         ccounter.plot(plotdir + '/correlations', only_csv=args.only_csv_plots, debug=args.debug)
+         return
+     if args.simfname is not None:
+         simglfo, true_antn_list, _ = utils.read_output(args.simfname)
+         true_antn_dict = {}
+         for true_line in true_antn_list:
+             for iseq, uid in enumerate(true_line['unique_ids']):
+                 true_antn_dict[uid] = utils.synthesize_single_seq_line(true_line, iseq)
+         # true_antn_dict = utils.get_annotation_dict(true_antn_list)
+         from python.performanceplotter import PerformancePlotter
+         perfplotter = PerformancePlotter('hmm')
+         n_failed = 0
+         for line in tlist:
+             if line['invalid']:
+                 n_failed += 1
+                 continue
+             for iseq, uid in enumerate(line['unique_ids']): # NOTE this counts rearrangement-level parameters once for every mature sequence, which is inconsistent with the pcounters... but I think might make more sense here?
+                 _ = perfplotter.evaluate(true_antn_dict[uid], utils.synthesize_single_seq_line(line, iseq), simglfo=simglfo)
+         perfplotter.plot(args.plotdir, only_csv=args.only_csv_plots)
+         if n_failed > 0:
+             print(' %s %d / %d failed queries' % (utils.color('yellow', 'warning'), n_failed, len([u for l in tlist for u in l['unique_ids']])))
+         if args.only_plot_performance:
+             return
+     assert not args.paired # only handled for correlation counting atm
+     from python.parametercounter import ParameterCounter
+     setattr(args, 'region_end_exclusions', {r : [0 for e in ['5p', '3p']] for r in utils.regions}) # hackity hackity hackity
+     pcounter = ParameterCounter(tglfo, args) # NOTE doesn't count correlations by default
+     for line in tlist:
+         pcounter.increment(line)
+     pcounter.plot(plotdir, only_csv=args.only_csv_plots, only_overall=args.only_overall_plots) #, make_per_base_plots=True) , make_per_base_plots=True
+
+ # ----------------------------------------------------------------------------------------
+ helpstr = """
+ Extract sequences from a partis output file and write them to a fasta, csv, or tsv file, optionally with a limited amount of extra information for each sequence.
+ For details of partis output files, see the manual.
+ To view the partitions and annotations in a partis output file, use the partis \'view-output\' action.
+ Example usage:
+ bin/parse-output.py test/reference-results/partition-new-simu.yaml out.fa
+ bin/parse-output.py test/paired/ref-results/partition-new-simu outdir --paired
+ """
+ class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+     pass
+ formatter_class = MultiplyInheritedFormatter
+ parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter, description=helpstr)
+ parser.add_argument('infile', help='partis output file from which to read input')
+ parser.add_argument('outfile', help='File to which to write output extracted from <infile> (fasta or csv/tsv). If --paired is set, this must be a directory, to which will be written a fasta with all sequences, a yaml with pairing info, and a csv with h/l sequence pairs.')
+ parser.add_argument('--paired', action='store_true', help='if set, <infile> should be a paired output dir, rather than a single file')
+ parser.add_argument('--extra-columns', help='colon-separated list of additional partis output columns (beyond sequences), to write to the output file. If writing to a fasta file, the column values are appended after the sequence name, separated by --fasta-info-separator. If writing to csv/tsv, they\'re written as proper, labeled columns.')
+ parser.add_argument('--partition-index', type=int, help='if set, use the partition at this index in the cluster path, rather than the default of using the best partition')
+ parser.add_argument('--seed-unique-id', help='if set, take sequences only from the cluster containing this seed sequence, rather than the default of taking all sequences from all clusters')
+ parser.add_argument('--cluster-index', type=int, help='if set, take sequences only from the cluster at this index in the partition, rather than the default of taking all sequences from all clusters. This index is with respect to the cluster order found in the file (which, in contrast to plots made by --plotdir, is *not* sorted by size)')
+ parser.add_argument('--sort-by-size', action='store_true', help='if set, sort clusters in partition by decreasing size before applying --cluster-index')
+ parser.add_argument('--indel-reversed-seqs', action='store_true', help='if set, take sequences that have had any shm indels "reversed" (i.e. insertions are reversed, and deletions are replaced with the germline bases) rather than the default of using sequences from the original input file. Indel-reversed sequences can be convenient because they are by definition the same length as and aligned to the naive sequence.')
+ parser.add_argument('--glfo-dir', help='Directory with germline info. Only necessary for old-style csv output files. Equivalent to a parameter dir with \'/hmm/germline-sets\' appended.')
+ parser.add_argument('--template-glfo-dir', help='use this glfo dir as a template when reading --glfo-dir (only used for airr input atm)')
+ parser.add_argument('--locus', default='igh', help='only used for old-style csv output files')
+ parser.add_argument('--plotdir', help='if set, plot annotation parameters from infile to --plotdir and exit (you still have to set outfile, sorry, since it\'s nice having it be a positional arg, but it doesn\'t get used for this). To add e.g. per-gene-per-position plots comment/uncomment args in the call below.')
+ parser.add_argument('--only-count-correlations', action='store_true', help='')
+ parser.add_argument('--only-plot-performance', action='store_true', help='')
+ parser.add_argument('--fasta-info-separator', default=' ', help='character used to separate the sequence name from any --extra-columns values in fasta header lines')
+ parser.add_argument('--debug', type=int, default=0)
+ parser.add_argument('--airr-input', action='store_true', help='read input in AIRR tsv format, and if output file suffix is .yaml write partis output.')
+ parser.add_argument('--airr-output', action='store_true', help='write output in AIRR tsv format')
+ parser.add_argument('--skip-other-locus', action='store_true', help='if --airr-output is set, this tells us to skip lines from the other locus')
+ parser.add_argument('--skip-columns', help='don\'t write these columns to output (atm only implemented for airr output, since we need to remove the clone_id column so scoper doesn\'t crash)')
+ parser.add_argument('--simfname', help='simulation file corresponding to input file (i.e. presumably <infile> is inference that was performed on --simfname)')
+ parser.add_argument('--only-csv-plots', action='store_true', help='only write csv versions of plots (not svg), which is a lot faster')
+ parser.add_argument('--only-make-plots', action='store_true', help='if --plotdir is set, set this to only do plotting, i.e. don\'t do the usual/default file reading/conversion')
+ parser.add_argument('--plot-tree-mut-stats', action='store_true', help='plot tree mutation stats and exit')
+ parser.add_argument('--only-plot-leaves', action='store_true', help='only affects --plot-tree-mut-stats')
+ parser.add_argument('--is-simu', action='store_true', help='only affects --plot-tree-mut-stats')
+ parser.add_argument('--only-overall-plots', action='store_true', help='TODO')
+ parser.add_argument('--treefname', help='only affects --plot-tree-mut-stats')
+ parser.add_argument('--meta-info-key-to-color', help='see partis help')
+ parser.add_argument('--meta-emph-formats', help='see partis help')
+ parser.add_argument('--meta-info-to-emphasize', help='see partis help')
+
+ if 'extract-fasta.py' in sys.argv[0]: # if they're trying to run this old script, which is now just a link to this one, print a warning and rejigger the arguments so it still works
+     print(' note: running deprecated script %s, which currently is just a link pointing to %s' % (os.path.basename(sys.argv[0]), os.path.basename(os.path.realpath(__file__))))
+     print(' note: transferring deprecated arguments --input-file and --fasta-output-file to the first two positional arguments (this will continue to work, you only need to change things if you want this warning to go away)')
+     utils.insert_in_arglist(sys.argv, [utils.get_val_from_arglist(sys.argv, '--input-file'), utils.get_val_from_arglist(sys.argv, '--fasta-output-file')], sys.argv[0])
+     utils.remove_from_arglist(sys.argv, '--input-file', has_arg=True)
+     utils.remove_from_arglist(sys.argv, '--fasta-output-file', has_arg=True)
+
+ args = parser.parse_args()
+ args.extra_columns = utils.get_arg_list(args.extra_columns)
+ args.meta_emph_formats = utils.get_arg_list(args.meta_emph_formats, key_val_pairs=True)
+ utils.meta_emph_arg_process(args)
+ if args.paired:
+     if utils.getsuffix(args.outfile) != '':
+         raise Exception('--outfile \'%s\' must be a directory, but it has a non-empty suffix \'%s\'' % (args.outfile, utils.getsuffix(args.outfile)))
+ else:
+     assert utils.getsuffix(args.outfile) in ['.csv', '.tsv', '.fa', '.fasta', '.yaml'] # or args.airr_input and utils.getsuffix(args.outfile) == '.yaml'
+
+ default_glfo_dir = partis_dir + '/data/germlines/human'
+ if utils.getsuffix(args.infile) in ['.csv', '.tsv'] and args.glfo_dir is None:
+     print(' note: reading csv/tsv format without germline info, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn\'t crash, it\'s probably ok.' % default_glfo_dir)
+     args.glfo_dir = default_glfo_dir
+
+ # ----------------------------------------------------------------------------------------
+ # read input
+ if args.paired:
+     if not os.path.isdir(args.infile):
+         raise Exception('--infile \'%s\' either doesn\'t exist or it isn\'t a directory' % args.infile)
+     lp_infos = paircluster.read_paired_dir(args.infile)
+ else:
+     if args.airr_input:
+         glfo, glfd = None, args.glfo_dir
+         if args.template_glfo_dir is not None: # NOTE only handled for airr input at the moment, cause that's what i need it for right now
+             glfo = glutils.read_glfo(args.glfo_dir, args.locus, template_glfo=glutils.read_glfo(args.template_glfo_dir, args.locus))
+             # glutils.write_glfo(args.glfo_dir + '-parsed', glfo, debug=True)
+             glfd = None
+         glfo, annotation_list, cpath = utils.read_airr_output(args.infile, locus=args.locus, glfo=glfo, glfo_dir=glfd, skip_other_locus=args.skip_other_locus)
+     else:
+         glfo, annotation_list, cpath = utils.read_output(args.infile, glfo_dir=args.glfo_dir, locus=args.locus)
+
+ # plot
+ if args.plotdir is not None:
+     if args.paired:
+         for lpair in utils.locus_pairs['ig']:
+             if lp_infos[tuple(lpair)]['glfos'] is None:
+                 continue
+             for ltmp in lpair:
+                 count_plot(lp_infos[tuple(lpair)]['glfos'][ltmp], lp_infos[tuple(lpair)]['antn_lists'][ltmp], '%s/%s/%s'%(args.plotdir, '+'.join(lpair), ltmp))
+             antn_pairs = paircluster.find_cluster_pairs(lp_infos, lpair) #, debug=True)
+             count_plot(None, antn_pairs, '%s/%s'%(args.plotdir, '+'.join(lpair)), paired_loci=[l['loci'][0] for l in antn_pairs[0]])
+     else:
+         count_plot(glfo, annotation_list, args.plotdir)
+     if args.only_make_plots:
+         sys.exit(0)
+
+ if args.paired:
+     glfos, antn_lists, cpaths = paircluster.concat_heavy_chain(utils.locus_pairs['ig'], lp_infos, dont_deep_copy=True) # NOTE this is a pretty arbitrary way to combine the partitions for the seqs with uncertain pairing info, but whatever
+     outfos, metafos = paircluster.get_combined_outmetafos(antn_lists)
+     paircluster.write_combined_fasta_and_meta('%s/all-seqs.fa'%args.outfile, '%s/meta.yaml'%args.outfile, outfos, metafos)
+     outfos = paircluster.find_seq_pairs(antn_lists)
+     print(' writing sequence id pairs to %s' % '%s/seq-pairs.csv'%args.outfile)
+     with open('%s/seq-pairs.csv'%args.outfile, utils.csv_wmode()) as cfile:
+         okeys = ['%s_%s'%(c, s) for s in ('id', 'locus', 'seq') for c in 'hl']
+         writer = csv.DictWriter(cfile, okeys) # sorted(outfos[0].keys())
+         writer.writeheader()
+         for ofo in outfos:
+             writer.writerow({k : ofo[k] for k in okeys})
+     if args.airr_output:
+         for ltmp in sorted(glfos):
+             utils.write_airr_output('%s/%s.tsv'%(args.outfile, ltmp), antn_lists[ltmp], cpath=cpaths[ltmp], glfo=glfos[ltmp])
+     sys.exit(0)
+
+ # restrict to certain partitions/clusters
+ if cpath is None or cpath.i_best is None:
+     clusters_to_use = [l['unique_ids'] for l in annotation_list]
+     print(' no cluster path in input file, so just using all %d sequences (in %d clusters) in annotations' % (sum(len(c) for c in clusters_to_use), len(clusters_to_use)))
+ else:
+     ipartition = cpath.i_best if args.partition_index is None else args.partition_index
+     print(' found %d clusters with %d seqs in %s' % (len(cpath.partitions[ipartition]), sum(len(c) for c in cpath.partitions[ipartition]), 'best partition' if args.partition_index is None else 'partition at index %d (of %d)' % (ipartition, len(cpath.partitions))))
+     modified = False
+     if args.cluster_index is None:
+         clusters_to_use = cpath.partitions[ipartition]
+         print(' taking all %d clusters' % len(clusters_to_use))
+     else:
+         ptn = cpath.partitions[ipartition]
+         if args.sort_by_size:
+             ptn = sorted(cpath.partitions[ipartition], key=len, reverse=True)
+         clusters_to_use = [ptn[args.cluster_index]]
+         modified = True
+         print(' taking cluster at index %d with size %d%s' % (args.cluster_index, len(clusters_to_use[0]), ' after sorting by size' if args.sort_by_size else ''))
+     if args.seed_unique_id is not None:
+         clusters_to_use = [c for c in clusters_to_use if args.seed_unique_id in c] # NOTE can result in more than one cluster with the seed sequence (e.g. if this file contains intermediate annotations from seed partitioning)
+         modified = True
+         print(' removing clusters not containing sequence \'%s\' (leaving %d)' % (args.seed_unique_id, len(clusters_to_use)))
+     if modified:
+         cpath = ClusterPath(partition=clusters_to_use, seed_unique_id=args.seed_unique_id)
+         antn_dict = utils.get_annotation_dict(annotation_list)
+         annotation_list = [antn_dict[':'.join(c)] for c in clusters_to_use if ':'.join(c) in antn_dict]
+
+ if not os.path.exists(os.path.dirname(os.path.abspath(args.outfile))):
+     os.makedirs(os.path.dirname(os.path.abspath(args.outfile)))
+
+ if args.airr_output:
+     print(' writing %d annotations%s to %s' % (len(annotation_list), '' if cpath is None else ' (with partition: %d seqs in %d clusters)'%(sum(len(c) for c in cpath.best()), len(cpath.best())), args.outfile))
+     utils.write_airr_output(args.outfile, annotation_list, cpath=cpath, extra_columns=args.extra_columns, skip_columns=args.skip_columns)
+     sys.exit(0)
+
+ # condense partis info into <seqfos> for fasta/csv output
+ n_skipped, n_failed_to_add = 0, 0
+ seqfos = []
+ antn_dict = utils.get_annotation_dict(annotation_list)
+ for cluster in clusters_to_use:
+     if ':'.join(cluster) not in antn_dict:
+         n_skipped += 1
+         # print ' %s cluster with size %d not in annotations, so skipping it' % (utils.color('yellow', 'warning'), len(cluster))
+         continue
+     cluster_annotation = antn_dict[':'.join(cluster)]
+     newfos = [{'name' : u, 'seq' : s} for u, s in zip(cluster_annotation['unique_ids'], cluster_annotation['seqs' if args.indel_reversed_seqs else 'input_seqs'])]
+     if args.extra_columns is not None:
+         for ecol in args.extra_columns:
+             if ecol not in cluster_annotation:
+                 utils.add_extra_column(ecol, cluster_annotation, cluster_annotation, glfo=glfo)
+             if ecol not in cluster_annotation:
+                 n_failed_to_add += 1
+                 cluster_annotation[ecol] = None
+             for iseq in range(len(newfos)):
+                 ival = cluster_annotation[ecol]
+                 if ival is not None and ecol in utils.linekeys['per_seq']:
+                     ival = ival[iseq]
+                 newfos[iseq][ecol] = ival
+     seqfos += newfos
+ if n_skipped > 0:
+     print(' missing annotations for %d sequences' % n_skipped)
+ if n_failed_to_add > 0:
+     print(' %s couldn\'t add \'%s\' to %d / %d annotations' % (utils.wrnstr(), ecol, n_failed_to_add, len(clusters_to_use) - n_skipped))
+
+ # write output
+ with open(args.outfile, utils.csv_wmode()) as ofile:
+     if utils.getsuffix(args.outfile) in ['.csv', '.tsv']:
+         print(' writing %d sequences to %s' % (len(seqfos), args.outfile))
+         writer = csv.DictWriter(ofile, list(seqfos[0].keys()), delimiter=str(',') if utils.getsuffix(args.outfile)=='.csv' else '\t')
+         writer.writeheader()
+         for sfo in seqfos:
+             writer.writerow(sfo)
+     elif utils.getsuffix(args.outfile) in ['.fa', '.fasta']:
+         print(' writing %d sequences to %s' % (len(seqfos), args.outfile))
+         for sfo in seqfos:
+             estr = ''
+             if args.extra_columns is not None:
+                 estr = args.fasta_info_separator
+                 estr += args.fasta_info_separator.join(str(sfo[c]) for c in args.extra_columns)
+             ofile.write('>%s%s\n%s\n' % (sfo['name'], estr, sfo['seq']))
+     elif utils.getsuffix(args.outfile) == '.yaml':
+         true_partition = None
+         if args.simfname is not None:
+             print(' reading true partition from %s' % args.simfname)
+             _, _, true_cpath = utils.read_output(args.simfname, skip_annotations=True)
+             true_partition = true_cpath.best()
+         plines = cpath.get_partition_lines(true_partition=true_partition, calc_missing_values='none' if true_partition is None else 'best')
+         print(' writing %d annotations with %d partition%s to %s' % (len(annotation_list), len(plines), utils.plural(len(plines)), args.outfile))
+         utils.write_annotations(args.outfile, glfo, annotation_list, utils.add_lists(utils.annotation_headers, args.extra_columns), partition_lines=plines)
+     else:
+         assert False
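
For reference, a minimal consumer-side sketch of reading the output this script writes (the file names out.csv and out.fa are hypothetical): csv/tsv output has one row per sequence with 'name' and 'seq' columns plus any --extra-columns, while fasta output appends the extra-column values after the sequence name, joined by --fasta-info-separator.

import csv

# csv/tsv: one row per sequence ('name', 'seq', then any --extra-columns)
with open('out.csv') as cfile:
    for row in csv.DictReader(cfile):
        print(row['name'], len(row['seq']))

# fasta: extra-column values trail the sequence name on the header line
with open('out.fa') as ffile:
    for line in ffile:
        if line.startswith('>'):
            fields = line[1:].strip().split(' ')  # assumes default --fasta-info-separator ' '
            name, extras = fields[0], fields[1:]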
split-loci.py
@@ -0,0 +1,317 @@
+ #!python
+ from __future__ import absolute_import, division, unicode_literals
+ from __future__ import print_function
+ import json
+ import csv
+ import os
+ import sys
+ import argparse
+ import operator
+ import colored_traceback.always
+ import collections
+ import copy
+ from collections import defaultdict
+ import random
+ import numpy
+ from io import open
+ import time
+ from pathlib import Path
+
+ # if you move this script, you'll need to change this method of getting the imports
+ partis_dir = str(Path(__file__).parent.parent)
+ sys.path.insert(1, partis_dir) # + '/python')
+
+ import python.utils as utils
+ import python.paircluster as paircluster
+ import python.glutils as glutils
+ from python.clusterpath import ClusterPath
+ import python.seqfileopener as seqfileopener
+
+ # ----------------------------------------------------------------------------------------
+ dstr = """
+ Uses vsearch (or the \'locus\' key in --input-metafname) to split the sequences in <fname> according to their loci, writing each locus to its own fasta file <locus>.fa.
+ If \'paired-uids\' are available in --input-metafname, also splits the heavy sequences according to the light chain locus with which they\'re paired, resulting in subdirectories e.g. igh+igk/ and igh+igl/.
+ Use --reverse-negative-strands to check both senses for each input sequence.
+ """
+ parser = argparse.ArgumentParser(description=dstr,
+     formatter_class=argparse.ArgumentDefaultsHelpFormatter) # why tf isn't this printing the defaults?
+ parser.add_argument('fname', help='fasta input file')
+ parser.add_argument('--outdir', help='directory to which to write output files (if not set, output is written to directory of <fname>)')
+ parser.add_argument('--reverse-negative-strands', action='store_true', help='align every sequence both forwards and revcomp\'d, then for each sequence keep the sense with better alignment.')
+ parser.add_argument('--species', default='human', choices=('human', 'macaque', 'mouse'), help='Which species?')
+ parser.add_argument('--germline-dir', default=utils.get_partis_dir() + '/data/germlines', help='doesn\'t need to be the germlines corresponding to this sample since it\'s just so it can figure out which is igh vs igk vs igl, so the default is probably fine')
+ parser.add_argument('--workdir', default=utils.choose_random_subdir('/tmp/%s/partis' % os.getenv('USER', default='partis-work')), help='working directory for vsearch')
+ parser.add_argument('--vsearch-binary', help='Path to vsearch binary (vsearch binaries for linux and darwin are included in partis/bin/, so leaving this unset should work, but for other systems you need to get your own)')
+ parser.add_argument('--vsearch-threshold', type=float, default=0.4, help='default identity threshold for vsearch')
+ parser.add_argument('--debug', type=int, default=1)
+ parser.add_argument('--overwrite', action='store_true')
+ parser.add_argument('--random-seed', type=int, default=1)
+ parser.add_argument('--guess-pairing-info', action='store_true', help=utils.did_help['guess'])
+ parser.add_argument('--droplet-id-separators', help=utils.did_help['seps'])
+ parser.add_argument('--droplet-id-indices', help=utils.did_help['indices'])
+ parser.add_argument('--fasta-info-index', type=int, help='zero-based index in fasta info/meta string of sequence name/uid (e.g. if name line is \'>stuff more-stuff NAME extra-stuff\' the index should be 2)')
+ parser.add_argument('--allowed-contigs-per-droplet', help='if set, discard sequences from droplets that contain any number of contigs not in this colon-separated list')
+ parser.add_argument('--allowed-meta-keys-values', help='if set, require that the kept contigs from --allowed-contigs-per-droplet have these key:value pairs (colon-separated list of comma-separated key:value pairs)')
+ parser.add_argument('--input-metafname', help='yaml file with meta information keyed by sequence id. See same argument in main partis help, and https://github.com/psathyrella/partis/blob/master/docs/subcommands.md#input-meta-info for an example.')
+ parser.add_argument('--for-testing-n-max-queries', type=int, default=-1, help='only for testing, applied when reading initial fasta file, just in case it\'s huge and you want to run quickly without having to read the whole file')
+ parser.add_argument('--n-max-queries', type=int, default=-1, help='see partis help (although here it applies to droplets, not individual seqs)')
+ parser.add_argument('--n-random-queries', type=int, help='see partis help (although here it applies to droplets, not individual seqs)')
+ parser.add_argument('--ig-or-tr', default='ig', choices=list(utils.locus_pairs.keys()), help='antibodies or TCRs?')
+
+ # ----------------------------------------------------------------------------------------
+ def use_rev_comp(pline, rline): # decide whether positive sense <pline> or negative sense <rline> has better alignment
+     assert pline['unique_ids'][0] == rline['unique_ids'][0]
+     if rline.get('invalid', False):
+         return False
+     elif pline.get('invalid', False):
+         return True
+     elif rline['score'] > pline['score']:
+         return True
+     else:
+         return False
+
+ # ----------------------------------------------------------------------------------------
+ def run_vsearch(seqfos): # run vsearch to see if you can get a match for each locus for every sequence
+     print(' running vsearch on %d sequences:' % len(seqfos))
+     n_rev_compd, n_total = 0, 0
+     for locus in utils.sub_loci(args.ig_or_tr):
+         lglfo = glutils.read_glfo(args.germline_dir, locus)
+         annotations = utils.run_vsearch_with_duplicate_uids('search', seqfos, args.workdir + '/vsearch', args.vsearch_threshold, glfo=lglfo, print_time=True, vsearch_binary=args.vsearch_binary, get_annotations=True, expect_failure=True, extra_str=' %s fwd:'%utils.color('blue', locus) if args.reverse_negative_strands else ' %s: '%locus) #, debug=args.debug>1)
+         assert len(annotations) == len(seqfos)
+         if args.reverse_negative_strands: # it might be nicer to use vsearch options to run on both senses at once, but otoh this might be nicer
+             revnotations = utils.run_vsearch_with_duplicate_uids('search', revfos, args.workdir + '/vsearch', args.vsearch_threshold, glfo=lglfo, print_time=True, vsearch_binary=args.vsearch_binary, get_annotations=True, expect_failure=True, extra_str=' rev:') #, debug=args.debug>1)
+             assert len(revnotations) == len(seqfos)
+         for il, (sfo, line) in enumerate(zip(seqfos, annotations)):
+             assert sfo['name'] == line['unique_ids'][0] # note that they're not full annotations, they just have a couple keys
+             if args.reverse_negative_strands and use_rev_comp(line, revnotations[il]):
+                 sfo['seq'] = revfos[il]['seq']
+                 line = revnotations[il]
+                 n_rev_compd += 1
+             sfo[locus] = line # add info for each locus to the input seqfos
+             n_total += 1
+     if args.reverse_negative_strands:
+         print(' used rev comp for %d/%d locus results (for %d seqs)' % (n_rev_compd, n_total, len(seqfos)))
+
+ # ----------------------------------------------------------------------------------------
+ def write_locus_file(locus, ofos, lpair=None, extra_str=' ', totstr=''):
+     ofn = paircluster.paired_fn(args.outdir, locus=locus, lpair=lpair)
+     if utils.output_exists(args, ofn, leave_zero_len=len(ofos)==0, offset=4): # NOTE not really sure this does anything (or if i want it) now that I'm cleaning/looking for the whole dir at the start of this script
+         return
+     if not os.path.exists(os.path.dirname(ofn)):
+         os.makedirs(os.path.dirname(ofn))
+     if len(ofos) == 0:
+         # print '%s%s: nothing to write' % (extra_str, locus)
+         open(ofn, 'w').close()
+         return
+     print('%s%s: %d%s to %s/%s' % (extra_str, locus, len(ofos), totstr, os.path.basename(os.path.dirname(ofn)), os.path.basename(ofn)))
+     with open(ofn, 'w') as lfile:
+         for sfo in ofos:
+             lfile.write('>%s\n%s\n' % (sfo['name'], sfo['seq']))
+
+ # ----------------------------------------------------------------------------------------
+ def read_meta_info(seqfos): # read all input meta info, and add pairing info (if present) to <paired_uids>
+     dummy_annotation_list = [{'unique_ids' : [sfo['name']]} for sfo in seqfos]
+     seqfileopener.read_input_metafo([args.input_metafname], dummy_annotation_list) # , required_keys=['paired-uids'])
+     for line in dummy_annotation_list:
+         uid = utils.get_single_entry(line['unique_ids'])
+         if 'loci' in line:
+             meta_loci[uid] = line['loci'][0]
+         if 'paired-uids' in line:
+             paired_uids[uid] = line['paired-uids'][0]
+     if len(paired_uids) > 0:
+         print(' read pairing info for %d seqs from input meta file' % len(paired_uids))
+         if len(paired_uids) < len(seqfos):
+             print(' %s only read pairing info for %d/%d seqfos' % (utils.color('yellow', 'warning'), len(paired_uids), len(seqfos)))
+     if len(meta_loci) > 0:
+         print(' read loci for %d sequences from input meta file (so not running vsearch)' % len(meta_loci))
+         if len(meta_loci) < len(seqfos):
+             print(' %s only read locus info for %d/%d seqfos' % (utils.color('yellow', 'warning'), len(meta_loci), len(seqfos)))
+     input_metafos = utils.read_json_yaml(args.input_metafname)
+     for uid in input_metafos: # we want to copy over any additional meta info (not paired uids or loci) to the output meta info file (since if we're guessing pair info, the uid names will change, so the original one is no good)
+         additional_mfo = {k : v for k, v in input_metafos[uid].items() if k not in ['loci', 'paired-uids']}
+         if len(additional_mfo) > 0:
+             input_metafos[uid] = additional_mfo
+     return input_metafos
+
+ # ----------------------------------------------------------------------------------------
+ def print_pairing_info(outfos, paired_uids):
+     loci_by_uid = {sfo['name'] : l for l in outfos for sfo in outfos[l]} # locus of each sequence, just for counting below
+     print_cutoff = 0.01
+     n_missing = 0
+     print(' count frac paired with')
+     for locus in utils.sub_loci(args.ig_or_tr):
+         plocicounts = {}
+         for sfo in outfos[locus]:
+             if sfo['name'] not in paired_uids:
+                 n_missing += 1
+                 continue
+             plstr = ' '.join(utils.locstr(l) for l in sorted([loci_by_uid.get(pid, '?') for pid in paired_uids[sfo['name']]]))
+             if plstr not in plocicounts:
+                 plocicounts[plstr] = 0
+             plocicounts[plstr] += 1
+         total = sum(plocicounts.values())
+         n_skipped = 0
+         for ipl, (plstr, counts) in enumerate(sorted(list(plocicounts.items()), key=operator.itemgetter(1), reverse=True)):
+             if counts / float(total) < print_cutoff:
+                 n_skipped += counts
+                 continue
+             print(' %s %6d %5.2f %s' % (utils.locstr(locus) if ipl==0 else ' ', counts, counts / float(total), plstr))
+         if n_skipped > 0:
+             print(' +%d counts skipped with <%.3f each' % (n_skipped, print_cutoff)) # utils.color('yellow', 'note
+     if n_missing > 0:
+         print(' %s %d uids missing from paired uids' % (utils.wrnstr(), n_missing))
+
+ # ----------------------------------------------------------------------------------------
+ args = parser.parse_args()
+ random.seed(args.random_seed)
+ numpy.random.seed(args.random_seed)
+ if os.path.dirname(args.fname) == '':
+     args.fname = '%s/%s' % (os.getcwd(), args.fname)
+ if args.outdir is None:
+     args.outdir = utils.getprefix(args.fname)
+ args.droplet_id_indices = utils.get_arg_list(args.droplet_id_indices, intify=True)
+ args.allowed_contigs_per_droplet = utils.get_arg_list(args.allowed_contigs_per_droplet, intify=True)
+ args.allowed_meta_keys_values = utils.get_arg_list(args.allowed_meta_keys_values, key_val_pairs=True)
+ args.input_metafname = utils.fpath(args.input_metafname)
+
+ if any(os.path.exists(ofn) for ofn in paircluster.paired_dir_fnames(args.outdir)):
+     if args.overwrite:
+         paircluster.clean_paired_dir(args.outdir)
+     else:
+         print(' split-loci.py output exists and --overwrite was not set, so not doing anything: %s' % args.outdir)
+         sys.exit(0)
+
+ seqfos = utils.read_fastx(args.fname, n_max_queries=args.for_testing_n_max_queries)
+ if args.n_max_queries != -1 or args.n_random_queries is not None:
+     seqfos = utils.subset_paired_queries(seqfos, args.droplet_id_separators, args.droplet_id_indices, n_max_queries=args.n_max_queries, n_random_queries=args.n_random_queries)
+ if args.fasta_info_index is not None:
+     for sfo in seqfos:
+         sfo['name'] = sfo['infostrs'][args.fasta_info_index]
+ if args.reverse_negative_strands:
+     revfos = [{'name' : s['name'], 'seq' : utils.revcomp(s['seq'])} for s in seqfos] # NOTE this is not on an equal footing with <seqfos>, since we add all the vsearch info to <seqfos>, then use it to decide on locus, and then to write output
+
+ if os.path.exists(args.germline_dir + '/' + args.species): # ick that is hackey
+     args.germline_dir += '/' + args.species
+
+ # read input meta file and/or run vsearch
+ paired_uids, meta_loci, input_metafos = {}, {}, {}
+ if args.input_metafname is not None:
+     input_metafos = read_meta_info(seqfos)
+ if len(meta_loci) == 0: # default: no input locus info
+     run_vsearch(seqfos)
+
+ # then, for each sequence, choose the locus with the best-scoring match (in practice i doubt you ever really get multiple loci with matches)
+ outfos = collections.OrderedDict(((l, []) for l in utils.sub_loci(args.ig_or_tr)))
+ failed_seqs = []
+ if args.debug > 1:
+     print(' printing scores for locus determination:')
+ n_skipped = 0
+ for sfo in seqfos:
+     if len(meta_loci) == 0: # default: use vsearch match scores
+         lscores = {l : sfo[l]['score'] if 'invalid' not in sfo[l] else 0 for l in utils.sub_loci(args.ig_or_tr)}
+         locus, max_score = sorted(list(lscores.items()), key=operator.itemgetter(1), reverse=True)[0]
+         if max_score == 0:
+             failed_seqs.append(sfo)
+             continue
+     else: # if we were passed input locus info
+         locus = meta_loci[sfo['name']]
+     outfos[locus].append(sfo)
+     if args.debug > 1:
+         def lpstr(spair):
+             l, s = spair
+             return '%s %s' % (utils.locstr(l) if l==locus else l.replace('ig', ''), utils.color('red' if s!=0 else None, '%3d'%s))
+         if list(lscores.values()).count(0) == 2:
+             n_skipped += 1
+         else:
+             print(' %s %s' % (' '.join(lpstr(s) for s in sorted(list(lscores.items()), key=operator.itemgetter(1), reverse=True)), sfo['name']))
+ if args.debug > 1 and n_skipped > 0:
+     print(' skipped %d seqs with non-zero scores from only one locus' % n_skipped)
+
+ print('totals: %s%s' % (' '.join(('%s %d'%(l, len(sfos))) for l, sfos in outfos.items()), '' if len(failed_seqs) == 0 else ' (%s: %d)'%(utils.color('yellow', 'failed'), len(failed_seqs))))
+ assert sum(len(ofo) for ofo in outfos.values()) + len(failed_seqs) == len(seqfos)
+
+ if args.guess_pairing_info:
+     if len(paired_uids) > 0:
+         raise Exception('can\'t/shouldn\'t guess pairing info if we already have it from elsewhere')
+     for locus in outfos:
+         for ofo in outfos[locus]:
+             new_name = ofo['name']
+             if '-' not in new_name or ofo['name'].split('-')[-1] != locus: # add locus (e.g. '-igh') to name, unless it's already there
+                 new_name = ofo['name'] + '-' + locus
+                 if ofo['name'] in input_metafos:
+                     input_metafos[new_name] = input_metafos[ofo['name']]
+                     del input_metafos[ofo['name']]
+                 ofo['name'] = new_name
+     guessed_metafos = utils.extract_pairing_info(seqfos, droplet_id_separators=args.droplet_id_separators, droplet_id_indices=args.droplet_id_indices, debug=max(1, args.debug))
+     for uid in set(guessed_metafos) & set(input_metafos):
+         guessed_metafos[uid].update(input_metafos[uid])
+     for uid, mfo in guessed_metafos.items():
+         paired_uids[uid] = mfo['paired-uids']
+
+ removed_uids = set()
+ if args.allowed_contigs_per_droplet is not None:
+     new_outfos = collections.OrderedDict(((l, []) for l in utils.sub_loci(args.ig_or_tr)))
+     for locus in outfos:
+         n_ctg_removed, n_meta_removed, n_meta_added = defaultdict(int), 0, 0
+         for ofo in outfos[locus]:
+             skip = False
+             n_contigs = len(paired_uids[ofo['name']]) + 1 # total n contigs in the droplet
+             if args.allowed_meta_keys_values is not None:
+                 mv_uids = [ofo['name']] + copy.copy(paired_uids[ofo['name']]) # uids in this droplet that have the required meta info values
+                 for mkey, mval in args.allowed_meta_keys_values.items():
+                     mv_uids = [u for u in mv_uids if input_metafos[u][mkey] == mval] # reduce mv_uids to the uids that have the required meta value
+                 if len(mv_uids) != n_contigs:
+                     # print(' reducing n_contigs with %s=%s: %d %d (%s --> %s)' % (mkey, mval, n_contigs, len(mv_uids), [guessed_metafos[u][mkey] for u in [ofo['name']] + paired_uids[ofo['name']]], [guessed_metafos[u][mkey] for u in mv_uids]))
+                     if n_contigs in args.allowed_contigs_per_droplet and len(mv_uids) not in args.allowed_contigs_per_droplet:
+                         n_meta_removed += 1 # keep track of how many were removed only because of the meta info requirements
+                     if n_contigs not in args.allowed_contigs_per_droplet and len(mv_uids) in args.allowed_contigs_per_droplet:
+                         n_meta_added += 1 # and how many were added only because of the meta info requirements
+                     n_contigs = len(mv_uids) # n contigs that have the required meta values
+             if n_contigs in args.allowed_contigs_per_droplet:
+                 new_outfos[locus].append(ofo)
+             else:
+                 n_ctg_removed[n_contigs] += 1
+                 removed_uids.add(ofo['name'])
+         if sum(n_ctg_removed.values()) > 0:
+             print(' %s --allowed-contigs-per-droplet: removed %d / %d contigs that were in droplets that didn\'t have an allowed number of contigs (%s): %s' % (utils.locstr(locus), sum(n_ctg_removed.values()), len(outfos[locus]), ' '.join(str(n) for n in args.allowed_contigs_per_droplet), ' '.join('%s: %d'%(k, v) for k, v in n_ctg_removed.items())))
+             if args.allowed_meta_keys_values is not None and n_meta_removed > 0:
+                 print(' --allowed-meta-keys-values: %d were removed (and %d were kept) because of the meta info requirements: %s' % (n_meta_removed, n_meta_added, args.allowed_meta_keys_values))
+     outfos = new_outfos
+
+ removed_uids |= set(s['name'] for s in failed_seqs)
+ if len(removed_uids) > 0:
+     start = time.time()
+     n_removed = 0
+     for fid in removed_uids:
+         if fid in paired_uids:
+             del paired_uids[fid]
+             n_removed += 1
+     paired_uids = {uid : sorted(set(paired_uids[uid]) - removed_uids) for uid in paired_uids}
+     print(' removed %d uids from paired_uids (%d failed, %d removed b/c of disallowed N contigs) in %.1fs' % (n_removed, len(failed_seqs), len(removed_uids) - len(failed_seqs), time.time() - start))
+
+ if args.debug and len(paired_uids) > 0:
+     print_pairing_info(outfos, paired_uids)
+
+ print('writing to %s/' % args.outdir)
+ if len(failed_seqs) > 0:
+     write_locus_file('failed', failed_seqs)
+
+ for locus in outfos: # first write the single files with all seqs for each locus
+     write_locus_file(locus, outfos[locus])
+
+ omfname = '%s/meta.yaml' % args.outdir
+ if args.guess_pairing_info:
+     utils.jsdump(omfname, guessed_metafos) # NOTE file name duplicates code in bin/partis
+ elif args.input_metafname is not None and not os.path.exists(omfname):
+     utils.makelink(os.path.dirname(omfname), args.input_metafname, omfname)
+
+ if len(paired_uids) == 0:
+     print('no pairing info')
+ else:
+     print('writing to paired subdirs')
+     for lpair in utils.locus_pairs[args.ig_or_tr]:
+         print(' %s:' % '+'.join(lpair))
+         for l_other, ltmp in [lpair, reversed(lpair)]:
+             all_paired_uids = set(pid for s in outfos[ltmp] if s['name'] in paired_uids for pid in paired_uids[s['name']]) # all uids that are paired with any <ltmp> uid (not necessarily *correctly* paired, at this stage it likely just means they're in the same droplet)
+             other_outfo = [sfo for sfo in outfos[l_other] if sfo['name'] in all_paired_uids] # any <l_other> locus seq that was in a droplet with an <ltmp> uid
+             write_locus_file(l_other, other_outfo, lpair=lpair, extra_str=' ', totstr=' / %s'%len(outfos[l_other]))
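
And a sketch of walking the directory the script just wrote: per-locus fastas (plus failed.fa), paired subdirs like igh+igk/, and a meta.yaml of pairing info when --guess-pairing-info is set. The layout is inferred from write_locus_file()/paircluster.paired_fn() above, so treat the exact paths as assumptions; reading meta.yaml with the json module assumes utils.jsdump writes json-compatible yaml.

import json
import os

outdir = 'my-sample'  # hypothetical --outdir
for locus in ['igh', 'igk', 'igl', 'failed']:
    fn = '%s/%s.fa' % (outdir, locus)  # per-locus fasta (path inferred from paircluster.paired_fn)
    if os.path.exists(fn):
        print('%s: %d seqs' % (locus, sum(1 for l in open(fn) if l.startswith('>'))))

mfn = '%s/meta.yaml' % outdir  # written by utils.jsdump when guessing pairing info
if os.path.exists(mfn):
    with open(mfn) as mfile:
        metafos = json.load(mfile)  # assumption: json-compatible yaml
    n_paired = sum(1 for mfo in metafos.values() if len(mfo.get('paired-uids', [])) > 0)
    print('pairing info for %d uids (%d with paired uids)' % (len(metafos), n_paired))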