partis-bcr 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- bin/FastTree +0 -0
- bin/add-chimeras.py +59 -0
- bin/add-seqs-to-outputs.py +81 -0
- bin/bcr-phylo-run.py +799 -0
- bin/build.sh +24 -0
- bin/cf-alleles.py +97 -0
- bin/cf-germlines.py +57 -0
- bin/cf-linearham.py +199 -0
- bin/chimera-plot.py +76 -0
- bin/choose-partially-paired.py +143 -0
- bin/circle-plots.py +30 -0
- bin/compare-plotdirs.py +298 -0
- bin/diff-parameters.py +133 -0
- bin/docker-hub-push.sh +6 -0
- bin/extract-pairing-info.py +55 -0
- bin/gcdyn-simu-run.py +223 -0
- bin/gctree-run.py +244 -0
- bin/get-naive-probabilities.py +126 -0
- bin/iqtree-1.6.12 +0 -0
- bin/lonr.r +1020 -0
- bin/makeHtml +52 -0
- bin/mds-run.py +46 -0
- bin/parse-output.py +277 -0
- bin/partis +1869 -0
- bin/partis-pip +116 -0
- bin/partis.py +1869 -0
- bin/plot-gl-set-trees.py +519 -0
- bin/plot-hmms.py +151 -0
- bin/plot-lb-tree.py +427 -0
- bin/raxml-ng +0 -0
- bin/read-bcr-phylo-trees.py +38 -0
- bin/read-gctree-output.py +166 -0
- bin/run-chimeras.sh +64 -0
- bin/run-dtr-scan.sh +25 -0
- bin/run-paired-loci.sh +100 -0
- bin/run-tree-metrics.sh +88 -0
- bin/smetric-run.py +62 -0
- bin/split-loci.py +317 -0
- bin/swarm-2.1.13-linux-x86_64 +0 -0
- bin/test-germline-inference.py +425 -0
- bin/tree-perf-run.py +194 -0
- bin/vsearch-2.4.3-linux-x86_64 +0 -0
- bin/vsearch-2.4.3-macos-x86_64 +0 -0
- bin/xvfb-run +194 -0
- partis_bcr-1.0.1.data/scripts/cf-alleles.py +97 -0
- partis_bcr-1.0.1.data/scripts/cf-germlines.py +57 -0
- partis_bcr-1.0.1.data/scripts/extract-pairing-info.py +55 -0
- partis_bcr-1.0.1.data/scripts/gctree-run.py +244 -0
- partis_bcr-1.0.1.data/scripts/parse-output.py +277 -0
- partis_bcr-1.0.1.data/scripts/split-loci.py +317 -0
- partis_bcr-1.0.1.data/scripts/test.py +1005 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/METADATA +1 -1
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/RECORD +101 -50
- partis_bcr-1.0.1.dist-info/top_level.txt +1 -0
- {partis → python}/glutils.py +1 -1
- python/main.py +30 -0
- {partis → python}/plotting.py +10 -1
- {partis → python}/treeutils.py +18 -16
- {partis → python}/utils.py +14 -7
- partis/main.py +0 -59
- partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/WHEEL +0 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/entry_points.txt +0 -0
- {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.1.dist-info}/licenses/COPYING +0 -0
- {partis → python}/__init__.py +0 -0
- {partis → python}/alleleclusterer.py +0 -0
- {partis → python}/allelefinder.py +0 -0
- {partis → python}/alleleremover.py +0 -0
- {partis → python}/annotationclustering.py +0 -0
- {partis → python}/baseutils.py +0 -0
- {partis → python}/cache/__init__.py +0 -0
- {partis → python}/cache/cached_uncertainties.py +0 -0
- {partis → python}/clusterpath.py +0 -0
- {partis → python}/coar.py +0 -0
- {partis → python}/corrcounter.py +0 -0
- {partis → python}/datautils.py +0 -0
- {partis → python}/event.py +0 -0
- {partis → python}/fraction_uncertainty.py +0 -0
- {partis → python}/gex.py +0 -0
- {partis → python}/glomerator.py +0 -0
- {partis → python}/hist.py +0 -0
- {partis → python}/hmmwriter.py +0 -0
- {partis → python}/hutils.py +0 -0
- {partis → python}/indelutils.py +0 -0
- {partis → python}/lbplotting.py +0 -0
- {partis → python}/mds.py +0 -0
- {partis → python}/mutefreqer.py +0 -0
- {partis → python}/paircluster.py +0 -0
- {partis → python}/parametercounter.py +0 -0
- {partis → python}/paramutils.py +0 -0
- {partis → python}/partitiondriver.py +0 -0
- {partis → python}/partitionplotter.py +0 -0
- {partis → python}/performanceplotter.py +0 -0
- {partis → python}/plotconfig.py +0 -0
- {partis → python}/processargs.py +0 -0
- {partis → python}/prutils.py +0 -0
- {partis → python}/recombinator.py +0 -0
- {partis → python}/scanplot.py +0 -0
- {partis → python}/seqfileopener.py +0 -0
- {partis → python}/treegenerator.py +0 -0
- {partis → python}/viterbicluster.py +0 -0
- {partis → python}/vrc01.py +0 -0
- {partis → python}/waterer.py +0 -0
bin/makeHtml
ADDED
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+basedir=$1
+if ! [ $basedir ]; then echo "makeHtml: no dir given..."; exit 1; fi
+if ! [ -d $basedir ]; then echo "makeHtml: base dir \"$basedir\" does not exist..."; exit 1; fi
+
+row_length=$2 # number of plots per row
+if ! [ $row_length ]; then row_length=4; fi
+
+title=$3
+if ! [ $title ]; then title=null; fi
+
+ext=$4
+if ! [ $ext ]; then ext=png; fi
+
+ls $basedir/plots/*.$ext &>/dev/null
+if ! [ $? -eq 0 ]; then
+    echo "no .$ext files in $basedir"
+    exit 0
+fi
+
+htmlfile=$basedir/plots.html
+
+cat > $htmlfile <<EOF
+<!DOCTYPE html
+PUBLIC "-//W3C//DTD HTML 3.2//EN">
+<html>
+<head><title>$title</title></head>
+<body bgcolor="000000">
+<h3 style="text-align:left; color:DD6600;">$title</h3>
+
+<table border="0" cellspacing="5" width="100%">
+<tr>
+EOF
+
+iplot=0
+for plotfile in `ls $basedir/plots/*.$ext | sort`; do
+    (( iplot ++ ))
+    file=`basename $plotfile`
+    echo '<td width="25%"><a target="_blank" href="plots/'$file'"><img src="plots/'$file'" alt="plots/'$file'" width="100%"></a></td>"' >> $htmlfile
+    if (( (iplot % row_length)==0 )); then
+        echo '</tr>' >> $htmlfile
+        echo '<tr>' >> $htmlfile
+    fi
+done
+
+cat >> $htmlfile <<EOF
+</tr>
+</table>
+</body>
+</html>
+EOF
bin/mds-run.py
ADDED
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+from __future__ import absolute_import, division, unicode_literals
+import sys
+import argparse
+import random
+import colored_traceback.always
+import os
+
+# if you move this script, you'll need to change this method of getting the imports
+from pathlib import Path
+partis_dir = str(Path(__file__).parent.parent)
+sys.path.insert(1, partis_dir) # + '/python')
+
+import python.utils as utils
+import python.mds as mds
+
+# ----------------------------------------------------------------------------------------
+parser = argparse.ArgumentParser()
+parser.add_argument('infname')
+parser.add_argument('--n-clusters', type=int) # if not set, it just runs mds (i.e. without k-means clustering)
+parser.add_argument('--n-components', type=int, default=2)
+parser.add_argument('--plotdir')
+parser.add_argument('--plotname')
+parser.add_argument('--title')
+parser.add_argument('--leg-title')
+parser.add_argument('--queries-to-include')
+parser.add_argument('--workdir', default='/tmp/dralph/mds/' + str(random.randint(0, 999999)))
+parser.add_argument('--seed', type=int, default=1)
+parser.add_argument('--aligned', action='store_true')
+args = parser.parse_args()
+args.queries_to_include = utils.get_arg_list(args.queries_to_include, key_val_pairs=True)
+if args.title is not None:
+    args.title = args.title.replace('@', ' ') # this is kind of hackey
+
+seqfos = utils.read_fastx(args.infname)
+color_scale_vals = {}
+for sfo in seqfos:
+    if len(sfo['infostrs']) == 2:
+        color_scale_vals[sfo['name']] = int(sfo['infostrs'][1])
+if len(color_scale_vals) == 0:
+    color_scale_vals = None
+
+# mds.run_sklearn_mds(args.n_components, args.n_clusters, seqfos, args.seed, plotdir=args.plotdir)
+mds.run_bios2mds(args.n_components, args.n_clusters, seqfos, args.workdir, args.seed,
+                 aligned=args.aligned, plotdir=args.plotdir, plotname=args.plotname,
+                 queries_to_include=args.queries_to_include, color_scale_vals=color_scale_vals, title=args.title, leg_title=args.leg_title)
bin/parse-output.py
ADDED
@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+from __future__ import absolute_import, division, unicode_literals
+from __future__ import print_function
+import sys
+import csv
+from io import open
+csv.field_size_limit(sys.maxsize) # make sure we can write very large csv fields
+import os
+import argparse
+import colored_traceback.always
+from pathlib import Path
+
+# if you move this script, you'll need to change this method of getting the imports
+partis_dir = str(Path(__file__).parent.parent)
+sys.path.insert(1, partis_dir) # + '/python')
+
+import python.utils as utils
+import python.glutils as glutils
+from python.clusterpath import ClusterPath
+import python.paircluster as paircluster
+
+# ----------------------------------------------------------------------------------------
+def count_plot(tglfo, tlist, plotdir, paired_loci=None):
+    if len(tlist) == 0:
+        return
+    if args.plot_tree_mut_stats:
+        import python.plotting as plotting
+        plotting.plot_tree_mut_stats(plotdir, tlist, args.is_simu, only_leaves=args.only_plot_leaves, treefname=args.treefname)
+        plotting.make_html(plotdir)
+        return
+    if args.only_count_correlations:
+        from python.corrcounter import CorrCounter
+        ccounter = CorrCounter(paired_loci=paired_loci)
+        for line in tlist:
+            l_info = None
+            if paired_loci is not None:
+                line, l_info = line
+            ccounter.increment(line, l_info=l_info)
+        ccounter.plot(plotdir + '/correlations', only_csv=args.only_csv_plots, debug=args.debug)
+        return
+    if args.simfname is not None:
+        simglfo, true_antn_list, _ = utils.read_output(args.simfname)
+        true_antn_dict = {}
+        for true_line in true_antn_list:
+            for iseq, uid in enumerate(true_line['unique_ids']):
+                true_antn_dict[uid] = utils.synthesize_single_seq_line(true_line, iseq)
+        # true_antn_dict = utils.get_annotation_dict(true_antn_list)
+        from python.performanceplotter import PerformancePlotter
+        perfplotter = PerformancePlotter('hmm')
+        n_failed = 0
+        for line in tlist:
+            if line['invalid']:
+                n_failed += 1
+                continue
+            for iseq, uid in enumerate(line['unique_ids']): # NOTE this counts rearrangement-level parameters once for every mature sequence, which is inconsistent with the pcounters... but I think might make more sense here?
+                _ = perfplotter.evaluate(true_antn_dict[uid], utils.synthesize_single_seq_line(line, iseq), simglfo=simglfo)
+        perfplotter.plot(args.plotdir, only_csv=args.only_csv_plots)
+        if n_failed > 0:
+            print(' %s %d / %d failed queries' % (utils.color('yellow', 'warning'), n_failed, len([u for l in tlist for u in l['unique_ids']])))
+        if args.only_plot_performance:
+            return
+    assert not args.paired # only handled for correlation counting atm
+    from python.parametercounter import ParameterCounter
+    setattr(args, 'region_end_exclusions', {r : [0 for e in ['5p', '3p']] for r in utils.regions}) # hackity hackity hackity
+    pcounter = ParameterCounter(tglfo, args) # NOTE doesn't count correlations by default
+    for line in tlist:
+        pcounter.increment(line)
+    pcounter.plot(plotdir, only_csv=args.only_csv_plots, only_overall=args.only_overall_plots) #, make_per_base_plots=True) , make_per_base_plots=True
+
+# ----------------------------------------------------------------------------------------
+helpstr = """
+Extract sequences from a partis output file and write them to a fasta, csv, or tsv file, optionally with a limited amount of extra information for each sequence.
+For details of partis output files, see the manual.
+To view the partitions and annotations in a partis output file, use the partis \'view-output\' action.
+Example usage:
+bin/parse-output.py test/reference-results/partition-new-simu.yaml out.fa
+bin/parse-output.py test/paired/ref-results/partition-new-simu outdir --paired
+"""
+class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
+    pass
+formatter_class = MultiplyInheritedFormatter
+parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter, description=helpstr)
+parser.add_argument('infile', help='partis output file from which to read input')
+parser.add_argument('outfile', help='File to which to write output extracted from <infile> (fasta or csv/tsv). If --paired is set, this must be a directory, to which will be written a fasta with all sequences, a yaml with pairing info, and a csv with h/l sequence pairs.')
+parser.add_argument('--paired', action='store_true', help='if set, <infile> should be a paired output dir, rather than a single file')
+parser.add_argument('--extra-columns', help='colon-separated list of additional partis output columns (beyond sequences), to write to the output file. If writing to a fasta file, the column values are appended after the sequence name, separated by --fasta-info-separator. If writing to csv/tsv, they\'re written as proper, labeled columns.')
+parser.add_argument('--partition-index', type=int, help='if set, use the partition at this index in the cluster path, rather than the default of using the best partition')
+parser.add_argument('--seed-unique-id', help='if set, take sequences only from the cluster containing this seed sequence, rather than the default of taking all sequences from all clusters')
+parser.add_argument('--cluster-index', type=int, help='if set, take sequences only from the cluster at this index in the partition, rather than the default of taking all sequences from all clusters. This index is with respect to the cluster order found in the file (which, in contrast to plots made by --plotdir, is *not* sorted by size)')
+parser.add_argument('--sort-by-size', action='store_true', help='if set, sort clusters in partition by decreasing size before applying --cluster-index')
+parser.add_argument('--indel-reversed-seqs', action='store_true', help='if set, take sequences that have had any shm indels "reversed" (i.e. insertions are reversed, and deletions are replaced with the germline bases) rather than the default of using sequences from the original input file. Indel-reversed sequences can be convenient because they are by definition the same length as and aligned to the naive sequence.')
+parser.add_argument('--glfo-dir', help='Directory with germline info. Only necessary for old-style csv output files. Equivalent to a parameter dir with \'/hmm/germline-sets\' appended.')
+parser.add_argument('--template-glfo-dir', help='use this glfo dir as a template when reading --glfo-dir (only used for airr input atm)')
+parser.add_argument('--locus', default='igh', help='only used for old-style csv output files')
+parser.add_argument('--plotdir', help='if set, plot annotation parameters from infile to --plotdir and exit (you still have to set outfile, sorry, since it\'s nice having it be a positional arg, but it doesn\'t get used for this). To add e.g. per-gene-per-position plots comment/uncomment args in the call below.')
+parser.add_argument('--only-count-correlations', action='store_true', help='')
+parser.add_argument('--only-plot-performance', action='store_true', help='')
+parser.add_argument('--fasta-info-separator', default=' ', help='character to use ')
+parser.add_argument('--debug', type=int, default=0)
+parser.add_argument('--airr-input', action='store_true', help='read input in AIRR tsv format, and if output file suffix is .yaml write partis output.')
+parser.add_argument('--airr-output', action='store_true', help='write output in AIRR tsv format')
+parser.add_argument('--skip-other-locus', action='store_true', help='if --airr-output is set, this tells us to skip lines from the other locus')
+parser.add_argument('--skip-columns', help='don\'t write these columns to output (atm only implemented for airr output, since we need to remove the clone_id column so scoper doesn\'t crash)')
+parser.add_argument('--simfname', help='simulation file corresponding to input file (i.e. presumably <infile> is inference that was performed on --simfname')
+parser.add_argument('--only-csv-plots', action='store_true', help='only write csv versions of plots (not svg), which is a lot faster')
+parser.add_argument('--only-make-plots', action='store_true', help='if --plotdir is set, set this to only do plotting, i.e. don\'t do the usual/default file reading/conversion')
+parser.add_argument('--plot-tree-mut-stats', action='store_true', help='plot tree mutation stats and exit')
+parser.add_argument('--only-plot-leaves', action='store_true', help='only affects --plot-tree-mut-stats')
+parser.add_argument('--is-simu', action='store_true', help='only affects --plot-tree-mut-stats')
+parser.add_argument('--only-overall-plots', action='store_true', help='TODO')
+parser.add_argument('--treefname', help='only affects --plot-tree-mut-stats')
+parser.add_argument('--meta-info-key-to-color', help='see partis help')
+parser.add_argument('--meta-emph-formats', help='see partis help')
+parser.add_argument('--meta-info-to-emphasize', help='see partis help')
+
+if 'extract-fasta.py' in sys.argv[0]: # if they're trying to run this old script, which is now just a link to this one, print a warning and rejigger the arguments so it still works
+    print(' note: running deprecated script %s, which currently is just a link pointing to %s' % (os.path.basename(sys.argv[0]), os.path.basename(os.path.realpath( __file__))))
+    print(' note: transferring deprecated arguments --input-file and --fasta-output-file to the first two positional arguments (this will continue to work, you only need to change things if you want this warning to go away)')
+    utils.insert_in_arglist(sys.argv, [utils.get_val_from_arglist(sys.argv, '--input-file'), utils.get_val_from_arglist(sys.argv, '--fasta-output-file')], sys.argv[0])
+    utils.remove_from_arglist(sys.argv, '--input-file', has_arg=True)
+    utils.remove_from_arglist(sys.argv, '--fasta-output-file', has_arg=True)
+
+args = parser.parse_args()
+args.extra_columns = utils.get_arg_list(args.extra_columns)
+args.meta_emph_formats = utils.get_arg_list(args.meta_emph_formats, key_val_pairs=True)
+utils.meta_emph_arg_process(args)
+if args.paired:
+    if utils.getsuffix(args.outfile) != '':
+        raise Exception('--outfile \'%s\' must be a directory, but it has a non-empty suffix \'%s\'' % (args.outfile, utils.getsuffix(args.outfile)))
+else:
+    assert utils.getsuffix(args.outfile) in ['.csv', '.tsv', '.fa', '.fasta', '.yaml'] # or args.airr_input and utils.getsuffix(args.outfile) == '.yaml'
+
+default_glfo_dir = partis_dir + '/data/germlines/human'
+if utils.getsuffix(args.infile) in ['.csv', '.tsv'] and args.glfo_dir is None:
+    print(' note: reading csv/tsv format without germline info, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn\'t crash, it\'s probably ok.' % default_glfo_dir)
+    args.glfo_dir = default_glfo_dir
+
+# ----------------------------------------------------------------------------------------
+# read input
+if args.paired:
+    if not os.path.isdir(args.infile):
+        raise Exception('--infile \'%s\' either doesn\'t exist or it isn\'t a directory' % args.infile)
+    lp_infos = paircluster.read_paired_dir(args.infile)
+else:
+    if args.airr_input:
+        glfo, glfd = None, args.glfo_dir
+        if args.template_glfo_dir is not None: # NOTE only handled for airr input at the moment, cause that's what i need it for right now
+            glfo = glutils.read_glfo(args.glfo_dir, args.locus, template_glfo=glutils.read_glfo(args.template_glfo_dir, args.locus))
+            # glutils.write_glfo(args.glfo_dir + '-parsed', glfo, debug=True)
+            glfd = None
+        glfo, annotation_list, cpath = utils.read_airr_output(args.infile, locus=args.locus, glfo=glfo, glfo_dir=glfd, skip_other_locus=args.skip_other_locus)
+    else:
+        glfo, annotation_list, cpath = utils.read_output(args.infile, glfo_dir=args.glfo_dir, locus=args.locus)
+
+# plot
+if args.plotdir is not None:
+    if args.paired:
+        for lpair in utils.locus_pairs['ig']:
+            if lp_infos[tuple(lpair)]['glfos'] is None:
+                continue
+            for ltmp in lpair:
+                count_plot(lp_infos[tuple(lpair)]['glfos'][ltmp], lp_infos[tuple(lpair)]['antn_lists'][ltmp], '%s/%s/%s'%(args.plotdir, '+'.join(lpair), ltmp))
+            antn_pairs = paircluster.find_cluster_pairs(lp_infos, lpair) #, debug=True)
+            count_plot(None, antn_pairs, '%s/%s'%(args.plotdir, '+'.join(lpair)), paired_loci=[l['loci'][0] for l in antn_pairs[0]])
+    else:
+        count_plot(glfo, annotation_list, args.plotdir)
+    if args.only_make_plots:
+        sys.exit(0)
+
+if args.paired:
+    glfos, antn_lists, cpaths = paircluster.concat_heavy_chain(utils.locus_pairs['ig'], lp_infos, dont_deep_copy=True) # NOTE this is a pretty arbitrary way to combine the partitions for the seqs with uncertain pairing info, but whatever
+    outfos, metafos = paircluster.get_combined_outmetafos(antn_lists)
+    paircluster.write_combined_fasta_and_meta('%s/all-seqs.fa'%args.outfile, '%s/meta.yaml'%args.outfile, outfos, metafos)
+    outfos = paircluster.find_seq_pairs(antn_lists)
+    print(' writing sequence id pairs to %s' % '%s/seq-pairs.csv'%args.outfile)
+    with open('%s/seq-pairs.csv'%args.outfile, utils.csv_wmode()) as cfile:
+        okeys = ['%s_%s'%(c, s) for s in ('id', 'locus', 'seq') for c in 'hl']
+        writer = csv.DictWriter(cfile, okeys) # sorted(outfos[0].keys())
+        writer.writeheader()
+        for ofo in outfos:
+            writer.writerow({k : ofo[k] for k in okeys})
+    if args.airr_output:
+        for ltmp in sorted(glfos):
+            utils.write_airr_output('%s/%s.tsv'%(args.outfile, ltmp), antn_lists[ltmp], cpath=cpaths[ltmp], glfo=glfos[ltmp])
+    sys.exit(0)
+
+# restrict to certain partitions/clusters
+if cpath is None or cpath.i_best is None:
+    clusters_to_use = [l['unique_ids'] for l in annotation_list]
+    print(' no cluster path in input file, so just using all %d sequences (in %d clusters) in annotations' % (sum(len(c) for c in clusters_to_use), len(clusters_to_use)))
+else:
+    ipartition = cpath.i_best if args.partition_index is None else args.partition_index
+    print(' found %d clusters with %d seqs in %s' % (len(cpath.partitions[ipartition]), sum(len(c) for c in cpath.partitions[ipartition]), 'best partition' if args.partition_index is None else 'partition at index %d (of %d)' % (ipartition, len(cpath.partitions))))
+    modified = False
+    if args.cluster_index is None:
+        clusters_to_use = cpath.partitions[ipartition]
+        print(' taking all %d clusters' % len(clusters_to_use))
+    else:
+        ptn = cpath.partitions[ipartition]
+        if args.sort_by_size:
+            ptn = sorted(cpath.partitions[ipartition], key=len, reverse=True)
+        clusters_to_use = [ptn[args.cluster_index]]
+        modified = True
+        print(' taking cluster at index %d with size %d%s' % (args.cluster_index, len(clusters_to_use[0]), ' after sorting by size' if args.sort_by_size else ''))
+    if args.seed_unique_id is not None:
+        clusters_to_use = [c for c in clusters_to_use if args.seed_unique_id in c] # NOTE can result in more than one cluster with the seed sequence (e.g. if this file contains intermediate annotations from seed partitioning)
+        modified = True
+        print(' removing clusters not containing sequence \'%s\' (leaving %d)' % (args.seed_unique_id, len(clusters_to_use)))
+    if modified:
+        cpath = ClusterPath(partition=clusters_to_use, seed_unique_id=args.seed_unique_id)
+        antn_dict = utils.get_annotation_dict(annotation_list)
+        annotation_list = [antn_dict[':'.join(c)] for c in clusters_to_use if ':'.join(c) in antn_dict]
+
+if not os.path.exists(os.path.dirname(os.path.abspath(args.outfile))):
+    os.makedirs(os.path.dirname(os.path.abspath(args.outfile)))
+
+if args.airr_output:
+    print(' writing %d annotations%s to %s' % (len(annotation_list), '' if cpath is None else ' (with partition: %d seqs in %d clusters)'%(sum(len(c) for c in cpath.best()), len(cpath.best())), args.outfile))
+    utils.write_airr_output(args.outfile, annotation_list, cpath=cpath, extra_columns=args.extra_columns, skip_columns=args.skip_columns)
+    sys.exit(0)
+
+# condense partis info into <seqfos> for fasta/csv output
+n_skipped, n_failed_to_add = 0, 0
+seqfos = []
+antn_dict = utils.get_annotation_dict(annotation_list)
+for cluster in clusters_to_use:
+    if ':'.join(cluster) not in antn_dict:
+        n_skipped += 1
+        # print ' %s cluster with size %d not in annotations, so skipping it' % (utils.color('yellow', 'warning'), len(cluster))
+        continue
+    cluster_annotation = antn_dict[':'.join(cluster)]
+    newfos = [{'name' : u, 'seq' : s} for u, s in zip(cluster_annotation['unique_ids'], cluster_annotation['seqs' if args.indel_reversed_seqs else 'input_seqs'])]
+    if args.extra_columns is not None:
+        for ecol in args.extra_columns:
+            if ecol not in cluster_annotation:
+                utils.add_extra_column(ecol, cluster_annotation, cluster_annotation, glfo=glfo)
+            if ecol not in cluster_annotation:
+                n_failed_to_add += 1
+                cluster_annotation[ecol] = None
+            for iseq in range(len(newfos)):
+                ival = cluster_annotation[ecol]
+                if ival is not None and ecol in utils.linekeys['per_seq']:
+                    ival = ival[iseq]
+                newfos[iseq][ecol] = ival
+    seqfos += newfos
+if n_skipped > 0:
+    print(' missing annotations for %d sequences' % n_skipped)
+if n_failed_to_add > 0:
+    print(' %s couldn\'t add \'%s\' to %d / %d annotations' % (utils.wrnstr(), ecol, n_failed_to_add, len(clusters_to_use) - n_skipped))
+
+# write output
+with open(args.outfile, utils.csv_wmode()) as ofile:
+    if utils.getsuffix(args.outfile) in ['.csv', '.tsv']:
+        print(' writing %d sequences to %s' % (len(seqfos), args.outfile))
+        writer = csv.DictWriter(ofile, list(seqfos[0].keys()), delimiter=str(',') if utils.getsuffix(args.outfile)=='.csv' else '\t')
+        writer.writeheader()
+        for sfo in seqfos:
+            writer.writerow(sfo)
+    elif utils.getsuffix(args.outfile) in ['.fa', '.fasta']:
+        print(' writing %d sequences to %s' % (len(seqfos), args.outfile))
+        for sfo in seqfos:
+            estr = ''
+            if args.extra_columns is not None:
+                estr = args.fasta_info_separator
+                estr += args.fasta_info_separator.join(str(sfo[c]) for c in args.extra_columns)
+            ofile.write('>%s%s\n%s\n' % (sfo['name'], estr, sfo['seq']))
+    elif utils.getsuffix(args.outfile) == '.yaml':
+        true_partition = None
+        if args.simfname is not None:
+            print(' reading true partition from %s' % args.simfname)
+            _, _, true_cpath = utils.read_output(args.simfname, skip_annotations=True)
+            true_partition = true_cpath.best()
+        plines = cpath.get_partition_lines(true_partition=true_partition, calc_missing_values='none' if true_partition is None else 'best')
+        print(' writing %d annotations with %d partition%s to %s' % (len(annotation_list), len(plines), utils.plural(len(plines)), args.outfile))
+        utils.write_annotations(args.outfile, glfo, annotation_list, utils.add_lists(utils.annotation_headers, args.extra_columns), partition_lines=plines)
+    else:
+        assert False