partis-bcr 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. bin/FastTree +0 -0
  2. bin/add-chimeras.py +59 -0
  3. bin/add-seqs-to-outputs.py +81 -0
  4. bin/bcr-phylo-run.py +799 -0
  5. bin/build.sh +24 -0
  6. bin/cf-alleles.py +97 -0
  7. bin/cf-germlines.py +57 -0
  8. bin/cf-linearham.py +199 -0
  9. bin/chimera-plot.py +76 -0
  10. bin/choose-partially-paired.py +143 -0
  11. bin/circle-plots.py +30 -0
  12. bin/compare-plotdirs.py +298 -0
  13. bin/diff-parameters.py +133 -0
  14. bin/docker-hub-push.sh +6 -0
  15. bin/extract-pairing-info.py +55 -0
  16. bin/gcdyn-simu-run.py +223 -0
  17. bin/gctree-run.py +244 -0
  18. bin/get-naive-probabilities.py +126 -0
  19. bin/iqtree-1.6.12 +0 -0
  20. bin/lonr.r +1020 -0
  21. bin/makeHtml +52 -0
  22. bin/mds-run.py +46 -0
  23. bin/parse-output.py +277 -0
  24. bin/partis +1869 -0
  25. bin/partis-pip +116 -0
  26. bin/partis.py +1869 -0
  27. bin/plot-gl-set-trees.py +519 -0
  28. bin/plot-hmms.py +151 -0
  29. bin/plot-lb-tree.py +427 -0
  30. bin/raxml-ng +0 -0
  31. bin/read-bcr-phylo-trees.py +38 -0
  32. bin/read-gctree-output.py +166 -0
  33. bin/run-chimeras.sh +64 -0
  34. bin/run-dtr-scan.sh +25 -0
  35. bin/run-paired-loci.sh +100 -0
  36. bin/run-tree-metrics.sh +88 -0
  37. bin/smetric-run.py +62 -0
  38. bin/split-loci.py +317 -0
  39. bin/swarm-2.1.13-linux-x86_64 +0 -0
  40. bin/test-germline-inference.py +425 -0
  41. bin/tree-perf-run.py +194 -0
  42. bin/vsearch-2.4.3-linux-x86_64 +0 -0
  43. bin/vsearch-2.4.3-macos-x86_64 +0 -0
  44. bin/xvfb-run +194 -0
  45. partis_bcr-1.0.2.data/scripts/cf-alleles.py +97 -0
  46. partis_bcr-1.0.2.data/scripts/cf-germlines.py +57 -0
  47. partis_bcr-1.0.2.data/scripts/extract-pairing-info.py +55 -0
  48. partis_bcr-1.0.2.data/scripts/gctree-run.py +244 -0
  49. partis_bcr-1.0.2.data/scripts/parse-output.py +277 -0
  50. partis_bcr-1.0.2.data/scripts/split-loci.py +317 -0
  51. partis_bcr-1.0.2.data/scripts/test.py +1005 -0
  52. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/METADATA +1 -1
  53. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/RECORD +101 -51
  54. partis_bcr-1.0.2.dist-info/top_level.txt +1 -0
  55. {partis → python}/glutils.py +1 -1
  56. python/main.py +30 -0
  57. {partis → python}/plotting.py +10 -1
  58. {partis → python}/treeutils.py +18 -16
  59. {partis → python}/utils.py +14 -7
  60. packages/ham/bcrham +0 -0
  61. partis/main.py +0 -59
  62. partis_bcr-1.0.0.dist-info/top_level.txt +0 -1
  63. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/WHEEL +0 -0
  64. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/entry_points.txt +0 -0
  65. {partis_bcr-1.0.0.dist-info → partis_bcr-1.0.2.dist-info}/licenses/COPYING +0 -0
  66. {partis → python}/__init__.py +0 -0
  67. {partis → python}/alleleclusterer.py +0 -0
  68. {partis → python}/allelefinder.py +0 -0
  69. {partis → python}/alleleremover.py +0 -0
  70. {partis → python}/annotationclustering.py +0 -0
  71. {partis → python}/baseutils.py +0 -0
  72. {partis → python}/cache/__init__.py +0 -0
  73. {partis → python}/cache/cached_uncertainties.py +0 -0
  74. {partis → python}/clusterpath.py +0 -0
  75. {partis → python}/coar.py +0 -0
  76. {partis → python}/corrcounter.py +0 -0
  77. {partis → python}/datautils.py +0 -0
  78. {partis → python}/event.py +0 -0
  79. {partis → python}/fraction_uncertainty.py +0 -0
  80. {partis → python}/gex.py +0 -0
  81. {partis → python}/glomerator.py +0 -0
  82. {partis → python}/hist.py +0 -0
  83. {partis → python}/hmmwriter.py +0 -0
  84. {partis → python}/hutils.py +0 -0
  85. {partis → python}/indelutils.py +0 -0
  86. {partis → python}/lbplotting.py +0 -0
  87. {partis → python}/mds.py +0 -0
  88. {partis → python}/mutefreqer.py +0 -0
  89. {partis → python}/paircluster.py +0 -0
  90. {partis → python}/parametercounter.py +0 -0
  91. {partis → python}/paramutils.py +0 -0
  92. {partis → python}/partitiondriver.py +0 -0
  93. {partis → python}/partitionplotter.py +0 -0
  94. {partis → python}/performanceplotter.py +0 -0
  95. {partis → python}/plotconfig.py +0 -0
  96. {partis → python}/processargs.py +0 -0
  97. {partis → python}/prutils.py +0 -0
  98. {partis → python}/recombinator.py +0 -0
  99. {partis → python}/scanplot.py +0 -0
  100. {partis → python}/seqfileopener.py +0 -0
  101. {partis → python}/treegenerator.py +0 -0
  102. {partis → python}/viterbicluster.py +0 -0
  103. {partis → python}/vrc01.py +0 -0
  104. {partis → python}/waterer.py +0 -0
bin/FastTree ADDED
Binary file
bin/add-chimeras.py ADDED
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import argparse
5
+ import collections
6
+ import numpy
7
+ import random
8
+ import sys
9
+ import os
10
+ import csv
11
+ from io import open
12
+
13
+ from pathlib import Path
14
+ partis_dir = str(Path(__file__).parent.parent)
15
+ if not os.path.exists(partis_dir):
16
+ print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % partis_dir)
17
+ sys.path.insert(1, partis_dir) # + '/python')
18
+ import python.utils as utils
19
+ import python.seqfileopener as seqfileopener
20
+
def pick_break_point(seq_len, min_chunk_len):
    """Choose a random position at which to switch from the template to the original sequence.

    Returns an int in [min_chunk_len, seq_len - min_chunk_len] (inclusive on both ends), so that
    both pieces of the chimera are at least <min_chunk_len> long, or None if <seq_len> is too
    short for that to be possible.
    """
    if seq_len < 2 * min_chunk_len:  # random.randint() raises ValueError if handed an empty range
        return None
    return random.randint(min_chunk_len, seq_len - min_chunk_len)


def main():
    """Read sequences from infile, make a fraction --chimera-freq of them chimeric, and write them to outfile.

    Each chimera consists of the start of a randomly chosen input sequence (up to the break point)
    followed by the remainder of the original sequence. Only the chimeric sequences are written
    (in fasta format); sequences that aren't chosen to be chimeric are left out of the output.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('infile')
    parser.add_argument('outfile')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--chimera-freq', default=1., type=float, help='fraction of sequences to make chimeric')
    parser.add_argument('--min-chunk-len', default=15, type=int, help='require that each bit of the chimera is at least this long')
    args = parser.parse_args()

    input_info, _, _ = seqfileopener.read_sequence_file(args.infile, is_data=False)
    if len(input_info) < 50:
        print('%s making chimeras with only %d sequences, and since we choose from among the existing sequence for templates this won\'t be very effective' % (utils.color('yellow', 'warning'), len(input_info)))

    # numpy.random.choice() needs a 1-d sequence (it can't handle a dict), so make a list of uids once and then draw indices into it
    all_uids = list(input_info)

    n_chimeric = 0
    outfo = collections.OrderedDict()
    for uid, seqfo in input_info.items():
        if args.debug:
            print(uid)

        if numpy.random.uniform(0, 1) > args.chimera_freq:  # no chimeras for this sequence
            if args.debug:
                print(' non-chimeric')
            continue

        seq = seqfo['seqs'][0]
        break_point = pick_break_point(len(seq), args.min_chunk_len)
        if break_point is None:  # can't get two chunks of the required length out of this sequence
            print('%s sequence %s (length %d) is too short to split into two chunks of at least %d, so skipping it' % (utils.color('yellow', 'warning'), uid, len(seq), args.min_chunk_len))
            continue

        # note that the template may be the sequence itself, and may be shorter than <break_point>
        switch_uid = all_uids[numpy.random.randint(len(all_uids))]
        switch_seq = input_info[switch_uid]['seqs'][0][:break_point]

        if args.debug:
            print(' switching to %s at %d:' % (switch_uid, break_point))
            print(' %s' % switch_seq)
            print(' %s%s' % (' ' * len(switch_seq), seq[break_point:]))

        outfo[uid] = switch_seq + seq[break_point:]
        n_chimeric += 1

    print('writing %d / %d chimeric sequences to %s' % (n_chimeric, len(input_info), args.outfile))
    with open(args.outfile, 'w') as outfile:
        for uid, seq in outfo.items():
            outfile.write('>%s\n%s\n' % (uid, seq))


if __name__ == '__main__':
    main()
bin/add-seqs-to-outputs.py ADDED
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import absolute_import, division, unicode_literals
3
+ from __future__ import print_function
4
+ import csv
5
+ import os
6
+ import sys
7
+ csv.field_size_limit(sys.maxsize) # make sure we can write very large csv fields
8
+ import argparse
9
+ import operator
10
+ import colored_traceback.always
11
+ import collections
12
+
13
+ # if you move this script, you'll need to change this method of getting the imports
14
+ from pathlib import Path
15
+ partis_dir = str(Path(__file__).parent.parent)
16
+ sys.path.insert(1, partis_dir) # + '/python')
17
+
18
+ import python.utils as utils
19
+ import python.glutils as glutils
20
+ from python.clusterpath import ClusterPath
21
+
dstr = """
Add seqs from the fasta file --new-seq-file to an annotation from --partis-output-file.
Looks for a cluster in the best partition (or the one specified by --partition-index) that has sequences in common with the fasta file (if there's more than one such cluster, prints a warning and uses the one with the largest overlap).
Writes a single modified annotation to --outfile.
"""


def find_overlapping_clusters(partition, new_uids):
    """Return the clusters in <partition> that share at least one uid with the set <new_uids>.

    The return value is a list of (cluster, overlap_uids) pairs, sorted by decreasing size of the
    overlap (clusters with equal overlap stay in their original order). It is empty if no
    cluster has any overlap.
    """
    overlaps = []
    for cluster in partition:
        overlap_uids = set(cluster) & new_uids
        if overlap_uids:
            overlaps.append((cluster, overlap_uids))
    overlaps.sort(key=lambda pair: len(pair[1]), reverse=True)
    return overlaps


def main():
    """Add the sequences from --new-seq-file to the overlapping cluster's annotation, and write the result to --outfile.

    Raises an Exception if no cluster in the chosen partition has any sequence in common with the fasta file.
    """
    # NOTE argparse only appends the default value to arguments that have a help string
    parser = argparse.ArgumentParser(description=dstr,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--new-seq-file', required=True, help='fasta input file with seqs to be added to annotations + partitions in partis output yaml')
    parser.add_argument('--partis-output-file', required=True, help='partis output file to which to add the seqs from --new-seq-file')
    parser.add_argument('--partition-index', type=int, help='index of partition from which to take the clusters/annotations (if not set, uses the best partition)')
    parser.add_argument('--glfo-dir', default=partis_dir + '/data/germlines/human', help='germline info directory. Only used if --partis-output-file is an old-style .csv, and this default dir may work if your output file doesn\'t have novel inferred genes. Otherwise, is the germline info dir from the partis inferred parameter directory corresponding to your output file --partis-output-file.')
    parser.add_argument('--locus', default='igh')
    parser.add_argument('--outfile', required=True, help='output partis yaml file')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--n-test-subset-seqs', type=int, help='take only the first N seqs from both the fasta file and the annotation in the partis output file (e.g. for testing when the family is huge)')
    args = parser.parse_args()

    new_seqfos = utils.read_fastx(args.new_seq_file, sanitize_seqs=True)
    print(' read %d seqs from %s' % (len(new_seqfos), args.new_seq_file))

    glfo = None
    if utils.getsuffix(args.partis_output_file) == '.csv':
        print(' reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir)
        glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)

    glfo, annotation_list, cpath = utils.read_output(args.partis_output_file, glfo=glfo, locus=args.locus)
    if args.partition_index is not None:
        print(' using non-best partition index %d (best is %d)' % (args.partition_index, cpath.i_best))
    partition = cpath.partitions[cpath.i_best if args.partition_index is None else args.partition_index]
    print(' read partition with %d clusters from %s' % (len(partition), args.partis_output_file))

    new_uids = set(sfo['name'] for sfo in new_seqfos)
    clusters_with_overlap = find_overlapping_clusters(partition, new_uids)
    if not clusters_with_overlap:
        raise Exception('no clusters in partition have any overlap with sequences from fasta file')
    if len(clusters_with_overlap) > 1:  # this used to be an error, but now we just warn and use the cluster with the largest overlap
        ostrs = ['%d %d'%(len(c), len(o)) for c, o in clusters_with_overlap]
        print(' %s more than one cluster overlaps with sequences from fasta file, just taking first one (size overlap): %s, %s' % (utils.color('yellow', 'warning'), utils.color('red', ostrs[0]), ', '.join(ostrs[1:])))
    old_cluster = clusters_with_overlap[0][0]
    old_uids = set(old_cluster)  # build the set once, for fast membership tests below

    print(' adding %d fasta sequences to cluster of size %d (%d fasta sequences were already in cluster)' % (len(new_uids - old_uids), len(old_cluster), len(new_uids & old_uids)))
    sfos_to_add = [sfo for sfo in new_seqfos if sfo['name'] not in old_uids]
    annotation_dict = utils.get_annotation_dict(annotation_list)
    annotation = annotation_dict[':'.join(old_cluster)]

    if args.n_test_subset_seqs is not None:
        print(' taking only first %d seqs from fasta and annotation' % args.n_test_subset_seqs)
        utils.restrict_to_iseqs(annotation, list(range(args.n_test_subset_seqs)), glfo)
        sfos_to_add = sfos_to_add[:args.n_test_subset_seqs]
    utils.add_seqs_to_line(annotation, sfos_to_add, glfo, debug=args.debug)

    output_headers = list(set(annotation_list[0].keys()) | set(utils.annotation_headers))  # try to pick up any extra headers that were written to the file
    utils.write_annotations(args.outfile, glfo, [annotation], output_headers)


if __name__ == '__main__':
    main()