crossfilt 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/python3
2
+
3
+ import sys
4
+ import argparse
5
+ import pysam
6
+ import logging
7
+ import array
8
+ from timeit import default_timer as timer
9
+ import math
10
+ import os
11
+ from collections import defaultdict
12
+ import importlib.metadata
13
+
14
+ __version__ = importlib.metadata.version('crossfilt')
15
+
16
def main():
    """Command-line entry point for ``crossfilt-filter``.

    Parses the command line (two BAM paths and an optional ``-x/--xf``
    flag), then writes every read of ``bam1`` whose counterpart in ``bam2``
    has the same position, contig and CIGAR string (and, with ``--xf``, the
    same XF tag) as BAM to standard output. A one-line summary of how many
    read pairs matched is printed to standard error.

    Two modes are selected by the presence of ``<path>.bai`` next to each
    input:

    * Both index files exist: the inputs are walked contig by contig and
      reads are paired by name via :func:`read_pair_generator`.
    * Either index file is missing: a warning is printed and the two files
      are read in lockstep, on the assumption that they hold the same reads
      in the same (name-sorted) order. A query-name mismatch aborts the
      program with an error message.

    Raises:
        SystemExit: from argparse on bad arguments or ``--version``, and
            when a query-name mismatch is found in lockstep mode.
    """
    parser = argparse.ArgumentParser(
        prog='crossfilt-filter',
        description='Outputs reads from bam1 that have identical contig, position, CIGAR string, and XF tag (optional) in bam2')

    parser.add_argument("-x", "--xf", required=False, help="Require identical XF tag", action="store_true")
    parser.add_argument("bam1", help="Input bam files.")
    parser.add_argument("bam2", help="Input bam files.")
    parser.add_argument('--version', action='version',
                        version='CrossFilt v{version}'.format(version=__version__))

    args = parser.parse_args()
    use_xf = args.xf

    # A missing index on either input switches the whole run to lockstep
    # (name-sorted) mode; warn once per file that lacks one.
    name_sorted = False
    for bam_path in (args.bam1, args.bam2):
        if not os.path.exists(bam_path + ".bai"):
            print("Warning: " + bam_path + ".bai not found. Assuming files are filtered and sorted by read name.", file=sys.stderr)
            name_sorted = True

    total = matched = 0

    # Context managers guarantee that all three handles are closed (and the
    # BAM output is finalised) on every exit path, including sys.exit().
    with pysam.AlignmentFile(args.bam1, "rb") as samfile1, \
            pysam.AlignmentFile(args.bam2, "rb") as samfile2, \
            pysam.AlignmentFile('-', "wb", template=samfile1) as outfile:

        if not name_sorted:
            _, file1_contigs = get_read_count(samfile1)
            _, file2_contigs = get_read_count(samfile2)

            # A contig with no reads in bam2 can never produce a pair, so
            # skipping it changes no output and avoids fetching a contig
            # that bam2 may not know about.
            contigs_in_bam2 = set(file2_contigs)

            # this will be more efficient if bam2 is the smaller file
            for contig in file1_contigs:
                if contig not in contigs_in_bam2:
                    continue
                for read1, read2 in read_pair_generator(samfile1, samfile2, contig):
                    total += 1
                    if reads_match(read1, read2, use_xf):
                        matched += 1
                        outfile.write(read1)

        else:
            # NOTE: zip() stops at the shorter input, so trailing reads of
            # the longer file are ignored.
            lockstep = zip(samfile1.fetch(until_eof=True),
                           samfile2.fetch(until_eof=True))

            for read1, read2 in lockstep:
                total += 1
                # check read names to make sure they match
                if read1.query_name != read2.query_name:
                    sys.exit("Error: Read number " + str(total) + " query names are not identical (" + read1.query_name + " and " + read2.query_name + ")\nUse position sorted files or filter and sort your bam files by name.")

                # Apply the same position/contig/CIGAR (and optional XF)
                # comparison as the position-sorted mode, as promised by
                # the program description.
                if reads_match(read1, read2, use_xf):
                    matched += 1
                    outfile.write(read1)

    print(match_summary(matched, total), file=sys.stderr)


def reads_match(read1, read2, use_xf=False):
    """Return True when two reads describe the same alignment.

    The reads must agree on ``reference_start``, ``reference_name`` and
    ``cigarstring``. When ``use_xf`` is true, both reads must additionally
    carry an ``XF`` tag and the two tag values must be equal.
    """
    if read1.reference_start != read2.reference_start:
        return False
    if read1.reference_name != read2.reference_name:
        return False
    if read1.cigarstring != read2.cigarstring:
        return False

    if use_xf:
        if not (read1.has_tag("XF") and read2.has_tag("XF")):
            return False
        if read1.get_tag("XF") != read2.get_tag("XF"):
            return False

    return True


def match_summary(matched, total):
    """Format the final ``"<matched> (<percent>%) successfully matched"`` line.

    The percentage is rounded to two decimals; it is reported as ``0.0``
    when ``total`` is zero instead of dividing by zero.
    """
    percent = round(100 * matched / total, 2) if total else 0.0
    return str(matched) + ' (' + str(percent) + '%) successfully matched'


def read_pair_generator(bam1, bam2, chrom):
    """Yield ``(read1, read2)`` pairs of same-named reads on one contig.

    This function will pull read pairs from two position sorted files. It
    will cache reads from the second argument.

    Reads are keyed by ``query_name + str(is_read1)``. For every read of
    ``bam1.fetch(chrom)``, reads of ``bam2.fetch(chrom)`` are consumed and
    cached until one starts beyond the current read's ``reference_start``;
    if the cache then holds the current key, the pair is yielded and the
    cached read is removed. A later bam2 read with the same key overwrites
    an earlier cached one.

    Args:
        bam1: object whose ``fetch(chrom)`` iterates the reads to look up.
        bam2: object whose ``fetch(chrom)`` iterates the reads to cache.
        chrom: contig name passed to both ``fetch`` calls.

    Yields:
        Tuples ``(read1, read2)`` sharing the same key. Positions are NOT
        compared here; that is left to the caller.
    """
    read2_iter = bam2.fetch(chrom)
    read2_cache = {}
    last_read2_pos = 0
    read2_exhausted = False
    last_evicted_pos = None

    for read1 in bam1.fetch(chrom):
        pos = read1.reference_start
        read1_id = read1.query_name + str(read1.is_read1)

        # add reads from file2 till the position in that file is greater
        # than this read
        while not read2_exhausted and last_read2_pos <= pos:
            read2 = next(read2_iter, None)
            if read2 is None:
                read2_exhausted = True
                break
            last_read2_pos = read2.reference_start
            read2_cache[read2.query_name + str(read2.is_read1)] = read2

        partner = read2_cache.pop(read1_id, None)
        if partner is not None:
            yield read1, partner

        # clear out the cache of reads that we have passed. Nothing new can
        # become stale until the position advances, so only scan then. The
        # keys are collected first because a dict must not be resized while
        # it is being iterated.
        if last_evicted_pos is None or pos > last_evicted_pos:
            stale_ids = [read2_id for read2_id, cached in read2_cache.items()
                         if cached.reference_start < pos]
            for read2_id in stale_ids:
                del read2_cache[read2_id]
            last_evicted_pos = pos


def get_read_count(file):
    """Summarise the index statistics of an opened alignment file.

    Iterates ``file.get_index_statistics()`` and, for every entry whose
    fourth field (index 3) is non-zero, records the entry's first field
    (index 0) and adds the fourth field to a running total.
    # presumably field 0 is the contig name and field 3 the total read
    # count per contig, as in pysam's IndexStats - confirm against pysam.

    Returns:
        A tuple ``(total_reads, contig_list)``.
    """
    contig_list = []
    total_reads = 0

    for stats in file.get_index_statistics():
        if stats[3] != 0:
            contig_list.append(stats[0])
            total_reads += stats[3]

    return total_reads, contig_list


if __name__ == '__main__':
    main()