crossfilt 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crossfilt/identical_reads.py +139 -0
- crossfilt/lib/liftover_functions.py +968 -0
- crossfilt/liftover_bam.py +160 -0
- crossfilt/split_bam.py +133 -0
- crossfilt-0.1.1.dist-info/METADATA +130 -0
- crossfilt-0.1.1.dist-info/RECORD +10 -0
- crossfilt-0.1.1.dist-info/WHEEL +5 -0
- crossfilt-0.1.1.dist-info/entry_points.txt +4 -0
- crossfilt-0.1.1.dist-info/licenses/LICENSE +674 -0
- crossfilt-0.1.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
#!/usr/bin/python3
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import argparse
|
|
5
|
+
import pysam
|
|
6
|
+
import logging
|
|
7
|
+
import array
|
|
8
|
+
from timeit import default_timer as timer
|
|
9
|
+
import math
|
|
10
|
+
import os
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
import importlib.metadata
|
|
13
|
+
|
|
14
|
+
# Resolve the installed package version for the --version flag. When the
# module is run straight from a source checkout there is no 'crossfilt'
# distribution metadata, so fall back to a placeholder instead of failing
# at import time.
try:
    __version__ = importlib.metadata.version('crossfilt')
except importlib.metadata.PackageNotFoundError:
    __version__ = 'unknown'
|
|
15
|
+
|
|
16
|
+
def read_pair_generator(bam1, bam2, chrom):
    """Yield ``(read1, read2)`` pairs sharing a query name and mate on ``chrom``.

    Reads are pulled from both files with ``fetch(chrom)``; the logic relies
    on both streams being ordered by ``reference_start``. Reads from ``bam2``
    are cached until the ``bam1`` stream has moved past them.

    Args:
        bam1: Object whose ``fetch(chrom)`` iterates the reads to be matched.
        bam2: Object whose ``fetch(chrom)`` iterates the candidate partners.
        chrom: Contig name passed to both ``fetch`` calls.

    Yields:
        Tuples ``(read1, read2)`` where both reads have the same
        ``query_name`` and ``is_read1`` value.
    """
    read2_iter = bam2.fetch(chrom)
    # bam2 reads seen so far, keyed by query name + mate flag.
    read2_cache = {}
    last_read2_pos = 0

    for read1 in bam1.fetch(chrom):
        pos = read1.reference_start
        read1_id = read1.query_name + str(read1.is_read1)

        # Add reads from bam2 until its position is greater than this read's.
        while last_read2_pos <= pos:
            read2 = next(read2_iter, None)
            if read2 is None:
                break
            last_read2_pos = read2.reference_start
            read2_cache[read2.query_name + str(read2.is_read1)] = read2

        partner = read2_cache.pop(read1_id, None)
        if partner is not None:
            yield read1, partner

        # Clear out cached reads that we have passed. The keys are collected
        # first because deleting from a dict while iterating over it raises
        # RuntimeError.
        stale_ids = [read2_id for read2_id, cached in read2_cache.items()
                     if cached.reference_start < pos]
        for read2_id in stale_ids:
            del read2_cache[read2_id]


def get_read_count(file):
    """Summarise the index statistics of an opened alignment file.

    Args:
        file: Object exposing ``get_index_statistics()``; each returned record
            is indexed so that ``[0]`` is the contig name and ``[3]`` is the
            read count used here (the total, per pysam's index statistics).

    Returns:
        A tuple ``(total_reads, contig_list)`` where ``contig_list`` holds the
        contigs whose count is non-zero, in index order.
    """
    contig_list = []
    total_reads = 0
    for stats in file.get_index_statistics():
        contig, count = stats[0], stats[3]
        if count != 0:
            contig_list.append(contig)
            total_reads += count
    return total_reads, contig_list


def _reads_identical(read1, read2, use_xf):
    """Return True when both reads agree on position, contig, CIGAR (and XF).

    With ``use_xf`` set, both reads must carry an XF tag and the tag values
    must be equal.
    """
    if read1.reference_start != read2.reference_start:
        return False
    if read1.reference_name != read2.reference_name:
        return False
    if read1.cigarstring != read2.cigarstring:
        return False
    if use_xf:
        if not (read1.has_tag("XF") and read2.has_tag("XF")):
            return False
        if read1.get_tag("XF") != read2.get_tag("XF"):
            return False
    return True


def _report_matches(matched, total):
    """Print the match summary to stderr; reports 0.0% when no pairs were seen."""
    percent = round(100 * matched / total, 2) if total else 0.0
    print(str(matched) + ' (' + str(percent) + '%) successfully matched', file=sys.stderr)


def main():
    """Command-line entry point: write reads of bam1 that are identical in bam2.

    When both inputs have a ``.bai`` file next to them, reads are paired per
    contig by name and mate. Otherwise the inputs are assumed to be filtered
    to the same reads and sorted by name, and are walked in lockstep. In both
    modes a read is written to stdout (BAM) only when contig, position, CIGAR
    and, with ``--xf``, the XF tag are identical in both files.

    Exits with an error message if, in name-sorted mode, the query names of
    the two streams diverge.
    """
    parser = argparse.ArgumentParser(
        prog='crossfilt-filter',
        description='Outputs reads from bam1 that that have identical contig, position, CIGAR string, and XF tag (optional) in bam2')

    parser.add_argument("-x", "--xf", required=False, help="Require identical XF tag", action="store_true")
    parser.add_argument("bam1", help="Input bam files.")
    parser.add_argument("bam2", help="Input bam files.")
    parser.add_argument('--version', action='version',
                        version='CrossFilt v{version}'.format(version=__version__))

    args = parser.parse_args()
    use_xf = args.xf

    # A missing index on either input switches to the name-sorted strategy.
    name_sorted = False
    for path in (args.bam1, args.bam2):
        if not os.path.exists(path + ".bai"):
            print("Warning: " + path + ".bai not found. Assuming files are filtered and sorted by read name.", file=sys.stderr)
            name_sorted = True

    total = matched = 0

    # Context managers guarantee the inputs are closed and the BAM written to
    # stdout is finalised, including when sys.exit() is raised below.
    with pysam.AlignmentFile(args.bam1, "rb") as samfile1, \
            pysam.AlignmentFile(args.bam2, "rb") as samfile2, \
            pysam.AlignmentFile('-', "wb", template=samfile1) as outfile:

        if not name_sorted:
            _, file1_contigs = get_read_count(samfile1)
            _, file2_contigs = get_read_count(samfile2)
            # A contig without reads in bam2 cannot produce a pair; skipping it
            # also avoids fetching a contig that bam2's header may not define.
            file2_contig_set = set(file2_contigs)

            # this will be more efficient if bam2 is the smaller file
            for contig in file1_contigs:
                if contig not in file2_contig_set:
                    continue
                for read1, read2 in read_pair_generator(samfile1, samfile2, contig):
                    total += 1
                    if _reads_identical(read1, read2, use_xf):
                        matched += 1
                        outfile.write(read1)

        else:
            iter1 = samfile1.fetch(until_eof=True)
            iter2 = samfile2.fetch(until_eof=True)

            # NOTE(review): zip() stops at the shorter input, so trailing reads
            # of a longer file are silently ignored.
            for read1, read2 in zip(iter1, iter2):
                total += 1
                # check read names to make sure they match
                if read1.query_name != read2.query_name:
                    sys.exit("Error: Read number " + str(total) + " query names are not identical (" + read1.query_name + " and " + read2.query_name + ")\nUse position sorted files or filter and sort your bam files by name.")

                # Apply the same contig/position/CIGAR/XF comparison as the
                # position-sorted mode, as the tool description promises.
                if _reads_identical(read1, read2, use_xf):
                    matched += 1
                    outfile.write(read1)

    _report_matches(matched, total)
|
|
137
|
+
|
|
138
|
+
# Script entry point: run the command-line tool only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()
|