gseda 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gseda-0.0.1/.gitignore +3 -0
- gseda-0.0.1/PKG-INFO +17 -0
- gseda-0.0.1/README.md +4 -0
- gseda-0.0.1/pyproject.toml +23 -0
- gseda-0.0.1/src/gseda/__init__.py +0 -0
- gseda-0.0.1/src/gseda/bam_filter/__init__.py +0 -0
- gseda-0.0.1/src/gseda/bam_filter/drop_multiple_mapping_reads.py +47 -0
- gseda-0.0.1/src/gseda/bam_surgery/__init__.py +0 -0
- gseda-0.0.1/src/gseda/bam_surgery/insert_cs_to_sbr_bam.py +45 -0
- gseda-0.0.1/src/gseda/bam_surgery/insert_ref_to_sbr_bam.py +58 -0
- gseda-0.0.1/src/gseda/fact_table_ana/__init__.py +0 -0
- gseda-0.0.1/src/gseda/fact_table_ana/polars_init.py +8 -0
- gseda-0.0.1/src/gseda/fact_table_ana/pred_baseq_and_emperical_baseq.py +186 -0
- gseda-0.0.1/src/gseda/fact_table_ana/preq-baseq-and-emp-q.py +64 -0
- gseda-0.0.1/src/gseda/fact_table_ana/query_error_locus_ana.py +62 -0
- gseda-0.0.1/src/gseda/fact_table_ana/ref_locus_ana.py +74 -0
- gseda-0.0.1/src/gseda/fact_table_ana/utils.py +36 -0
- gseda-0.0.1/src/gseda/msa_view/__init__.py +0 -0
- gseda-0.0.1/src/gseda/msa_view/msa_view.py +417 -0
gseda-0.0.1/.gitignore
ADDED
gseda-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: gseda
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: gsetl
|
|
5
|
+
Project-URL: Homepage, https://github.com/keithyin/gsda
|
|
6
|
+
Project-URL: Issues, https://github.com/keithyin/gsda/issues
|
|
7
|
+
Author-email: keithyin <yinpenghhz@hotmail.com>
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
gseda
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
how to pack: https://cloud.tencent.com/developer/article/2401343
|
gseda-0.0.1/README.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "gseda"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
authors = [
|
|
5
|
+
{ name="keithyin", email="yinpenghhz@hotmail.com" },
|
|
6
|
+
]
|
|
7
|
+
description = "gsetl"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = ">=3.8"
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Programming Language :: Python :: 3",
|
|
12
|
+
"License :: OSI Approved :: MIT License",
|
|
13
|
+
"Operating System :: OS Independent",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.urls]
|
|
17
|
+
Homepage = "https://github.com/keithyin/gsda"
|
|
18
|
+
Issues = "https://github.com/keithyin/gsda/issues"
|
|
19
|
+
|
|
20
|
+
# Build backend; defaults to Hatchling
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["hatchling"]
|
|
23
|
+
build-backend = "hatchling.build"
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import pysam
|
|
2
|
+
from typing import List, Set
|
|
3
|
+
import argparse
|
|
4
|
+
import pysam.samtools
|
|
5
|
+
from tqdm import tqdm
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def extract_non_multiple_mapping_reads(bam_file: str) -> Set[str]:
    """Return the query names that appear on exactly one record of ``bam_file``.

    Every record yielded by ``fetch()`` is tallied by its query name; names
    seen on more than one record are left out of the result.
    """
    occurrences = {}
    progress_label = f"extract_non_multiple_mapping_reads from {bam_file}"
    with pysam.AlignmentFile(filename=bam_file, mode="rb", threads=40) as reader:
        for aln in tqdm(reader.fetch(), desc=progress_label):
            name = aln.query_name
            occurrences[name] = occurrences.get(name, 0) + 1
    return {name for name, seen in occurrences.items() if seen == 1}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def dump_non_multiple_mapping_reads(bam_file: str, qname_set):
    """Write the records whose query name is in ``qname_set`` to a new BAM and index it.

    The output path is ``bam_file`` with its last extension replaced by
    ``.non_multiple_mapping_reads.bam``; the input header is reused.
    """
    stem = bam_file.rsplit(".", maxsplit=1)[0]
    o_bam_file = "{}.non_multiple_mapping_reads.bam".format(stem)
    progress_label = f"dump_non_multiple_mapping_reads to {o_bam_file}"

    with pysam.AlignmentFile(filename=bam_file, mode="rb", threads=40) as reader:
        with pysam.AlignmentFile(
            filename=o_bam_file, mode="wb", threads=40, header=reader.header
        ) as writer:
            for aln in tqdm(reader.fetch(), desc=progress_label):
                if aln.query_name not in qname_set:
                    continue
                writer.write(aln)

    # Indexing runs after both handles are closed, i.e. on the finished output file.
    pysam.samtools.index("-@", "40", "-b", o_bam_file)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def main(args):
    """Collect the singly occurring query names of ``args.bam`` and dump those reads."""
    unique_qnames = extract_non_multiple_mapping_reads(args.bam)
    dump_non_multiple_mapping_reads(args.bam, unique_qnames)


if __name__ == "__main__":
    # Command line: a single positional argument, the input BAM path.
    cli = argparse.ArgumentParser(prog="")
    cli.add_argument("bam")
    main(args=cli.parse_args())
|
|
File without changes
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import pysam
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
import argparse
|
|
6
|
+
|
|
7
|
+
pre_dir = os.path.abspath(__file__).rsplit("/", maxsplit=2)[0]
|
|
8
|
+
sys.path.append(pre_dir)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def insert_cs_to_sbr_bam(sbr_bam: str, smc_bam: str):
    """Copy ``sbr_bam`` into ``<stem>.with-cs.bam`` and append one record per ``smc_bam`` read.

    The output reuses the header of ``sbr_bam``. Each appended record is named
    ``00_<query_name>``, carries the source read's sequence, and gets the
    source read's ``ch`` tag written as an integer with value type ``I``.
    """
    stem = sbr_bam.rsplit(".", maxsplit=1)[0]
    out_bam = "{}.with-cs.bam".format(stem)

    with pysam.AlignmentFile(sbr_bam, mode="rb", threads=40, check_sq=False) as sbr_h:
        with pysam.AlignmentFile(
            out_bam, mode="wb", threads=40, check_sq=False, header=sbr_h.header
        ) as out_h:
            # Pass 1: copy every subread record unchanged.
            for sbr_record in tqdm(sbr_h.fetch(until_eof=True), desc=f"dumping1 {out_bam}"):
                out_h.write(sbr_record)

            # Pass 2: append a fresh, minimal record for every read of smc_bam.
            with pysam.AlignmentFile(
                smc_bam, mode="rb", threads=40, check_sq=False
            ) as smc_h:
                for smc_record in tqdm(
                    smc_h.fetch(until_eof=True), desc=f"dumping2 {out_bam}"
                ):
                    appended = pysam.AlignedSegment()
                    appended.query_name = f"00_{smc_record.query_name}"
                    appended.query_sequence = smc_record.query_sequence
                    appended.set_tag("ch", int(smc_record.get_tag("ch")), value_type="I")
                    out_h.write(appended)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def main(args):
    """Run ``insert_cs_to_sbr_bam`` on the two BAM paths taken from ``args``."""
    insert_cs_to_sbr_bam(args.sbr_bam, args.smc_bam)


if __name__ == "__main__":
    # Command line: --sbr-bam and --smc-bam, both optional strings.
    cli = argparse.ArgumentParser(prog="")
    for flag, dest in (("--sbr-bam", "sbr_bam"), ("--smc-bam", "smc_bam")):
        cli.add_argument(flag, dest=dest)

    main(args=cli.parse_args())
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import pysam
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
import argparse
|
|
6
|
+
|
|
7
|
+
pre_dir = os.path.abspath(__file__).rsplit("/", maxsplit=2)[0]
|
|
8
|
+
sys.path.append(pre_dir)
|
|
9
|
+
|
|
10
|
+
import gseda.utils as gseda_utils
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def insert_ref_to_sbr_bam(sbr_bam: str, smc2ref_bam: str, ref_file: str):
    """Copy ``sbr_bam`` into ``<stem>.with-ref.bam`` and append one reference slice per channel.

    For every record of ``smc2ref_bam`` the aligned reference interval is
    widened (10 positions before the start, 11 past the end, clamped to the
    contig) and the resulting slice is stored under the record's ``ch`` tag.
    Each slice is then appended to the output as a record named
    ``09_REF_<ch>``. A channel seen twice trips the assertion.
    """
    out_bam = "{}.with-ref.bam".format(sbr_bam.rsplit(".", maxsplit=1)[0])
    ref_data = gseda_utils.read_fastx_file(ref_file)

    channel_refs = {}
    with pysam.AlignmentFile(smc2ref_bam, mode="rb", threads=40) as smc2ref_bam_h:
        for aln in tqdm(smc2ref_bam_h.fetch(), desc="extracting channel ref"):
            ch = int(aln.get_tag("ch"))
            aln_start = aln.reference_start
            aln_end = aln.reference_end
            # Element 0 of each ref_data entry is used as the contig sequence.
            contig_seq = ref_data[aln.reference_name][0]

            window_start = max(0, aln_start - 10)
            window_end = min(len(contig_seq), aln_end + 11)

            flanked = contig_seq[window_start:window_end]
            assert ch not in channel_refs
            channel_refs[ch] = flanked

    with pysam.AlignmentFile(sbr_bam, mode="rb", threads=40, check_sq=False) as sbr_h:
        with pysam.AlignmentFile(
            out_bam, mode="wb", threads=40, check_sq=False, header=sbr_h.header
        ) as out_h:
            # Pass 1: copy every subread record unchanged.
            for sbr_record in tqdm(sbr_h.fetch(until_eof=True), desc=f"dumping1 {out_bam}"):
                out_h.write(sbr_record)

            # Pass 2: append the per-channel reference slices.
            for ch, ref_seq in tqdm(channel_refs.items(), desc=f"dumping2 {out_bam}"):
                appended = pysam.AlignedSegment()
                appended.query_name = f"09_REF_{ch}"
                appended.query_sequence = ref_seq
                appended.set_tag("ch", ch, value_type="I")
                out_h.write(appended)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def main(args):
    """Run ``insert_ref_to_sbr_bam`` on the three paths taken from ``args``."""
    insert_ref_to_sbr_bam(args.sbr_bam, args.smc2ref_bam, args.ref_file)


if __name__ == "__main__":
    # Command line: --sbr-bam, --smc2ref-bam and --ref-file, all optional strings.
    cli = argparse.ArgumentParser(prog="")
    for flag, dest in (
        ("--sbr-bam", "sbr_bam"),
        ("--smc2ref-bam", "smc2ref_bam"),
        ("--ref-file", "ref_file"),
    ):
        cli.add_argument(flag, dest=dest)

    main(args=cli.parse_args())
|
|
File without changes
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
cur_dir = os.path.abspath(__file__).rsplit("/", maxsplit=1)[0]
|
|
5
|
+
sys.path.insert(0, cur_dir)
|
|
6
|
+
|
|
7
|
+
import pysam
|
|
8
|
+
import utils
|
|
9
|
+
import polars as pl
|
|
10
|
+
import argparse
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
import seaborn as sns
|
|
13
|
+
import matplotlib.pyplot as plt
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BaseQStat:
    """Alignment-outcome counters kept for one predicted base-quality value."""

    def __init__(self, pred_q):
        self.pred_q = pred_q
        # All tallies start at zero. ``depth`` is initialised here, but no
        # method of this class ever changes it.
        self.eq = self.diff = self.insertion = self.deletion = self.depth = 0

    def add_eq(self, num=1):
        """Add ``num`` to the match counter."""
        self.eq = self.eq + num

    def add_diff(self, num=1):
        """Add ``num`` to the mismatch counter."""
        self.diff = self.diff + num

    def add_insertion(self, num=1):
        """Add ``num`` to the insertion counter."""
        self.insertion = self.insertion + num

    def add_deletion(self, num=1):
        """Add ``num`` to the deletion counter."""
        self.deletion = self.deletion + num
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def stat_one_record(aligned_pairs, qual, baseq2baseq_stat, query_seq, refseq, ref_end):
    """Tally one alignment's matches, mismatches, insertions and deletions per base quality.

    Args:
        aligned_pairs: iterable of ``(qpos, rpos)`` pairs; either side may be ``None``.
        qual: per-position qualities of the query, indexed by ``qpos``.
        baseq2baseq_stat: dict updated in place, keyed by quality value. A
            missing key gets a fresh ``BaseQStat``; existing entries only need
            the ``add_eq`` / ``add_diff`` / ``add_insertion`` / ``add_deletion``
            methods.
        query_seq: query bases, indexed by ``qpos``.
        refseq: reference bases, indexed by ``rpos``.
        ref_end: exclusive reference end of the alignment. Processing stops
            right after the pair that sits on position ``ref_end - 1``, so
            anything following it (e.g. trailing unaligned bases) is ignored.
    """
    last_ref_pos = ref_end - 1
    ref_pos_cursor = None
    query_pos_cursor = None

    for qpos, rpos in aligned_pairs:
        if qpos is not None:
            query_pos_cursor = qpos
        if rpos is not None:
            ref_pos_cursor = rpos

        # Nothing can be attributed until both a reference and a query
        # position have been seen (this skips leading unaligned bases).
        if ref_pos_cursor is None or query_pos_cursor is None:
            continue

        # The event is charged to the quality of the most recent query base;
        # for a deletion that is the base just before the gap.
        pred_q = qual[query_pos_cursor]
        baseq_stat = baseq2baseq_stat.get(pred_q)
        if baseq_stat is None:
            # Create the counter only on first sight of this quality value
            # instead of building a throwaway object for every pair.
            baseq_stat = baseq2baseq_stat[pred_q] = BaseQStat(pred_q)

        if rpos is None:
            baseq_stat.add_insertion()
        elif qpos is None:
            baseq_stat.add_deletion()
        elif refseq[rpos] == query_seq[qpos]:
            baseq_stat.add_eq()
        else:
            baseq_stat.add_diff()

        if ref_pos_cursor == last_ref_pos:
            break
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def stat(aligned_bam_file: str, ref_file: str):
    """Build a per-base-quality table of alignment outcomes for a BAM file.

    For each contig of ``ref_file`` the records returned by
    ``fetch(contig=...)`` are walked; secondary, supplementary and unmapped
    records are skipped, and the rest are tallied with ``stat_one_record``.

    Args:
        aligned_bam_file: path of the aligned BAM to read.
        ref_file: path of the reference, read with ``utils.read_fastx_file``.

    Returns:
        A polars DataFrame with columns ``baseq``, ``eq``, ``diff``, ``ins``
        and ``del``, one row per observed quality value.
    """
    ref_data = utils.read_fastx_file(ref_file)

    baseq2baseq_stat = {}

    with pysam.AlignmentFile(aligned_bam_file, mode="rb") as bam_h:
        for refname, ref_entry in ref_data.items():
            # utils.read_fastx_file stores (sequence, tagged_name) tuples; the
            # per-base comparison needs the sequence string itself. A plain
            # string entry is accepted as well.
            refseq = ref_entry if isinstance(ref_entry, str) else ref_entry[0]

            for record in tqdm(
                bam_h.fetch(contig=refname), desc=f"processing {refname}"
            ):
                if record.is_secondary or record.is_supplementary or record.is_unmapped:
                    continue

                stat_one_record(
                    record.get_aligned_pairs(),
                    record.query_qualities,
                    baseq2baseq_stat,
                    record.query_sequence,
                    refseq,
                    record.reference_end,
                )

    baseqs = []
    eqs = []
    diffs = []
    insertions = []
    deletions = []

    for bq, baseq_stat in baseq2baseq_stat.items():
        baseqs.append(bq)
        eqs.append(baseq_stat.eq)
        diffs.append(baseq_stat.diff)
        insertions.append(baseq_stat.insertion)
        deletions.append(baseq_stat.deletion)

    return pl.DataFrame(
        {"baseq": baseqs, "eq": eqs, "diff": diffs, "ins": insertions, "del": deletions}
    )
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def main(args):
    """Compute the per-quality table for ``args.aln_bam`` and plot it.

    The empirical accuracy ``emp_rq`` is ``eq / (eq + diff + ins)``; it is
    converted with ``utils.q2phreq_expr`` into ``emp_phreq`` and scattered
    against ``baseq``. The first ten rows are printed and the figure is saved
    under the name ``baseq2empq``.
    """
    counts = stat(args.aln_bam, args.ref_file)

    called_bases = pl.col("eq") + pl.col("diff") + pl.col("ins")
    emp_rq = (pl.col("eq") / called_bases).alias("emp_rq")
    df = counts.with_columns([emp_rq]).with_columns(
        [utils.q2phreq_expr("emp_rq", "emp_phreq")]
    )

    figure = plt.figure(figsize=(20, 10))
    axs = figure.add_subplot(1, 1, 1)
    sns.scatterplot(df.to_pandas(), x="baseq", y="emp_phreq", ax=axs)

    print(df.head(10))
    figure.savefig(fname="baseq2empq")


if __name__ == "__main__":
    # Hard-coded inputs: this script takes no command-line arguments.
    main(
        argparse.Namespace(
            aln_bam="/data/ccs_data/ccs_eval2024q4/output-all/smc2ref.bam",
            ref_file="/data/ccs_data/MG1655.fa",
        )
    )
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import pysam
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
cur_dir = os.path.abspath(__file__).rsplit("/", maxsplit=1)[0]
|
|
6
|
+
sys.path.insert(0, cur_dir)
|
|
7
|
+
|
|
8
|
+
import utils
|
|
9
|
+
import polars as pl
|
|
10
|
+
import argparse
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
import seaborn as sns
|
|
13
|
+
import matplotlib.pyplot as plt
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
import polars_init
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def main(args):
    """Plot empirical against predicted base quality from the TSV at ``args.data``.

    ``emp_rq`` is ``eq / (eq + diff + ins)`` and is converted with
    ``utils.q2phreq_expr`` into ``emp_phreq``. The scatter plot gets a dashed
    blue ``y = x`` line for comparison, the first ten rows are printed, and
    the figure is written to ``baseq2empq.png``.
    """
    table = pl.read_csv(args.data, separator="\t")
    called_bases = pl.col("eq") + pl.col("diff") + pl.col("ins")
    emp_rq = (pl.col("eq") / called_bases).alias("emp_rq")
    df = table.with_columns([emp_rq]).with_columns(
        [utils.q2phreq_expr("emp_rq", "emp_phreq")]
    )

    figure = plt.figure(figsize=(10, 10))
    axs = figure.add_subplot(1, 1, 1)
    plt.sca(axs)
    plt.grid(True, linestyle=":", linewidth=0.5, color="gray")

    sns.scatterplot(df.to_pandas(), x="baseq", y="emp_phreq", ax=axs)

    # Same tick positions and label size on both axes.
    label_font = {"size": 16}
    axs.set_xticks(list(range(0, 60, 2)))
    axs.set_yticks(list(range(0, 60, 2)))
    axs.set_xlabel("PredictedBaseQ", fontdict=label_font)
    axs.set_ylabel("EmpericalBaseQ", fontdict=label_font)

    # Reference diagonal: points where predicted and empirical quality agree.
    perfect_line = pl.DataFrame({axis: list(range(0, 60)) for axis in ("x", "y")})
    sns.lineplot(
        perfect_line.to_pandas(), x="x", y="y", ax=axs, color="blue", linestyle="--"
    )

    print(df.head(10))
    figure.savefig(fname="baseq2empq.png")


if __name__ == "__main__":
    polars_init.polars_env_init()
    # Hard-coded input: this script takes no command-line arguments.
    main(
        argparse.Namespace(
            data="/data/ccs_data/ccs_eval2024q3/Ludaopei/subread_bak/output-all/analysis/fact_baseq_stat.csv",
        )
    )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Z:\BC\共享文件夹\four\20241115_Sync_Y0701_03_H01_Run0003_called.bam . 中南 STR
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import pysam
|
|
2
|
+
import polars as pl
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
cur_dir = os.path.abspath(__file__).rsplit("/", maxsplit=1)[0]
|
|
9
|
+
sys.path.insert(0, cur_dir)
|
|
10
|
+
|
|
11
|
+
import polars_init
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def error_channel_analysis(
    fact_aligned_bam_bam_basic: str, fact_error_query_locus_info: str
):
    """Join per-read basics with error-locus rows and print the first 100 rows.

    Both inputs are tab-separated. Only reads with ``np == 7`` and
    ``iy < 0.999`` are kept. For rows where ``fwd`` is false, ``qstart`` and
    ``qend`` are replaced by ``qlen - qend`` and ``qlen - qstart``. The result
    is sorted by ``qname`` descending, then ``rstart`` ascending.
    """
    basic = pl.read_csv(fact_aligned_bam_bam_basic, separator="\t")
    query_error_locus = pl.read_csv(fact_error_query_locus_info, separator="\t")

    selected_reads = (pl.col("np") == 7).and_(pl.col("iy") < 0.999)
    kept_columns = [
        pl.col(name) for name in ("qname", "np", "rq", "iy", "qlen", "fwd")
    ]
    basic = basic.filter(selected_reads).select(kept_columns)

    # Both expressions go into one with_columns call, so each of them reads
    # the qstart/qend values from before the replacement.
    oriented_qstart = (
        pl.when(pl.col("fwd"))
        .then(pl.col("qstart"))
        .otherwise(pl.col("qlen") - pl.col("qend"))
        .alias("qstart")
    )
    oriented_qend = (
        pl.when(pl.col("fwd"))
        .then(pl.col("qend"))
        .otherwise(pl.col("qlen") - pl.col("qstart"))
        .alias("qend")
    )

    joined = basic.join(query_error_locus, on="qname", how="inner")
    df = joined.with_columns([oriented_qstart, oriented_qend]).sort(
        by=["qname", "rstart"], descending=[True, False]
    )
    print(df.head(100))
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def main(args):
    """Run ``error_channel_analysis`` on the two table paths taken from ``args``."""
    error_channel_analysis(args.bam_basic, args.error_query_locus)


if __name__ == "__main__":
    polars_init.polars_env_init()

    # Command line: two positional table paths.
    cli = argparse.ArgumentParser(prog="")
    for positional in ("bam_basic", "error_query_locus"):
        cli.add_argument(positional)
    main(cli.parse_args())
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
cur_dir = os.path.abspath(__file__).rsplit("/", maxsplit=1)[0]
|
|
6
|
+
sys.path.insert(0, cur_dir)
|
|
7
|
+
|
|
8
|
+
import polars_init
|
|
9
|
+
import argparse
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def variant_calling(df: pl.DataFrame):
    """Print the share of rows that look like a two-allele variant site.

    A row counts when both ``eq / (eq + diff)`` and ``diff / (eq + diff)``
    exceed 0.2 and ``diffDetail`` splits on ``","`` into fewer than two items.

    Args:
        df: frame with at least the columns ``eq``, ``diff`` and ``diffDetail``.

    The ratio is reported as ``nan`` for an empty frame instead of failing
    with a division by zero.
    """
    total = df.shape[0]
    v = (
        df.with_columns([(pl.col("eq") + pl.col("diff")).alias("eq_and_diff")])
        .with_columns(
            [
                (pl.col("eq") / pl.col("eq_and_diff")).alias("eq_in_eqdiff"),
                (pl.col("diff") / pl.col("eq_and_diff")).alias("diff_in_eqdiff"),
            ]
        )
        .filter((pl.col("eq_in_eqdiff") > 0.2).and_(pl.col("diff_in_eqdiff") > 0.2))
        .filter(pl.col("diffDetail").str.split(",").list.len() < 2)
        .shape[0]
    )

    ratio = v / total if total else float("nan")
    print("variant calling ratio: {} / {} = {}".format(v, total, ratio))
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def del_calling(df: pl.DataFrame):
    """Print the share of rows whose ``del / depth`` ratio exceeds 0.4.

    Args:
        df: frame with at least the columns ``del`` and ``depth``.

    The ratio is reported as ``nan`` for an empty frame instead of failing
    with a division by zero.
    """
    total = df.shape[0]
    v = (
        df.with_columns([(pl.col("del") / pl.col("depth")).alias("del_ratio")])
        .filter((pl.col("del_ratio") > 0.4))
        .shape[0]
    )

    ratio = v / total if total else float("nan")
    print("del ratio: {} / {} = {}".format(v, total, ratio))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def ana(filepath: str):
    """Load a tab-separated locus table, print summaries and run both callers.

    Two rates are added: ``eq_rate`` = ``eq / (eq + diff + ins + del)`` and
    ``eq_rate2`` = ``eq / depth``. Rows are sorted by ``eq_rate2`` ascending.
    Printed, in order: the first 2 raw rows, the first 100 sorted rows, and
    the fraction of rows with ``eq_rate2 < 0.5``. The sorted frame is then
    handed to ``variant_calling`` and ``del_calling``.
    """
    raw = pl.read_csv(filepath, separator="\t")
    print(raw.head(2))

    all_events = pl.col("eq") + pl.col("diff") + pl.col("ins") + pl.col("del")
    rate_columns = [
        (pl.col("eq") / all_events).alias("eq_rate"),
        (pl.col("eq") / pl.col("depth")).alias("eq_rate2"),
    ]
    df = raw.with_columns(rate_columns).sort(by=["eq_rate2"], descending=[False])

    print(df.head(100))

    low_accuracy_share = (pl.col("eq_rate2") < 0.5).sum() / pl.len()
    print(df.select(low_accuracy_share))

    variant_calling(df=df)
    del_calling(df=df)
|
|
63
|
+
|
|
64
|
+
# print(df.head(200))
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
if __name__ == "__main__":
    polars_init.polars_env_init()

    # Command line: one positional path to the locus table.
    cli = argparse.ArgumentParser(prog="")
    cli.add_argument("fp", metavar="fact_aligned_bam_ref_locus_info.csv")
    ana(filepath=cli.parse_args().fp)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import pysam
|
|
2
|
+
from typing import Mapping, Tuple
|
|
3
|
+
from tqdm import tqdm
|
|
4
|
+
import polars as pl
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def read_bam_file(bam_file: str) -> Mapping[str, Tuple[str, str]]:
    """Load every record of a BAM file into a dict keyed by query name.

    Args:
        bam_file: path of the BAM to read; it is read to end of file with
            ``check_sq=False``.

    Returns:
        ``{query_name: (query_sequence, "00_" + query_name)}``. Both tuple
        members are strings. A query name that occurs more than once keeps
        only its last record.
    """
    res = {}
    with pysam.AlignmentFile(bam_file, mode="rb", threads=40, check_sq=False) as bam_h:
        for record in tqdm(
            bam_h.fetch(until_eof=True), desc=f"read_bam_file:>> reading {bam_file}"
        ):
            res[record.query_name] = (record.query_sequence, f"00_{record.query_name}")

    return res
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def read_fastx_file(fname: str) -> Mapping[str, Tuple[str, str]]:
    """Load every entry of a FASTA/FASTQ file into a dict keyed by entry name.

    Args:
        fname: path of the file, opened with ``pysam.FastxFile``.

    Returns:
        ``{name: (sequence, "00_" + name)}``. Both tuple members are strings.
        A name that occurs more than once keeps only its last entry.
    """
    res = {}
    # The context manager closes the handle, which the bare constructor call
    # left open.
    with pysam.FastxFile(fname) as fh:
        for entry in fh:
            res[entry.name] = (entry.sequence, f"00_{entry.name}")
    return res
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def q2phreq_expr(inp_name, oup_name=None):
    """Build a polars expression computing ``-10 * log10(1 - value)``.

    ``value`` is column ``inp_name``; values above ``1 - 1e-6`` are replaced
    by ``1 - 1e-6`` first, so the logarithm never sees zero. The result is
    aliased to ``oup_name``, or to ``inp_name`` when ``oup_name`` is ``None``.
    """
    if oup_name is None:
        oup_name = inp_name

    value = pl.col(inp_name)
    capped = pl.when(value > (1 - 1e-6)).then(1 - 1e-6).otherwise(value)
    error_rate = 1 - capped
    return (-10.0 * error_rate.log10()).alias(oup_name)
|
|
File without changes
|
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
"""
|
|
2
|
+
this script is used to visualize the msa.
|
|
3
|
+
the result can be used in two ways:
|
|
4
|
+
* copy the fasta info and paste it into jalview, which is then used for the plot. OR
|
|
5
|
+
* generate the msa picture directly
|
|
6
|
+
|
|
7
|
+
the whole pipeline is:
|
|
8
|
+
1) do alignment (pairwise alignment) mini_align -P -m -r ref_filepath.fa -i inp.fa -t 1 -p calls2draft
|
|
9
|
+
2) use this script to generate msa pic
|
|
10
|
+
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import pysam
|
|
15
|
+
import numpy as np
|
|
16
|
+
import tempfile
|
|
17
|
+
import os
|
|
18
|
+
from typing import Mapping
|
|
19
|
+
from tqdm import tqdm
|
|
20
|
+
from typing import Dict
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def read_fastx_file(fname: str):
    """Load every entry of a FASTA/FASTQ file into a ``{name: sequence}`` dict.

    Args:
        fname: path of the file, opened with ``pysam.FastxFile``.

    Returns:
        A dict mapping each entry name to its sequence. A name that occurs
        more than once keeps only its last entry.
    """
    # The context manager closes the handle, which the bare constructor call
    # left open.
    with pysam.FastxFile(fname) as fh:
        return {entry.name: entry.sequence for entry in fh}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def read_bam_file(bam_file: str) -> Mapping[str, str]:
    """Load every record of a BAM file into a ``{query_name: query_sequence}`` dict.

    The file is read to end of file with ``check_sq=False``; a query name that
    occurs more than once keeps only its last record.
    """
    progress_label = f"read_bam_file:>> reading {bam_file}"
    with pysam.AlignmentFile(bam_file, mode="rb", threads=40, check_sq=False) as reader:
        name2seq = {
            aln.query_name: aln.query_sequence
            for aln in tqdm(reader.fetch(until_eof=True), desc=progress_label)
        }

    return name2seq
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Name2Seq:
    """Name-to-sequence lookup loaded from a FASTA/FASTQ or BAM file."""

    # File-name endings routed to the FASTA/FASTQ reader.
    _FASTX_ENDINGS = ("fa", "fasta", "fna", "fq", "fastq")

    def __init__(self, fname: str):
        """Pick the reader from the ending of ``fname``; unknown endings raise ``ValueError``."""
        if fname.endswith(self._FASTX_ENDINGS):
            loader = read_fastx_file
        elif fname.endswith("bam"):
            loader = read_bam_file
        else:
            raise ValueError(f"invalid file format, {fname}")
        self.qname2seq = loader(fname)

    def fetch(self, name):
        """Return the sequence stored under ``name`` (``KeyError`` if it is absent)."""
        return self.qname2seq[name]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def build_query_name(align_seg: pysam.AlignedSegment):
    """Return ``<query_name>_SE_<query_alignment_start>_<query_alignment_end>`` for a record."""
    name = align_seg.query_name
    aln_start = align_seg.query_alignment_start
    aln_end = align_seg.query_alignment_end
    return "{}_SE_{}_{}".format(name, aln_start, aln_end)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def init_matrix(num_rows, num_cols):
    """Return a ``num_rows`` x ``num_cols`` array of ``np.str_`` cells, each set to "."."""
    return np.full([num_rows, num_cols], ".", dtype=np.str_)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class ResultMatrix:
    """Character matrix holding a reference row plus one row per aligned query.

    Row 0 is the reference; queries occupy rows 1..N in sorted name order.
    Every reference position owns a run of columns whose width is taken from
    ``refpos2length``; cells that are never written keep the "." filler.
    """

    def __init__(
        self,
        ref_start,
        ref_end,
        query_names,
        refpos2length: Dict[int, int],
        ref_name=None,
    ) -> None:
        """Allocate the matrix and the reference-position-to-column lookup.

        Args:
            ref_start: first reference position of the window (inclusive).
            ref_end: end of the window (exclusive).
            query_names: row names of the queries; stored sorted.
            refpos2length: number of columns reserved for each reference
                position. May be empty, which yields a zero-column matrix.
            ref_name: label of the reference row, ``"REF"`` when ``None``;
                ``_<ref_start>_<ref_end>`` is appended to it.
        """
        self.num_records = len(query_names) + 1  # +ref
        self.query_names = sorted(query_names)
        self.query2idx = {
            query_name: idx for idx, query_name in enumerate(self.query_names, start=1)
        }
        self.ref_name = "REF" if ref_name is None else ref_name
        self.ref_name = f"{self.ref_name}_{ref_start}_{ref_end}"
        self.matrix = self.init_matrix(self.num_records, sum(refpos2length.values()))

        # First column of each reference position = total width of all
        # positions before it (running sum in position order).
        refpos2length_list = sorted(refpos2length.items())
        rpos2matrix_col = []
        next_col = 0
        for rpos, length in refpos2length_list:
            rpos2matrix_col.append([rpos, next_col])
            next_col += length

        print(refpos2length_list)
        print(rpos2matrix_col)
        self.rpos2matrix_col = {
            rpos: matrix_col for rpos, matrix_col in rpos2matrix_col
        }

        self.ref_end = ref_end  # exclusive
        self.ref_start = ref_start

        print(f"self.end={self.ref_end}")

    def update(self, record: "pysam.AlignedSegment", ref: str = None):
        """Write one alignment into its row and fill the reference row on the way.

        Args:
            record: alignment whose row is looked up through
                ``build_query_name(record)``.
            ref: reference sequence indexed by reference position. It is
                indexed for every in-window aligned position, so in practice
                it must be supplied.

        Raises:
            KeyError: the record's row name was not passed to the constructor.
            ValueError: an in-window reference position has no column.

        Prints the query interval covered inside the window and, when the
        record has a ``be`` tag, the same interval shifted by its first value.
        """
        idx = self.query2idx[build_query_name(record)]

        rpos_cursor = None
        offset = 0
        query_seq = record.query_sequence

        qpos_start = None
        qpos_end = None

        for qpos, rpos in record.get_aligned_pairs():
            if rpos is not None:
                rpos_cursor = rpos
            if rpos_cursor is None:
                continue
            if rpos_cursor < self.ref_start or rpos_cursor >= self.ref_end:
                continue

            if qpos is not None:
                if qpos_start is None:
                    qpos_start = qpos
                qpos_end = qpos

            if rpos_cursor not in self.rpos2matrix_col:
                raise ValueError(
                    f"reference position {rpos_cursor} has no matrix column "
                    f"(window {self.ref_start}-{self.ref_end})"
                )

            matrix_init_col = self.rpos2matrix_col[rpos_cursor]
            if rpos is None:
                # Query base without a reference base: next free column after
                # the current reference position.
                offset += 1
            else:
                offset = 0
                self.matrix[0, matrix_init_col] = ref[rpos]

            if qpos is not None:
                self.matrix[idx, matrix_init_col + offset] = query_seq[qpos]

        if qpos_start is None:
            # No query base fell inside the window (for example the overlap
            # is a pure deletion), so there is no interval to convert.
            print(f"{record.query_name}: sbr:None-None, called:None-None")
            return

        seq_len = record.query_length
        if record.is_reverse:
            qpos_start, qpos_end = seq_len - qpos_end, seq_len - qpos_start

        called_start = None
        called_end = None
        if record.has_tag("be"):
            shift = record.get_tag("be")[0]
            called_start = qpos_start + shift
            called_end = qpos_end + shift

        print(
            f"{record.query_name}: sbr:{qpos_start}-{qpos_end}, called:{called_start}-{called_end}"
        )

    def get_raw_result(self):
        """Return the underlying matrix object (not a copy)."""
        return self.matrix

    def get_query_names(self):
        """Return the row names: the reference label first, then the sorted query names."""
        return [self.ref_name, *self.query_names]

    def get_result(self):
        """Return the matrix; no trimming is applied, this is the raw matrix."""
        return self.get_raw_result()

    def get_result_str(self):
        """Render the matrix as FASTA-style text: a ``>name`` line, then the row's characters."""
        res = self.get_result()
        names = self.get_query_names()
        result_strs = []
        for row_idx in range(res.shape[0]):
            result_strs.append(f">{names[row_idx]}")
            result_strs.append("".join(res[row_idx].tolist()))

        return "\n".join(result_strs)

    @staticmethod
    def init_matrix(num_rows, num_cols):
        """Return a ``num_rows`` x ``num_cols`` array of ``np.str_`` cells, each set to "."."""
        matrix = np.empty(shape=[num_rows, num_cols], dtype=np.str_)
        matrix.fill(".")
        return matrix
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def extract_reference(ref_filename):
    """Read the first record of a FASTA file.

    Args:
        ref_filename: path of a UTF-8 text FASTA file.

    Returns:
        ``(name, sequence)``. ``name`` is the header line up to its first
        space, without the leading character and surrounding whitespace.
        ``sequence`` is the record's lines joined with line endings removed;
        reading stops at the next header line.

    Raises:
        ValueError: the file is empty.
    """
    with open(ref_filename, mode="r", encoding="utf8") as file:
        header = file.readline()
        if not header:
            raise ValueError(f"empty reference file: {ref_filename}")
        ref_name = header.split(" ")[0][1:].strip()

        seq_parts = []
        for line in file:
            if line.startswith(">"):
                # Start of the next record: the first record is complete.
                break
            seq_parts.append(line.strip())

    return (ref_name, "".join(seq_parts))
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def build_ref_pos_maxins(
    sam_file: "pysam.AlignmentFile", contig: str, ref_start: int, ref_end: int
):
    """Find, per reference position, the longest run of query-only bases that follows it.

    Args:
        sam_file: object whose ``fetch(contig=...)`` yields alignments that
            provide ``get_aligned_pairs()``.
        contig: contig name passed to ``fetch``.
        ref_start: first reference position of the window (inclusive).
        ref_end: end of the window (exclusive).

    Returns:
        A dict with one key per position in ``[ref_start, ref_end)``. The
        value is the maximum, over all alignments, of the number of
        consecutive pairs without a reference position seen directly after
        that position.

    A run is credited to the reference position that precedes it. That also
    holds for a run at the very end of an alignment that stops inside the
    window, which is where ``ResultMatrix.update`` places those bases.
    Alignments that never reach a reference position are ignored.
    """
    rpos2max_ins = {rpos: 0 for rpos in range(ref_start, ref_end)}

    for query in sam_file.fetch(contig=contig):
        rpos_cursor = None
        cur_query_ins = 0
        for _, rpos in query.get_aligned_pairs():
            if rpos is not None:
                rpos_cursor = rpos

            # Skip everything before the window, including leading pairs that
            # have no reference position yet.
            if rpos_cursor is None or rpos_cursor < ref_start:
                continue

            if rpos_cursor >= ref_end:
                break

            if rpos is None:
                cur_query_ins += 1
                continue

            # A reference position closes the run that followed the previous
            # position; at ref_start there is no previous in-window position.
            if rpos_cursor > ref_start:
                rpos2max_ins[rpos_cursor - 1] = max(
                    cur_query_ins, rpos2max_ins[rpos_cursor - 1]
                )
            cur_query_ins = 0

        if cur_query_ins == 0 or rpos_cursor is None:
            continue

        # A run is still open. If the walk left the window it followed
        # ref_end - 1; otherwise the alignment ended inside the window and
        # the run followed its last reference position.
        anchor = min(rpos_cursor, ref_end - 1)
        rpos2max_ins[anchor] = max(cur_query_ins, rpos2max_ins[anchor])

    return rpos2max_ins
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def bam2fa4jalview(
    aligned_bam_filename,
    ref_filename,
    ref_name,
    interested_ref_start=None,
    interested_ref_end=None,
    threads=40,
):
    """generate the fasta info that can be used in jalview from bam file and [ref file]

    Params:
        aligned_bam_filename: path of the aligned bam file; it is read with
            ``fetch(contig=ref_name)``.
        ref_filename: file handed to ``Name2Seq`` to look up the reference sequence.
        ref_name: name of the reference; used both as the contig to fetch and
            as the key for the reference sequence.
        interested_ref_start: start (inclusive) of the reference window. When
            None, the smallest ``reference_start`` of the selected reads is used.
        interested_ref_end: end (exclusive) of the reference window. When
            None, the largest ``reference_end`` of the selected reads is used.
        threads: number of threads passed to ``pysam.AlignmentFile``.

    Returns:
        the ``ResultMatrix`` updated with every read overlapping the window.
    """
    # The context manager closes the bam file even if a later step raises.
    with pysam.AlignmentFile(
        aligned_bam_filename, mode="rb", threads=threads
    ) as samfile:
        fastx_data = Name2Seq(ref_filename)
        ref_seq = fastx_data.fetch(ref_name)

        ref_start = 2**32
        ref_end = 0
        query_names = []

        # First pass: collect the selected reads and the reference span they cover.
        for query in samfile.fetch(contig=ref_name):
            if not _overlaps_ref_window(
                query, interested_ref_start, interested_ref_end
            ):
                continue
            ref_start = min(query.reference_start, ref_start)
            ref_end = max(query.reference_end, ref_end)
            query_names.append(build_query_name(query))

        if interested_ref_start is None:
            interested_ref_start = ref_start
        if interested_ref_end is None:
            interested_ref_end = ref_end

        print(
            f"interested_ref_start={interested_ref_start}, interested_ref_end={interested_ref_end}"
        )

        rpos2maxins = build_ref_pos_maxins(
            samfile,
            contig=ref_name,
            ref_start=interested_ref_start,
            ref_end=interested_ref_end,
        )

        # Every reference position gets one column plus room for its longest run.
        rpos2length = {pos: ins + 1 for pos, ins in rpos2maxins.items()}

        result_matrix = ResultMatrix(
            ref_start=interested_ref_start,
            ref_end=interested_ref_end,
            query_names=query_names,
            refpos2length=rpos2length,
            ref_name=ref_name,
        )

        # Second pass: same selection as the first pass, now filling the matrix.
        for query in samfile.fetch(contig=ref_name):
            if _overlaps_ref_window(query, interested_ref_start, interested_ref_end):
                result_matrix.update(query, ref=ref_seq)

    return result_matrix


def _overlaps_ref_window(query, ref_start, ref_end):
    """tell whether a read overlaps the half-open reference window [ref_start, ref_end)

    A bound that is None leaves that side of the window open. A read whose
    ``reference_end`` is None has no usable alignment and never overlaps.
    """
    if query.reference_end is None:
        return False
    if ref_start is not None and query.reference_end <= ref_start:
        return False
    if ref_end is not None and query.reference_start >= ref_end:
        return False
    return True
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def plot_msa_align(inp_filename, oup_filename=None):
    """draw the msa stored in a fasta file and save the picture

    Params:
        inp_filename: fasta file handed to ``MsaViz``.
        oup_filename: where the figure is saved; when None the picture is
            written next to the input as "<inp_filename>.png".
    """

    from pymsaviz import MsaViz

    viz_options = {
        "wrap_length": 150,
        "show_count": True,
        "show_grid": True,
        "color_scheme": "Identity",
    }
    figure = MsaViz(inp_filename, **viz_options)

    target = f"{inp_filename}.png" if oup_filename is None else oup_filename
    figure.savefig(target)
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def main(args):
    """build the msa fasta from the parsed command line and optionally plot it

    Params:
        args: parsed arguments providing ``bam``, ``ref_fasta``, ``ref_name``,
            ``start``, ``end``, ``o_fasta`` and ``o_pic``.

    The fasta text is written to ``args.o_fasta`` when given, otherwise it is
    printed to stdout. When ``args.o_pic`` is given the msa is plotted from the
    fasta file, or from a temporary file when no fasta file was requested.
    """
    res = bam2fa4jalview(
        args.bam,
        ref_filename=args.ref_fasta,
        ref_name=args.ref_name,
        interested_ref_start=args.start,
        interested_ref_end=args.end,
    )

    res_str = res.get_result_str()
    if args.o_fasta is not None:
        with open(args.o_fasta, "w", encoding="utf8") as file:
            file.write(res_str)
    else:
        print(res_str)

    if args.o_pic is None:
        return

    if args.o_fasta is not None:
        plot_msa_align(inp_filename=args.o_fasta, oup_filename=args.o_pic)
        return

    # The plot function needs a file path, so write the fasta to a temp file.
    # delete=False keeps the file after the handle is closed; it is removed
    # explicitly below.
    with tempfile.NamedTemporaryFile(mode="w", encoding="utf8", delete=False) as tmp:
        tmp.write(res_str)

    try:
        print(f"temp file name: {tmp.name}")
        plot_msa_align(inp_filename=tmp.name, oup_filename=args.o_pic)
    finally:
        # Remove the temp file even when plotting fails.
        os.remove(tmp.name)
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
if __name__ == "__main__":
    # Command line entry point: parse the options and delegate to main().
    p = argparse.ArgumentParser(
        "msa_view",
        description="""
        the whole pipeline is:
        1) do alignment (pairwise alignment), minimap2 is an option;
        2) use this script to generate msa pic
        """,
    )
    p.add_argument("--bam", type=str, help="subreads2smc alignment", required=True)
    p.add_argument(
        "--ref-fastx-or-bam",
        type=str,
        help="smc.fa/smc.fq/.bam,",
        required=True,
        dest="ref_fasta",
    )

    p.add_argument("--ref-name", required=True, type=str, dest="ref_name")
    # Optional reference window; each bound is None when not given.
    p.add_argument(
        "--ref-start", type=int, default=None, help="contig start", dest="start"
    )
    p.add_argument("--ref-end", type=int, default=None, help="contig end", dest="end")

    p.add_argument(
        "--o-fasta",
        default=None,
        help="output fasta file, if not provided, this content will be output to the stdout",
        dest="o_fasta",
    )
    p.add_argument(
        "--o-pic",
        default=None,
        help="visualization picture file path, if not provided, the plot procedure will be skipped",
        dest="o_pic",
    )
    args = p.parse_args()
    main(args)
|