gseda 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gseda-0.0.1/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ **/target/*
2
+ ana/*
3
+ *.png
gseda-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.3
2
+ Name: gseda
3
+ Version: 0.0.1
4
+ Summary: gsetl
5
+ Project-URL: Homepage, https://github.com/keithyin/gsda
6
+ Project-URL: Issues, https://github.com/keithyin/gsda/issues
7
+ Author-email: keithyin <yinpenghhz@hotmail.com>
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3
11
+ Requires-Python: >=3.8
12
+ Description-Content-Type: text/markdown
13
+
14
+ gseda
15
+
16
+
17
+ how to pack: https://cloud.tencent.com/developer/article/2401343
gseda-0.0.1/README.md ADDED
@@ -0,0 +1,4 @@
1
+ gseda
2
+
3
+
4
+ how to pack: https://cloud.tencent.com/developer/article/2401343
@@ -0,0 +1,23 @@
1
+ [project]
2
+ name = "gseda"
3
+ version = "0.0.1"
4
+ authors = [
5
+ { name="keithyin", email="yinpenghhz@hotmail.com" },
6
+ ]
7
+ description = "gsetl"
8
+ readme = "README.md"
9
+ requires-python = ">=3.8"
10
+ classifiers = [
11
+ "Programming Language :: Python :: 3",
12
+ "License :: OSI Approved :: MIT License",
13
+ "Operating System :: OS Independent",
14
+ ]
15
+
16
+ [project.urls]
17
+ Homepage = "https://github.com/keithyin/gsda"
18
+ Issues = "https://github.com/keithyin/gsda/issues"
19
+
20
+ # Build backend; Hatchling is the default
21
+ [build-system]
22
+ requires = ["hatchling"]
23
+ build-backend = "hatchling.build"
File without changes
File without changes
@@ -0,0 +1,47 @@
1
+ import pysam
2
+ from typing import List, Set
3
+ import argparse
4
+ import pysam.samtools
5
+ from tqdm import tqdm
6
+
7
+
8
def extract_non_multiple_mapping_reads(bam_file: str) -> Set[str]:
    """Collect the query names that appear on exactly one fetched record.

    Args:
        bam_file: Path of the BAM file to scan.

    Returns:
        The set of query names whose record count in ``bam_file`` is 1.
    """
    # Local import keeps this block self-contained; Counter replaces the
    # hand-written "initialise to 0, then increment" dictionary.
    from collections import Counter

    with pysam.AlignmentFile(filename=bam_file, mode="rb", threads=40) as bam_h:
        qname_cnts = Counter(
            record.query_name
            for record in tqdm(
                bam_h.fetch(),
                desc=f"extract_non_multiple_mapping_reads from {bam_file}",
            )
        )
    return {qname for qname, cnt in qname_cnts.items() if cnt == 1}
19
+
20
+
21
def dump_non_multiple_mapping_reads(bam_file: str, qname_set):
    """Copy the records named in ``qname_set`` into a new BAM and index it.

    The output path is ``bam_file`` with everything after its last ``.``
    replaced by ``non_multiple_mapping_reads.bam``.

    Args:
        bam_file: Path of the source BAM file.
        qname_set: Container of query names to keep (membership-tested).
    """
    stem = bam_file.rsplit(".", maxsplit=1)[0]
    o_bam_file = "{}.non_multiple_mapping_reads.bam".format(stem)

    with pysam.AlignmentFile(filename=bam_file, mode="rb", threads=40) as src:
        with pysam.AlignmentFile(
            filename=o_bam_file, mode="wb", threads=40, header=src.header
        ) as dst:
            progress = tqdm(
                src.fetch(), desc=f"dump_non_multiple_mapping_reads to {o_bam_file}"
            )
            for record in progress:
                if record.query_name not in qname_set:
                    continue
                dst.write(record)

    # Index only after both handles are closed so the output is complete.
    pysam.samtools.index("-@", "40", "-b", o_bam_file)
37
+
38
+
39
def main(args):
    """Keep only the reads of ``args.bam`` whose name occurs once and dump them."""
    bam_path = args.bam
    unique_qnames = extract_non_multiple_mapping_reads(bam_path)
    dump_non_multiple_mapping_reads(bam_path, unique_qnames)
42
+
43
+
44
if __name__ == "__main__":
    # Command-line entry point: a single positional BAM path.
    cli = argparse.ArgumentParser(prog="")
    cli.add_argument("bam")
    cli_args = cli.parse_args()
    main(args=cli_args)
File without changes
@@ -0,0 +1,45 @@
1
+ import pysam
2
+ import os
3
+ import sys
4
+ from tqdm import tqdm
5
+ import argparse
6
+
7
+ pre_dir = os.path.abspath(__file__).rsplit("/", maxsplit=2)[0]
8
+ sys.path.append(pre_dir)
9
+
10
+
11
def insert_cs_to_sbr_bam(sbr_bam: str, smc_bam: str):
    """Write ``<sbr stem>.with-cs.bam``: all sbr records plus one record per smc read.

    Every record of ``sbr_bam`` is copied unchanged. For every record of
    ``smc_bam`` a fresh segment is appended whose name is the original name
    prefixed with ``00_``, carrying the same sequence and the ``ch`` tag.

    Args:
        sbr_bam: Path of the BAM whose records and header are copied.
        smc_bam: Path of the BAM whose reads are appended as new records.
    """
    stem = sbr_bam.rsplit(".", maxsplit=1)[0]
    out_bam = "{}.with-cs.bam".format(stem)

    with pysam.AlignmentFile(sbr_bam, mode="rb", threads=40, check_sq=False) as sbr_h:
        with pysam.AlignmentFile(
            out_bam, mode="wb", threads=40, check_sq=False, header=sbr_h.header
        ) as out_h:
            # Pass 1: copy the sbr records as they are.
            for sbr_record in tqdm(
                sbr_h.fetch(until_eof=True), desc=f"dumping1 {out_bam}"
            ):
                out_h.write(sbr_record)

            # Pass 2: append one synthetic record per smc read.
            with pysam.AlignmentFile(
                smc_bam, mode="rb", threads=40, check_sq=False
            ) as smc_h:
                for smc_record in tqdm(
                    smc_h.fetch(until_eof=True), desc=f"dumping2 {out_bam}"
                ):
                    new_record = pysam.AlignedSegment()
                    new_record.query_name = f"00_{smc_record.query_name}"
                    new_record.query_sequence = smc_record.query_sequence
                    channel = int(smc_record.get_tag("ch"))
                    new_record.set_tag("ch", channel, value_type="I")
                    out_h.write(new_record)
33
+
34
+
35
def main(args):
    """Run :func:`insert_cs_to_sbr_bam` on ``args.sbr_bam`` and ``args.smc_bam``."""
    sbr_path, smc_path = args.sbr_bam, args.smc_bam
    insert_cs_to_sbr_bam(sbr_path, smc_path)
38
+
39
+
40
if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog="")
    # Both paths are mandatory: without them main() would receive None and
    # fail deep inside the BAM handling instead of with a usage message.
    parser.add_argument("--sbr-bam", dest="sbr_bam", required=True)
    parser.add_argument("--smc-bam", dest="smc_bam", required=True)

    main(args=parser.parse_args())
@@ -0,0 +1,58 @@
1
+ import pysam
2
+ import os
3
+ import sys
4
+ from tqdm import tqdm
5
+ import argparse
6
+
7
+ pre_dir = os.path.abspath(__file__).rsplit("/", maxsplit=2)[0]
8
+ sys.path.append(pre_dir)
9
+
10
+ import gseda.utils as gseda_utils
11
+
12
+
13
def insert_ref_to_sbr_bam(sbr_bam: str, smc2ref_bam: str, ref_file: str):
    """Write ``<sbr stem>.with-ref.bam``: all sbr records plus one reference slice per channel.

    For each record of ``smc2ref_bam`` the aligned reference interval is
    widened (10 bases before the start, 11 past the end, clamped to the
    reference) and stored under the record's ``ch`` tag. The sbr records are
    then copied and one ``09_REF_<ch>`` record per channel is appended.

    Args:
        sbr_bam: Path of the BAM whose records and header are copied.
        smc2ref_bam: Path of the BAM providing the per-channel alignments.
        ref_file: FASTA/FASTQ file read via ``gseda_utils.read_fastx_file``.
    """
    out_bam = "{}.with-ref.bam".format(sbr_bam.rsplit(".", maxsplit=1)[0])
    ref_data = gseda_utils.read_fastx_file(ref_file)

    channel_refs = {}
    with pysam.AlignmentFile(smc2ref_bam, mode="rb", threads=40) as aln_h:
        for aln in tqdm(aln_h.fetch(), desc=f"extracting channel ref"):
            ch = int(aln.get_tag("ch"))
            aln_start = aln.reference_start
            aln_end = aln.reference_end
            # ref_data values are indexed with [0] to obtain the sequence.
            full_ref_seq = ref_data[aln.reference_name][0]

            window_start = max(0, aln_start - 10)
            window_end = min(len(full_ref_seq), aln_end + 11)
            window_seq = full_ref_seq[window_start:window_end]

            # Each channel is expected to contribute exactly one alignment.
            assert ch not in channel_refs
            channel_refs[ch] = window_seq

    with pysam.AlignmentFile(sbr_bam, mode="rb", threads=40, check_sq=False) as sbr_h:
        with pysam.AlignmentFile(
            out_bam, mode="wb", threads=40, check_sq=False, header=sbr_h.header
        ) as out_h:
            for sbr_record in tqdm(
                sbr_h.fetch(until_eof=True), desc=f"dumping1 {out_bam}"
            ):
                out_h.write(sbr_record)

            for ch, ref_seq in tqdm(channel_refs.items(), desc=f"dumping2 {out_bam}"):
                ref_record = pysam.AlignedSegment()
                ref_record.query_name = f"09_REF_{ch}"
                ref_record.query_sequence = ref_seq
                ref_record.set_tag("ch", ch, value_type="I")
                out_h.write(ref_record)
45
+
46
+
47
def main(args):
    """Run :func:`insert_ref_to_sbr_bam` with the paths held by ``args``."""
    insert_ref_to_sbr_bam(
        args.sbr_bam,
        args.smc2ref_bam,
        args.ref_file,
    )
50
+
51
+
52
if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog="")
    # All three paths are mandatory: a missing one would otherwise reach
    # main() as None and fail inside the BAM/FASTA handling.
    parser.add_argument("--sbr-bam", dest="sbr_bam", required=True)
    parser.add_argument("--smc2ref-bam", dest="smc2ref_bam", required=True)
    parser.add_argument("--ref-file", dest="ref_file", required=True)

    main(args=parser.parse_args())
File without changes
@@ -0,0 +1,8 @@
1
+ import os
2
+
3
+
4
def polars_env_init():
    """Set the ``POLARS_FMT_*`` environment variables used by the analysis scripts."""
    fmt_settings = {
        "POLARS_FMT_TABLE_ROUNDED_CORNERS": "1",
        "POLARS_FMT_MAX_COLS": "100",
        "POLARS_FMT_MAX_ROWS": "300",
        "POLARS_FMT_STR_LEN": "100",
    }
    for key, value in fmt_settings.items():
        os.environ[key] = value
@@ -0,0 +1,186 @@
1
+ import os
2
+ import sys
3
+
4
+ cur_dir = os.path.abspath(__file__).rsplit("/", maxsplit=1)[0]
5
+ sys.path.insert(0, cur_dir)
6
+
7
+ import pysam
8
+ import utils
9
+ import polars as pl
10
+ import argparse
11
+ from tqdm import tqdm
12
+ import seaborn as sns
13
+ import matplotlib.pyplot as plt
14
+
15
+
16
class BaseQStat:
    """Counters of alignment outcomes for one predicted base-quality value."""

    def __init__(self, pred_q):
        # The quality value these counters belong to.
        self.pred_q = pred_q
        # All tallies start at zero; ``depth`` is never updated by this class.
        self.eq = self.diff = self.insertion = self.deletion = self.depth = 0

    def add_eq(self, num=1):
        """Increase the match counter by ``num``."""
        self.eq = self.eq + num

    def add_diff(self, num=1):
        """Increase the mismatch counter by ``num``."""
        self.diff = self.diff + num

    def add_insertion(self, num=1):
        """Increase the insertion counter by ``num``."""
        self.insertion = self.insertion + num

    def add_deletion(self, num=1):
        """Increase the deletion counter by ``num``."""
        self.deletion = self.deletion + num
36
+
37
+
38
def stat_one_record(aligned_pairs, qual, baseq2baseq_stat, query_seq, refseq, ref_end):
    """Tally one alignment's matches, mismatches, insertions and deletions per base quality.

    Pairs are skipped until both a query and a reference position have been
    seen. Each remaining pair is charged to the quality of the most recent
    query position, and the walk stops once the last reference position
    (``ref_end - 1``) has been handled.

    Args:
        aligned_pairs: Iterable of ``(qpos, rpos)`` pairs; either may be ``None``.
        qual: Per-base qualities, indexed by query position.
        baseq2baseq_stat: Dict ``quality -> BaseQStat``; updated in place.
        query_seq: Query sequence, indexed by query position.
        refseq: Reference sequence, indexed by reference position.
        ref_end: Exclusive end of the alignment on the reference.
    """
    ref_pos_cursor = None
    query_pos_cursor = None

    for qpos, rpos in aligned_pairs:
        if qpos is not None:
            query_pos_cursor = qpos
        if rpos is not None:
            ref_pos_cursor = rpos

        if ref_pos_cursor is None or query_pos_cursor is None:
            continue

        baseq = qual[query_pos_cursor]
        # Create the counter only when the quality is new, instead of building
        # a throwaway BaseQStat for every aligned pair.
        baseq_stat = baseq2baseq_stat.get(baseq)
        if baseq_stat is None:
            baseq_stat = baseq2baseq_stat[baseq] = BaseQStat(baseq)

        if rpos is None:
            baseq_stat.add_insertion()
        elif qpos is None:
            baseq_stat.add_deletion()
        elif refseq[rpos] == query_seq[qpos]:
            baseq_stat.add_eq()
        else:
            baseq_stat.add_diff()

        if ref_pos_cursor == (ref_end - 1):
            break
75
+
76
+
77
def stat(aligned_bam_file: str, ref_file: str):
    """Aggregate per-base-quality alignment outcomes over a whole BAM file.

    For every reference in ``ref_file`` the primary, mapped records aligned
    to it are passed to :func:`stat_one_record`.

    Args:
        aligned_bam_file: Path of the aligned BAM (fetched per contig).
        ref_file: Reference FASTA/FASTQ read via ``utils.read_fastx_file``.

    Returns:
        A polars DataFrame with the columns ``baseq``, ``eq``, ``diff``,
        ``ins`` and ``del``, one row per observed base quality.
    """
    ref_data = utils.read_fastx_file(ref_file)

    baseq2baseq_stat = {}

    with pysam.AlignmentFile(aligned_bam_file, mode="rb") as bam_h:
        for refname, ref_entry in ref_data.items():
            # NOTE(review): utils.read_fastx_file appears to map each name to a
            # (sequence, label) tuple; the sequence must be unwrapped before it
            # is indexed by reference position. A plain string is accepted too.
            refseq = ref_entry[0] if isinstance(ref_entry, tuple) else ref_entry

            for record in tqdm(
                bam_h.fetch(contig=refname), desc=f"processing {refname}"
            ):
                if record.is_secondary or record.is_supplementary or record.is_unmapped:
                    continue

                stat_one_record(
                    record.get_aligned_pairs(),
                    record.query_qualities,
                    baseq2baseq_stat,
                    record.query_sequence,
                    refseq,
                    record.reference_end,
                )

    baseqs = []
    eqs = []
    diffs = []
    insertions = []
    deletions = []

    for bq, bq_stat in baseq2baseq_stat.items():
        baseqs.append(bq)
        eqs.append(bq_stat.eq)
        diffs.append(bq_stat.diff)
        insertions.append(bq_stat.insertion)
        deletions.append(bq_stat.deletion)

    return pl.DataFrame(
        {"baseq": baseqs, "eq": eqs, "diff": diffs, "ins": insertions, "del": deletions}
    )
160
+
161
+
162
def main(args):
    """Compute the per-quality statistics, print the first rows and save a scatter plot.

    Adds ``emp_rq = eq / (eq + diff + ins)`` and its ``emp_phreq``
    transform (via ``utils.q2phreq_expr``), plots ``emp_phreq`` against
    ``baseq`` and saves the figure under the name ``baseq2empq``.
    """
    counts = stat(args.aln_bam, args.ref_file)

    emp_rq = pl.col("eq") / (pl.col("eq") + pl.col("diff") + pl.col("ins"))
    with_rq = counts.with_columns([emp_rq.alias("emp_rq")])
    df = with_rq.with_columns([utils.q2phreq_expr("emp_rq", "emp_phreq")])

    figure = plt.figure(figsize=(20, 10))
    axs = figure.add_subplot(1, 1, 1)
    sns.scatterplot(df.to_pandas(), x="baseq", y="emp_phreq", ax=axs)

    print(df.head(10))
    figure.savefig(fname="baseq2empq")
177
+
178
+
179
if __name__ == "__main__":
    # The former hard-coded paths are kept as defaults, so running the script
    # without arguments behaves as before while other inputs can be supplied.
    parser = argparse.ArgumentParser(
        description="Per-base-quality accuracy statistics from an aligned BAM."
    )
    parser.add_argument(
        "--aln-bam",
        dest="aln_bam",
        default="/data/ccs_data/ccs_eval2024q4/output-all/smc2ref.bam",
        help="aligned BAM file to analyse",
    )
    parser.add_argument(
        "--ref-file",
        dest="ref_file",
        default="/data/ccs_data/MG1655.fa",
        help="reference FASTA the BAM was aligned to",
    )

    main(parser.parse_args())
@@ -0,0 +1,64 @@
1
+ import pysam
2
+ import os
3
+ import sys
4
+
5
+ cur_dir = os.path.abspath(__file__).rsplit("/", maxsplit=1)[0]
6
+ sys.path.insert(0, cur_dir)
7
+
8
+ import utils
9
+ import polars as pl
10
+ import argparse
11
+ from tqdm import tqdm
12
+ import seaborn as sns
13
+ import matplotlib.pyplot as plt
14
+
15
+
16
+ import polars_init
17
+
18
+
19
def main(args):
    """Plot empirical against predicted base quality from a tab-separated stats file.

    Reads ``args.data``, derives ``emp_rq = eq / (eq + diff + ins)`` and its
    ``emp_phreq`` transform, draws the scatter plot together with the dashed
    ``y = x`` line and saves the figure as ``baseq2empq.png``.
    """
    raw = pl.read_csv(args.data, separator="\t")
    emp_rq = pl.col("eq") / (pl.col("eq") + pl.col("diff") + pl.col("ins"))
    df = raw.with_columns([emp_rq.alias("emp_rq")]).with_columns(
        [utils.q2phreq_expr("emp_rq", "emp_phreq")]
    )

    figure = plt.figure(figsize=(10, 10))
    axs = figure.add_subplot(1, 1, 1)
    # Make the new axes current so plt.grid applies to them.
    plt.sca(axs)
    plt.grid(True, linestyle=":", linewidth=0.5, color="gray")

    sns.scatterplot(df.to_pandas(), x="baseq", y="emp_phreq", ax=axs)

    even_ticks = list(range(0, 60, 2))
    axs.set_xticks(even_ticks)
    axs.set_yticks(list(even_ticks))
    label_font = {"size": 16}
    axs.set_xlabel("PredictedBaseQ", fontdict=dict(label_font))
    axs.set_ylabel("EmpericalBaseQ", fontdict=dict(label_font))

    # Reference diagonal: predicted quality equal to empirical quality.
    diagonal = list(range(0, 60))
    perfect_line = pl.DataFrame({"x": diagonal, "y": list(diagonal)})
    sns.lineplot(
        perfect_line.to_pandas(), x="x", y="y", ax=axs, color="blue", linestyle="--"
    )

    print(df.head(10))
    figure.savefig(fname="baseq2empq.png")
53
+
54
+
55
if __name__ == "__main__":
    polars_init.polars_env_init()
    # The former hard-coded path is kept as the default, so running without
    # arguments behaves as before while another stats file can be supplied.
    parser = argparse.ArgumentParser(
        description="Plot predicted vs. empirical base quality."
    )
    parser.add_argument(
        "--data",
        default="/data/ccs_data/ccs_eval2024q3/Ludaopei/subread_bak/output-all/analysis/fact_baseq_stat.csv",
        help="tab-separated file with baseq, eq, diff and ins columns",
    )

    main(parser.parse_args())
62
+
63
+
64
+ # Z:\BC\共享文件夹\four\20241115_Sync_Y0701_03_H01_Run0003_called.bam . 中南 STR
@@ -0,0 +1,62 @@
1
+ import pysam
2
+ import polars as pl
3
+ import argparse
4
+
5
+ import os
6
+ import sys
7
+
8
+ cur_dir = os.path.abspath(__file__).rsplit("/", maxsplit=1)[0]
9
+ sys.path.insert(0, cur_dir)
10
+
11
+ import polars_init
12
+
13
+
14
def error_channel_analysis(
    fact_aligned_bam_bam_basic: str, fact_error_query_locus_info: str
):
    """Join per-read basics with per-locus error info and print the first 100 rows.

    Reads with ``np == 7`` and ``iy < 0.999`` are kept. After the inner join
    on ``qname``, ``qstart``/``qend`` are kept for ``fwd`` rows and replaced by
    ``qlen - qend`` / ``qlen - qstart`` otherwise. Rows are sorted by
    ``qname`` (descending) and ``rstart`` (ascending).

    Args:
        fact_aligned_bam_bam_basic: Tab-separated file with the per-read columns.
        fact_error_query_locus_info: Tab-separated file with the per-locus columns.
    """
    basic = pl.read_csv(fact_aligned_bam_bam_basic, separator="\t")
    query_error_locus = pl.read_csv(fact_error_query_locus_info, separator="\t")

    wanted_reads = (pl.col("np") == 7).and_(pl.col("iy") < 0.999)
    kept_columns = [
        pl.col(name) for name in ("qname", "np", "rq", "iy", "qlen", "fwd")
    ]
    basic = basic.filter(wanted_reads).select(kept_columns)

    is_fwd = pl.col("fwd")
    # Both expressions are evaluated in one with_columns call, so each sees
    # the original qstart/qend values.
    oriented_start = (
        pl.when(is_fwd)
        .then(pl.col("qstart"))
        .otherwise(pl.col("qlen") - pl.col("qend"))
        .alias("qstart")
    )
    oriented_end = (
        pl.when(is_fwd)
        .then(pl.col("qend"))
        .otherwise(pl.col("qlen") - pl.col("qstart"))
        .alias("qend")
    )

    joined = basic.join(query_error_locus, on="qname", how="inner")
    df = joined.with_columns([oriented_start, oriented_end]).sort(
        by=["qname", "rstart"], descending=[True, False]
    )
    print(df.head(100))
49
+
50
+
51
def main(args):
    """Run :func:`error_channel_analysis` on the two paths held by ``args``."""
    basic_path, locus_path = args.bam_basic, args.error_query_locus
    error_channel_analysis(basic_path, locus_path)
54
+
55
+
56
if __name__ == "__main__":
    polars_init.polars_env_init()

    # Two positional inputs, in this order.
    cli = argparse.ArgumentParser(prog="")
    for positional in ("bam_basic", "error_query_locus"):
        cli.add_argument(positional)
    main(cli.parse_args())
@@ -0,0 +1,74 @@
1
+ import polars as pl
2
+ import os
3
+ import sys
4
+
5
+ cur_dir = os.path.abspath(__file__).rsplit("/", maxsplit=1)[0]
6
+ sys.path.insert(0, cur_dir)
7
+
8
+ import polars_init
9
+ import argparse
10
+
11
+
12
def variant_calling(df: pl.DataFrame):
    """Print the share of rows that look like a two-allele variant site.

    A row counts when both ``eq`` and ``diff`` exceed 20% of ``eq + diff``
    and ``diffDetail`` holds fewer than two comma-separated entries.

    Args:
        df: Frame with the columns ``eq``, ``diff`` and ``diffDetail``.
    """
    total = df.shape[0]
    v = (
        df.with_columns([(pl.col("eq") + pl.col("diff")).alias("eq_and_diff")])
        .with_columns(
            [
                (pl.col("eq") / pl.col("eq_and_diff")).alias("eq_in_eqdiff"),
                (pl.col("diff") / pl.col("eq_and_diff")).alias("diff_in_eqdiff"),
            ]
        )
        .filter((pl.col("eq_in_eqdiff") > 0.2).and_(pl.col("diff_in_eqdiff") > 0.2))
        .filter(pl.col("diffDetail").str.split(",").list.len() < 2)
        .shape[0]
    )

    # An empty frame has no defined ratio; report NaN instead of crashing.
    ratio = v / total if total else float("nan")
    print("variant calling ratio: {} / {} = {}".format(v, total, ratio))
27
+
28
+
29
def del_calling(df: pl.DataFrame):
    """Print the share of rows whose ``del / depth`` ratio exceeds 0.4.

    Args:
        df: Frame with the columns ``del`` and ``depth``.
    """
    total = df.shape[0]
    v = (
        df.with_columns([(pl.col("del") / pl.col("depth")).alias("del_ratio")])
        .filter((pl.col("del_ratio") > 0.4))
        .shape[0]
    )

    # An empty frame has no defined ratio; report NaN instead of crashing.
    ratio = v / total if total else float("nan")
    print("del ratio: {} / {} = {}".format(v, total, ratio))
37
+
38
+
39
def ana(filepath: str):
    """Load a tab-separated locus table, print summaries and run both callers.

    Adds ``eq_rate = eq / (eq + diff + ins + del)`` and
    ``eq_rate2 = eq / depth``, sorts ascending by ``eq_rate2``, prints the
    first 100 rows and the fraction of rows with ``eq_rate2 < 0.5``, then
    calls :func:`variant_calling` and :func:`del_calling`.

    Args:
        filepath: Path of the tab-separated input file.
    """
    df = pl.read_csv(filepath, separator="\t")
    print(df.head(2))

    eq_rate = (
        pl.col("eq") / (pl.col("eq") + pl.col("diff") + pl.col("ins") + pl.col("del"))
    ).alias("eq_rate")
    eq_rate2 = (pl.col("eq") / pl.col("depth")).alias("eq_rate2")

    with_rates = df.with_columns([eq_rate, eq_rate2])
    df = with_rates.sort(by=["eq_rate2"], descending=[False])

    print(df.head(100))

    low_rate_fraction = (pl.col("eq_rate2") < 0.5).sum() / pl.len()
    print(df.select(low_rate_fraction))

    variant_calling(df=df)
    del_calling(df=df)
65
+
66
+
67
if __name__ == "__main__":
    polars_init.polars_env_init()

    # One positional argument: the tab-separated locus table.
    cli = argparse.ArgumentParser(prog="")
    cli.add_argument("fp", metavar="fact_aligned_bam_ref_locus_info.csv")
    args = cli.parse_args()
    ana(filepath=args.fp)
@@ -0,0 +1,36 @@
1
+ import pysam
2
+ from typing import Mapping, Tuple
3
+ from tqdm import tqdm
4
+ import polars as pl
5
+
6
+
7
def read_bam_file(bam_file: str) -> Mapping[str, Tuple[str, str]]:
    """Load every record of a BAM file into a name-keyed mapping.

    Args:
        bam_file: Path of the BAM file; it is read to the end of the file.

    Returns:
        ``query_name -> (query_sequence, "00_" + query_name)``. When a name
        occurs more than once, the last record wins.
    """
    # The second tuple element is a string label, so the return annotation
    # is Tuple[str, str] rather than Tuple[str, int].
    with pysam.AlignmentFile(bam_file, mode="rb", threads=40, check_sq=False) as bam_h:
        return {
            record.query_name: (record.query_sequence, f"00_{record.query_name}")
            for record in tqdm(
                bam_h.fetch(until_eof=True),
                desc=f"read_bam_file:>> reading {bam_file}",
            )
        }
16
+
17
+
18
def read_fastx_file(fname: str) -> Mapping[str, Tuple[str, str]]:
    """Load every entry of a FASTA/FASTQ file into a name-keyed mapping.

    Args:
        fname: Path of the FASTA/FASTQ file.

    Returns:
        ``entry.name -> (entry.sequence, "00_" + entry.name)``. When a name
        occurs more than once, the last entry wins.
    """
    # The context manager closes the file handle, which was previously left
    # open for the lifetime of the process.
    with pysam.FastxFile(fname) as fh:
        return {entry.name: (entry.sequence, f"00_{entry.name}") for entry in fh}
24
+
25
+
26
def q2phreq_expr(inp_name, oup_name=None):
    """Build the polars expression ``-10 * log10(1 - min(x, 1 - 1e-6))``.

    Args:
        inp_name: Name of the input column ``x``.
        oup_name: Name of the resulting column; defaults to ``inp_name``.

    Returns:
        The aliased polars expression.
    """
    if oup_name is None:
        oup_name = inp_name

    value = pl.col(inp_name)
    # Values above 1 - 1e-6 are capped so the logarithm's argument stays positive.
    capped = pl.when(value > (1 - 1e-6)).then(1 - 1e-6).otherwise(value)
    transformed = -10.0 * (1 - capped).log10()
    return transformed.alias(oup_name)
File without changes
@@ -0,0 +1,417 @@
1
+ """
2
+ this script is used to visualize the msa.
3
+ the result can be used in two ways:
4
+ * copy the FASTA output and paste it into Jalview, which is then used for the plot, OR
5
+ * generate the msa picture directly
6
+
7
+ the whole pipeline is:
8
+ 1) do alignment (pairwise alignment) mini_align -P -m -r ref_filepath.fa -i inp.fa -t 1 -p calls2draft
9
+ 2) use this script to generate msa pic
10
+
11
+ """
12
+
13
+ import argparse
14
+ import pysam
15
+ import numpy as np
16
+ import tempfile
17
+ import os
18
+ from typing import Mapping
19
+ from tqdm import tqdm
20
+ from typing import Dict
21
+
22
+
23
def read_fastx_file(fname: str):
    """Load a FASTA/FASTQ file into a ``name -> sequence`` dict.

    Args:
        fname: Path of the FASTA/FASTQ file.

    Returns:
        Dict mapping each entry name to its sequence. When a name occurs
        more than once, the last entry wins.
    """
    # The context manager closes the file handle, which was previously left open.
    with pysam.FastxFile(fname) as fh:
        return {entry.name: entry.sequence for entry in fh}
29
+
30
+
31
def read_bam_file(bam_file: str) -> Mapping[str, str]:
    """Load a BAM file into a ``query_name -> query_sequence`` dict.

    The file is read to its end; a later record replaces an earlier one with
    the same name.
    """
    with pysam.AlignmentFile(bam_file, mode="rb", threads=40, check_sq=False) as bam_h:
        records = tqdm(
            bam_h.fetch(until_eof=True), desc=f"read_bam_file:>> reading {bam_file}"
        )
        name2seq = {record.query_name: record.query_sequence for record in records}

    return name2seq
40
+
41
+
42
class Name2Seq:
    """Name-to-sequence lookup loaded from a FASTA/FASTQ or BAM file."""

    # File-name endings handled by read_fastx_file.
    _FASTX_ENDINGS = ("fa", "fasta", "fna", "fq", "fastq")

    def __init__(self, fname: str):
        """Load ``fname`` with the reader selected by its file-name ending.

        Raises:
            ValueError: If the ending is neither a FASTA/FASTQ one nor ``bam``.
        """
        if fname.endswith(self._FASTX_ENDINGS):
            loader = read_fastx_file
        elif fname.endswith("bam"):
            loader = read_bam_file
        else:
            raise ValueError(f"invalid file format, {fname}")
        self.qname2seq = loader(fname)

    def fetch(self, name):
        """Return the sequence stored under ``name``."""
        return self.qname2seq[name]
59
+
60
+
61
def build_query_name(align_seg: pysam.AlignedSegment):
    """Return ``<query_name>_SE_<query_alignment_start>_<query_alignment_end>`` for a segment."""
    name = align_seg.query_name
    start = align_seg.query_alignment_start
    end = align_seg.query_alignment_end
    return f"{name}_SE_{start}_{end}"
63
+
64
+
65
def init_matrix(num_rows, num_cols):
    """Return a ``num_rows`` x ``num_cols`` string matrix filled with "."."""
    matrix = np.empty((num_rows, num_cols), dtype=np.str_)
    matrix[...] = "."
    return matrix
70
+
71
+
72
class ResultMatrix:
    """Character matrix of a pile-up: row 0 is the reference, one row per query.

    Every reference position of the window owns a run of columns whose width
    is given by ``refpos2length``; the first column of the run holds the
    aligned base and the remaining ones hold bases inserted after it.
    Cells never written keep the filler ".".
    """

    def __init__(
        self,
        ref_start,
        ref_end,
        query_names,
        refpos2length: Dict[int, int],
        ref_name=None,
    ) -> None:
        """Allocate the matrix and the reference-position -> column map.

        Args:
            ref_start: Inclusive start of the reference window.
            ref_end: Exclusive end of the reference window.
            query_names: Row names of the queries; rows are assigned in sorted order.
            refpos2length: Number of columns reserved per reference position.
                An empty dict yields a matrix without columns.
            ref_name: Label of the reference row; ``"REF"`` when omitted.
        """
        import sys
        from itertools import accumulate

        self.num_records = len(query_names) + 1  # +ref
        self.query_names = sorted(query_names)
        self.query2idx = {
            query_name: idx for idx, query_name in enumerate(self.query_names, start=1)
        }
        self.ref_name = "REF" if ref_name is None else ref_name
        self.ref_name = f"{self.ref_name}_{ref_start}_{ref_end}"
        self.matrix = self.init_matrix(self.num_records, sum(refpos2length.values()))

        # The first column of a position is the total width of all positions
        # before it, i.e. an exclusive running sum over the sorted positions.
        refpos2length_list = sorted(refpos2length.items(), key=lambda x: x[0])
        widths = [width for _, width in refpos2length_list]
        first_cols = accumulate([0] + widths[:-1])
        rpos2matrix_col = [
            [rpos, col] for (rpos, _), col in zip(refpos2length_list, first_cols)
        ]

        # Diagnostics go to stderr so they cannot mix with FASTA text that a
        # caller prints to stdout.
        print(refpos2length_list, file=sys.stderr)
        print(rpos2matrix_col, file=sys.stderr)
        self.rpos2matrix_col = {
            rpos: matrix_col for rpos, matrix_col in rpos2matrix_col
        }

        self.ref_end = ref_end  # exclusive
        self.ref_start = ref_start

        print(f"self.end={self.ref_end}", file=sys.stderr)

    def update(self, record: "pysam.AlignedSegment", ref: str = None):
        """Write one alignment into its row and the reference bases into row 0.

        Only aligned pairs whose reference cursor lies inside the window are
        used. A summary line with the covered query span is written to stderr.

        Args:
            record: The alignment; its row is looked up via ``build_query_name``.
            ref: Reference sequence, indexed by reference position.

        Raises:
            ValueError: If an in-window reference position has no matrix column.
        """
        import sys

        idx = self.query2idx[build_query_name(record)]

        rpos_cursor = None
        offset = 0
        query_seq = record.query_sequence

        qpos_start = None
        qpos_end = None

        for qpos, rpos in record.get_aligned_pairs():
            if rpos is not None:
                rpos_cursor = rpos
            if rpos_cursor is None:
                continue
            if rpos_cursor < self.ref_start or rpos_cursor >= self.ref_end:
                continue

            if qpos is not None:
                if qpos_start is None:
                    qpos_start = qpos
                qpos_end = qpos

            if rpos_cursor not in self.rpos2matrix_col:
                raise ValueError(
                    f"reference position {rpos_cursor} has no matrix column "
                    f"(window {self.ref_start}-{self.ref_end})"
                )

            matrix_init_col = self.rpos2matrix_col[rpos_cursor]
            if rpos is None:
                # Inserted base: next free column after the anchoring position.
                offset += 1
            else:
                offset = 0
                self.matrix[0, matrix_init_col] = ref[rpos]

            matrix_col = matrix_init_col + offset
            if qpos is not None:
                self.matrix[idx, matrix_col] = query_seq[qpos]

        # qpos_start/qpos_end stay None when no query base fell inside the
        # window; the span arithmetic below is skipped in that case.
        has_span = qpos_start is not None
        seq_len = record.query_length
        if has_span and record.is_reverse:
            qpos_start, qpos_end = seq_len - qpos_end, seq_len - qpos_start

        called_start = None
        called_end = None
        if has_span and record.has_tag("be"):
            shift = record.get_tag("be")[0]
            called_start = qpos_start + shift
            called_end = qpos_end + shift

        print(
            f"{record.query_name}: sbr:{qpos_start}-{qpos_end}, called:{called_start}-{called_end}",
            file=sys.stderr,
        )

    def get_raw_result(self):
        """Return the underlying matrix."""
        return self.matrix

    def get_query_names(self):
        """Return the row names: the reference label followed by the sorted query names."""
        names = [self.ref_name]
        names.extend(self.query_names)
        return names

    def get_result(self):
        """Return the matrix (currently identical to :meth:`get_raw_result`)."""
        return self.get_raw_result()

    def get_result_str(self):
        """Render the matrix as FASTA text, one ``>name`` / row pair per record."""
        res = self.get_result()
        names = self.get_query_names()
        result_strs = []
        for row_idx in range(res.shape[0]):
            result_strs.append(f">{names[row_idx]}")
            result_strs.append("".join(res[row_idx].tolist()))

        return "\n".join(result_strs)

    @staticmethod
    def init_matrix(num_rows, num_cols):
        """Return a ``num_rows`` x ``num_cols`` string matrix filled with "."."""
        matrix = np.empty(shape=[num_rows, num_cols], dtype=np.str_)
        matrix.fill(".")
        return matrix
212
+
213
+
214
def extract_reference(ref_filename):
    """Extract the first record of a FASTA file.

    Args:
        ref_filename: Path of the FASTA file.

    Returns:
        ``(name, sequence)``: the name is the header text between its first
        character and the first space; the sequence is the record's lines
        joined without line breaks.
    """
    with open(ref_filename, mode="r", encoding="utf8") as file:
        lines = file.read().splitlines()

    ref_name = lines[0].split(" ")[0][1:]

    seq_parts = []
    for line in lines[1:]:
        # Stop at the next record so its header and bases are not appended.
        if line.startswith(">"):
            break
        seq_parts.append(line.strip())
    return (ref_name.strip(), "".join(seq_parts))
220
+
221
+
222
def build_ref_pos_maxins(
    sam_file: "pysam.AlignmentFile", contig: str, ref_start: int, ref_end: int
):
    """Find, per reference position, the longest run of unaligned query bases after it.

    A run is every consecutive aligned pair without a reference position
    that follows an in-window reference position, including a run at the
    end of a read.

    Args:
        sam_file: Object whose ``fetch(contig=...)`` yields the alignments.
        contig: Name of the reference to fetch.
        ref_start: Inclusive start of the window.
        ref_end: Exclusive end of the window.

    Returns:
        Dict ``rpos -> longest run`` for every ``rpos`` in
        ``range(ref_start, ref_end)``.
    """
    rpos2max_ins = {rpos: 0 for rpos in range(ref_start, ref_end)}

    for query in sam_file.fetch(contig=contig):
        # In-window reference position the current run follows, if any.
        anchor = None
        run_len = 0
        for _, rpos in query.get_aligned_pairs():
            if rpos is None:
                if anchor is not None:
                    run_len += 1
                continue

            if rpos >= ref_end:
                break

            # A new reference position closes the run attached to the previous one.
            if anchor is not None and run_len > rpos2max_ins[anchor]:
                rpos2max_ins[anchor] = run_len
            run_len = 0
            anchor = rpos if rpos >= ref_start else None

        # Credit the run still open at the end of the read (or at the window
        # edge) to the position it follows, not to the one before it.
        if anchor is not None and run_len > rpos2max_ins[anchor]:
            rpos2max_ins[anchor] = run_len

    return rpos2max_ins
258
+
259
+
260
def bam2fa4jalview(
    aligned_bam_filename,
    ref_filename,
    ref_name,
    interested_ref_start=None,
    interested_ref_end=None,
):
    """Build the pile-up matrix of all alignments to ``ref_name`` inside a window.

    Args:
        aligned_bam_filename: Path of the aligned BAM file.
        ref_filename: FASTA/FASTQ/BAM file holding the reference sequence.
        ref_name: Name of the reference (contig) to display.
        interested_ref_start: Inclusive window start; defaults to the smallest
            alignment start among the selected reads.
        interested_ref_end: Exclusive window end; defaults to the largest
            alignment end among the selected reads.

    Returns:
        The filled :class:`ResultMatrix`.

    Raises:
        ValueError: If the resulting window is empty, e.g. because no
            alignment was found and no explicit bounds were given.
    """
    import sys

    def overlaps(query, start, end):
        """Tell whether ``query`` overlaps ``[start, end)``; ``None`` means unbounded."""
        if start is not None and query.reference_end <= start:
            return False
        if end is not None and query.reference_start >= end:
            return False
        return True

    # The context manager closes the BAM handle, which was previously leaked.
    with pysam.AlignmentFile(aligned_bam_filename, mode="rb", threads=40) as samfile:
        fastx_data = Name2Seq(ref_filename)
        ref_seq = fastx_data.fetch(ref_name)

        ref_start = 2**32
        ref_end = 0
        query_names = []

        # Pass 1: select the reads and measure the reference span they cover.
        for query in samfile.fetch(contig=ref_name):
            if not overlaps(query, interested_ref_start, interested_ref_end):
                continue
            ref_start = min([query.reference_start, ref_start])
            ref_end = max([query.reference_end, ref_end])
            query_names.append(build_query_name(query))

        interested_ref_start = (
            ref_start if interested_ref_start is None else interested_ref_start
        )
        interested_ref_end = (
            ref_end if interested_ref_end is None else interested_ref_end
        )

        # Diagnostics go to stderr so they cannot mix with FASTA text on stdout.
        print(
            f"interested_ref_start={interested_ref_start}, interested_ref_end={interested_ref_end}",
            file=sys.stderr,
        )

        if interested_ref_start >= interested_ref_end:
            raise ValueError(
                f"empty reference window {interested_ref_start}-{interested_ref_end} "
                f"for {ref_name}: no alignments found or invalid bounds"
            )

        rpos2maxins = build_ref_pos_maxins(
            samfile,
            contig=ref_name,
            ref_start=interested_ref_start,
            ref_end=interested_ref_end,
        )

        # Each position needs one column for its base plus one per inserted base.
        rpos2length = {pos: ins + 1 for pos, ins in rpos2maxins.items()}

        result_matrix = ResultMatrix(
            ref_start=interested_ref_start,
            ref_end=interested_ref_end,
            query_names=query_names,
            refpos2length=rpos2length,
            ref_name=ref_name,
        )

        # Pass 2: write every selected read into the matrix.
        for query in samfile.fetch(contig=ref_name):
            if overlaps(query, interested_ref_start, interested_ref_end):
                result_matrix.update(query, ref=ref_seq)

    return result_matrix
332
+
333
+
334
def plot_msa_align(inp_filename, oup_filename=None):
    """Render the alignment stored in ``inp_filename`` to a picture.

    Args:
        inp_filename: Alignment file handed to ``pymsaviz.MsaViz``.
        oup_filename: Picture path; ``<inp_filename>.png`` when omitted.
    """
    # Imported here so pymsaviz is only needed when a picture is requested.
    from pymsaviz import MsaViz

    viz_options = {
        "wrap_length": 150,
        "show_count": True,
        "show_grid": True,
        "color_scheme": "Identity",
    }
    mv = MsaViz(inp_filename, **viz_options)
    target = f"{inp_filename}.png" if oup_filename is None else oup_filename
    mv.savefig(target)
349
+
350
+
351
def main(args):
    """Build the pile-up FASTA, emit it and optionally render it as a picture.

    The FASTA text is written to ``args.o_fasta`` when given, otherwise
    printed to stdout. When ``args.o_pic`` is given, the picture is rendered
    from ``args.o_fasta`` or, without one, from a temporary file that is
    removed afterwards.
    """
    import sys

    res = bam2fa4jalview(
        args.bam,
        ref_filename=args.ref_fasta,
        ref_name=args.ref_name,
        interested_ref_start=args.start,
        interested_ref_end=args.end,
    )

    res_str = res.get_result_str()
    if args.o_fasta is not None:
        with open(args.o_fasta, "w", encoding="utf8") as file:
            file.write(res_str)
    else:
        print(res_str)

    if args.o_pic is None:
        return

    if args.o_fasta is not None:
        plot_msa_align(inp_filename=args.o_fasta, oup_filename=args.o_pic)
        return

    # No FASTA file was requested: stage the text in a temporary file.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, encoding="utf8"
    ) as tmp:
        tmp.write(res_str)
    # stderr keeps this diagnostic out of the FASTA text printed above.
    print(f"temp file name: {tmp.name}", file=sys.stderr)
    try:
        plot_msa_align(inp_filename=tmp.name, oup_filename=args.o_pic)
    finally:
        # Remove the temporary file even when plotting fails.
        os.remove(tmp.name)
378
+
379
+
380
if __name__ == "__main__":
    p = argparse.ArgumentParser(
        "mas_view",
        description="""
the whole pipeline is:
1) do alignment (pairwise alignment), minimap2 is an option;
2) use this script to generate msa pic
""",
    )
    p.add_argument("--bam", type=str, help="subreads2smc alignment", required=True)
    p.add_argument(
        "--ref-fastx-or-bam",
        type=str,
        help="smc.fa/smc.fq/.bam,",
        required=True,
        dest="ref_fasta",
    )

    p.add_argument("--ref-name", required=True, type=str, dest="ref_name")
    # The help texts of the two bounds were swapped; each now describes its own option.
    p.add_argument(
        "--ref-start", type=int, default=None, help="contig start", dest="start"
    )
    p.add_argument("--ref-end", type=int, default=None, help="contig end", dest="end")

    p.add_argument(
        "--o-fasta",
        default=None,
        help="output fasta file, if not provided, this content will be output to the stdout",
        dest="o_fasta",
    )
    p.add_argument(
        "--o-pic",
        default=None,
        help="visualization picture file path, if not provided, the plot procedure will be skipped",
        dest="o_pic",
    )
    args = p.parse_args()
    main(args)