xcmap_bio 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xcmap_bio-0.0.1/PKG-INFO +16 -0
- xcmap_bio-0.0.1/README.md +0 -0
- xcmap_bio-0.0.1/pyproject.toml +25 -0
- xcmap_bio-0.0.1/src/xcmap_bio/__init__.py +0 -0
- xcmap_bio-0.0.1/src/xcmap_bio/genome/__init__.py +0 -0
- xcmap_bio-0.0.1/src/xcmap_bio/genome/fasta.py +52 -0
- xcmap_bio-0.0.1/src/xcmap_bio/genome/gff3.py +238 -0
- xcmap_bio-0.0.1/src/xcmap_bio/homology/__init__.py +0 -0
- xcmap_bio-0.0.1/src/xcmap_bio/transcriptone/__init__.py +0 -0
xcmap_bio-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: xcmap_bio
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Biological tools
|
|
5
|
+
Author: zhangyang
|
|
6
|
+
Author-email: bryanyo1017@126.com
|
|
7
|
+
Requires-Python: >=3.13
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
10
|
+
Requires-Dist: pandas (>=2.3.2,<3.0.0)
|
|
11
|
+
Requires-Dist: pyfaidx (>=0.9.0.3,<0.10.0.0)
|
|
12
|
+
Requires-Dist: pysam (>=0.23.3,<0.24.0)
|
|
13
|
+
Requires-Dist: xcmap (>=0.1.5,<0.2.0)
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
|
|
File without changes
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "xcmap_bio"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
description = "Biological tools"
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "zhangyang",email = "bryanyo1017@126.com"}
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = ">=3.13"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"xcmap (>=0.1.5,<0.2.0)",
|
|
12
|
+
"pandas (>=2.3.2,<3.0.0)",
|
|
13
|
+
"pysam (>=0.23.3,<0.24.0)",
|
|
14
|
+
"pyfaidx (>=0.9.0.3,<0.10.0.0)"
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[tool.poetry]
|
|
18
|
+
packages = [{include = "xcmap_bio", from = "src"}]
|
|
19
|
+
|
|
20
|
+
[tool.poetry.group.dev.dependencies]
|
|
21
|
+
jupyter = "^1.1.1"
|
|
22
|
+
|
|
23
|
+
[build-system]
|
|
24
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
25
|
+
build-backend = "poetry.core.masonry.api"
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from pyfaidx import Fasta
|
|
2
|
+
from typing import Iterator, Tuple
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class FastaOptions:
|
|
6
|
+
"""
|
|
7
|
+
高效 FASTA 读取工具类,基于 pyfaidx。
|
|
8
|
+
支持随机访问区间、获取序列长度、遍历序列。
|
|
9
|
+
"""
|
|
10
|
+
def __init__(self, fasta_path: str, rebuild_index: bool = False):
|
|
11
|
+
self.handler = Fasta(fasta_path, rebuild=rebuild_index)
|
|
12
|
+
|
|
13
|
+
def fetch_seq(self, seq_id: str, start: int = None, end: int = None):
|
|
14
|
+
fasta_reader = _FastaReader(self.handler)
|
|
15
|
+
return fasta_reader.get_seq(seq_id, start, end)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class _FastaReader:
    """Thin helper around an already-open, mapping-like FASTA handle.

    Offers region slicing, sequence length lookup, listing of sequence IDs
    and iteration over every ``(seq_id, sequence)`` pair.
    """

    def __init__(self, handler):
        # The handle is owned by the caller; it is only stored here.
        self.handler = handler

    def get_seq(self, seq_id: str, start: int = None, end: int = None) -> str:
        """Return ``seq_id`` sliced as ``[start:end]`` (0-based, end exclusive).

        When both ``start`` and ``end`` are ``None`` the whole record is
        converted to ``str`` without slicing.
        """
        record = self.handler[seq_id]
        wants_everything = start is None and end is None
        region = record if wants_everything else record[start:end]
        return str(region)

    def get_length(self, seq_id: str) -> int:
        """Return the length of the record stored under ``seq_id``."""
        record = self.handler[seq_id]
        return len(record)

    def list_seq_ids(self):
        """Return every sequence ID of the handle as a list."""
        return [*self.handler.keys()]

    def __iter__(self) -> Iterator[Tuple[str, str]]:
        """Yield ``(seq_id, sequence)`` for each record of the handle."""
        yield from (
            (name, str(self.handler[name]))
            for name in self.handler.keys()
        )

    def close(self):
        """Close the underlying file handle."""
        self.handler.close()
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
from functools import lru_cache
|
|
2
|
+
from typing import Optional, Union, List, Tuple
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Gff3ColumnName:
    """Column names used for the nine tab-separated fields of a GFF3 record.

    The constants are declared in file column order; ``fetch_values`` relies
    on that declaration order.
    """

    # Sequence ID (chromosome, contig, ...); must match the reference, e.g. chr1.
    SEQ_ID = "seq_id"
    # Source of the feature (prediction program, database name, ...), e.g. Ensembl.
    SOURCE = "source"
    # Feature type (an SO ontology term), e.g. gene, mRNA, exon, CDS.
    TYPE = "type"
    # Start position (1-based, inclusive).
    START = "start"
    # End position (1-based, inclusive).
    END = "end"
    # Score (a float, or "." when there is no value).
    SCORE = "score"
    # Strand ("+", "-", or "." when unknown).
    STRAND = "strand"
    # Only meaningful for CDS: 0, 1 or 2 (reading-frame offset relative to the
    # start); other features use ".".
    PHASE = "phase"
    # Attribute field: ";"-separated key=value pairs that should contain at
    # least ID or Parent, e.g. ID=mRNA0001;Parent=gene0001;Name=BRCA1-201.
    ATTRIBUTES = "attributes"

    @classmethod
    def fetch_values(cls):
        """Return the column-name constants in declaration order."""
        column_names = []
        for attr_name, attr_value in vars(cls).items():
            # Skip dunder/private entries, this method itself and any callable.
            if attr_name.startswith('_') or attr_name == 'fetch_values':
                continue
            if callable(attr_value):
                continue
            column_names.append(attr_value)
        return column_names
|
|
54
|
+
|
|
55
|
+
class Gff3Options:
    """GFF3 file parser offering paged queries and gene-hierarchy lookups.

    The file is loaded once into ``self.df`` (one row per feature, the nine
    GFF3 columns plus ``ID`` and ``Parent`` extracted from the attributes).
    """

    class PageRequest(BaseModel):
        """Filter, ordering and pagination parameters for ``fetch_page``.

        Example::

            request = Gff3Options.PageRequest(
                seq_id=["chr1", "chr2"], type=["gene", "exon"],
                start=200, end=400,
                order_by=["seq_id", "start"], ascending=[True, False],
                page=1, size=2,
            )
            rows, total = gff3_options.fetch_page(request)
        """
        seq_id: Optional[Union[str, List[str], None]] = None
        type: Optional[Union[str, List[str], None]] = None
        start: Optional[int] = None
        end: Optional[int] = None
        order_by: Optional[Union[str, List[str]]] = Gff3ColumnName.START
        ascending: Optional[Union[bool, List[bool]]] = True
        page: int = 1
        size: int = 20

    def __init__(self, gff3_path):
        """Load ``gff3_path`` and set up per-instance lookup caches."""
        self.df = self.read_gff3(gff3_path)
        # Memoise the hierarchy lookups per instance. Decorating the methods
        # with lru_cache at class level would key the shared cache on ``self``
        # and keep every instance (and its DataFrame) alive for the lifetime
        # of the process. The cached DataFrames are shared between callers and
        # must be treated as read-only.
        self._find_children = lru_cache(maxsize=128)(self._find_children)
        self._find_descendants = lru_cache(maxsize=128)(self._find_descendants)

    def read_gff3(self, gff3_path):
        """Read a GFF3 file into a DataFrame and add ``ID`` / ``Parent`` columns.

        Lines starting with ``#`` (``##gff-version``, ``##sequence-region``,
        ``###`` ...) are skipped wherever they occur, instead of assuming the
        file has exactly one header line.

        NOTE(review): pandas also cuts a line at an inline ``#``; attribute
        values are expected to carry it percent-encoded (``%23``) - confirm
        for the files this is used with.

        :param gff3_path: path (or anything ``pandas.read_csv`` accepts).
        :return: DataFrame with the ``Gff3ColumnName`` columns, ``ID`` and
            ``Parent`` (``None`` where the attribute is absent).
        """
        df = pd.read_csv(
            gff3_path,
            sep="\t",
            header=None,
            names=Gff3ColumnName.fetch_values(),
            comment="#",
        )

        def extract_attr(attrs, key):
            # A missing attributes cell is read as NaN (a float), not a string.
            if not isinstance(attrs, str):
                return None
            prefix = f"{key}="
            for item in attrs.split(";"):
                item = item.strip()
                if item.startswith(prefix):
                    return item[len(prefix):]
            return None

        attributes = df[Gff3ColumnName.ATTRIBUTES]
        df["ID"] = attributes.apply(lambda x: extract_attr(x, "ID"))
        df["Parent"] = attributes.apply(lambda x: extract_attr(x, "Parent"))
        return df

    def fetch_page(self, page_request: PageRequest) -> Tuple[List[dict], int]:
        """Run a filtered, ordered and paginated query over the GFF3 data.

        :param page_request: filter / order / pagination parameters.
        :return: ``(records of the requested page, total matching rows)``.
        """
        return _GFF3Query(self.df).filter(
            seq_id=page_request.seq_id,
            seq_type=page_request.type,
            start=page_request.start,
            end=page_request.end
        ).order(
            order_by=page_request.order_by,
            ascending=page_request.ascending
        ).paginate(
            page=page_request.page,
            size=page_request.size
        )

    def fetch_by_gene_id(self, gene_id, is_contain_descendants = True):
        """Return the direct children of ``gene_id`` as a list of dicts.

        :param gene_id: value matched against the ``Parent`` attribute.
        :param is_contain_descendants: when true, every child dict gets a
            ``feature_list`` key holding all of its own descendants (an empty
            list when it has none).
        :return: list of child records; empty when ``gene_id`` has no children.
        """
        children_df = self._find_children(gene_id)
        if children_df.empty:
            return []
        children = children_df.to_dict(orient="records")
        if not is_contain_descendants:
            return children
        for child in children:
            child_id = child.get("ID")
            if pd.notna(child_id):
                descendants_df = self._find_descendants(child_id)
                child['feature_list'] = descendants_df.to_dict(orient="records")
            else:
                # A child without an ID cannot be anybody's parent.
                child['feature_list'] = []
        return children

    def _find_children(self,
                       parent_id: str,
                       seq_id: Optional[str] = None,
                       feature_type: Optional[str] = None) -> pd.DataFrame:
        """Return the rows whose ``Parent`` equals ``parent_id``.

        ``seq_id`` and ``feature_type`` optionally narrow the result further.
        The match is an exact string comparison, so a row listing several
        parents (``Parent=a,b``) is not matched.
        """
        df = self.df
        result = df[df["Parent"] == parent_id]
        if seq_id:
            result = result[result["seq_id"] == seq_id]
        if feature_type:
            result = result[result["type"] == feature_type]
        return result

    def _find_descendants(self,
                          parent_id: str,
                          seq_id: Optional[str] = None,
                          feature_type: Optional[str] = None) -> pd.DataFrame:
        """Return every descendant of ``parent_id``, expanded level by level.

        ``seq_id`` and ``feature_type`` optionally filter the collected rows.
        An empty DataFrame with the same columns is returned when there are
        no descendants.
        """
        df = self.df
        levels = []
        seen = {parent_id}
        frontier = [parent_id]
        while frontier:
            children = df[df["Parent"].isin(frontier)]
            if children.empty:
                break
            levels.append(children)
            # Next round: use the IDs of this level as parents. Null IDs are
            # dropped because isin() would match them against every row that
            # has no Parent, pulling unrelated top-level features in; IDs
            # already visited are skipped so cyclic Parent links terminate.
            frontier = [
                feature_id
                for feature_id in children["ID"].dropna().unique()
                if feature_id not in seen
            ]
            seen.update(frontier)

        if not levels:
            return pd.DataFrame(columns=df.columns)
        result = pd.concat(levels, ignore_index=True)
        if seq_id:
            result = result[result["seq_id"] == seq_id]
        if feature_type:
            result = result[result["type"] == feature_type]
        return result
|
|
181
|
+
|
|
182
|
+
class _GFF3Query:
    """Chainable filter -> order -> paginate query over a GFF3 DataFrame.

    ``filter`` and ``order`` narrow / sort ``self.filtered`` and return
    ``self``; ``paginate`` ends the chain and returns the page records.
    """

    def __init__(self, df: pd.DataFrame):
        # Work on a copy so the caller's frame is never touched.
        self.df = df.copy()
        self.filtered = self.df

    def filter(
            self,
            seq_id: Union[str, List[str], None] = None,
            seq_type: Union[str, List[str], None] = None,
            start: int = None,
            end: int = None,
    ):
        """Narrow the current selection; ``None`` arguments are ignored.

        :param seq_id: one sequence ID or a list of accepted IDs.
        :param seq_type: one feature type or a list of accepted types.
        :param start: keep rows whose start column is ``>= start``.
        :param end: keep rows whose end column is ``<= end``.
        :return: ``self`` for chaining.
        """
        if seq_id is not None:
            if isinstance(seq_id, list):
                self.filtered = self.filtered[self.filtered[Gff3ColumnName.SEQ_ID].isin(seq_id)]
            else:
                self.filtered = self.filtered[self.filtered[Gff3ColumnName.SEQ_ID] == seq_id]

        if seq_type is not None:
            if isinstance(seq_type, list):
                self.filtered = self.filtered[self.filtered[Gff3ColumnName.TYPE].isin(seq_type)]
            else:
                self.filtered = self.filtered[self.filtered[Gff3ColumnName.TYPE] == seq_type]

        if start is not None:
            self.filtered = self.filtered[self.filtered[Gff3ColumnName.START] >= start]
        if end is not None:
            self.filtered = self.filtered[self.filtered[Gff3ColumnName.END] <= end]

        return self

    def order(self, order_by: Union[str, List[str]] = Gff3ColumnName.START, ascending: Union[bool, List[bool]] = True):
        """Sort the current selection by one or several columns.

        Columns that do not exist in the frame are ignored together with
        their direction flag. ``order_by=None`` leaves the order untouched
        and ``ascending=None`` is treated as ``True``.

        :param order_by: column name or list of column names.
        :param ascending: one flag for all columns, or one flag per column.
        :return: ``self`` for chaining.
        :raises ValueError: if ``ascending`` is a list whose length differs
            from ``order_by``.
        """
        if order_by is None:
            return self
        if isinstance(order_by, str):
            order_by = [order_by]
        if ascending is None:
            ascending = True
        if isinstance(ascending, bool):
            ascending = [ascending] * len(order_by)
        if len(ascending) != len(order_by):
            raise ValueError(
                f"Length of ascending ({len(ascending)}) != length of order_by ({len(order_by)})"
            )

        # Filter columns and directions as pairs so they stay aligned when an
        # unknown column is dropped.
        known_columns = set(self.filtered.columns)
        sort_keys = [
            (column, direction)
            for column, direction in zip(order_by, ascending)
            if column in known_columns
        ]
        if sort_keys:
            self.filtered = self.filtered.sort_values(
                by=[column for column, _ in sort_keys],
                ascending=[direction for _, direction in sort_keys],
            )

        return self

    def paginate(self, page: int = 1, size: int = 20) -> Tuple[List[dict], int]:
        """Return ``(records of the requested page, total row count)``.

        ``page`` is 1-based. Values below 1 are clamped to the first page and
        a negative ``size`` is treated as 0, so the slice never wraps around
        to the end of the frame.
        """
        total_count = len(self.filtered)
        page = max(page, 1)
        size = max(size, 0)
        offset = (page - 1) * size
        page_df = self.filtered.iloc[offset: offset + size]
        return page_df.to_dict(orient="records"), total_count
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
if __name__ == '__main__':
    # Manual smoke test: python gff3.py [GFF3_PATH] [GENE_ID]
    # Without arguments the original developer-local defaults are used.
    import sys

    cli_args = sys.argv[1:]
    gff3_path = cli_args[0] if len(cli_args) > 0 else "/Users/zhangyang/Downloads/Ccs.final.gff"
    gene_id = cli_args[1] if len(cli_args) > 1 else 'Ccs01G000100'

    gff3_options = Gff3Options(gff3_path)
    result = gff3_options.fetch_by_gene_id(gene_id, is_contain_descendants=True)
    print(result)
|
|
File without changes
|
|
File without changes
|