xcmap_bio 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xcmap_bio/__init__.py ADDED
File without changes
File without changes
@@ -0,0 +1,52 @@
1
+ from pyfaidx import Fasta
2
+ from typing import Iterator, Tuple
3
+
4
+
5
class FastaOptions:
    """
    Public entry point for reading FASTA files, built on pyfaidx.

    Holds a ``pyfaidx.Fasta`` handle and delegates sequence extraction to
    the internal ``_FastaReader`` helper.
    """

    def __init__(self, fasta_path: str, rebuild_index: bool = False):
        """
        Open ``fasta_path`` through pyfaidx.

        :param fasta_path: path of the FASTA file to open
        :param rebuild_index: forwarded to ``pyfaidx.Fasta`` as ``rebuild``
        """
        self.handler = Fasta(fasta_path, rebuild=rebuild_index)

    def fetch_seq(self, seq_id: str, start: int = None, end: int = None):
        """
        Return the sequence stored under ``seq_id``.

        ``start`` and ``end`` are passed straight to
        ``_FastaReader.get_seq``; leaving both as ``None`` yields the whole
        sequence.
        """
        return _FastaReader(self.handler).get_seq(seq_id, start, end)
16
+
17
+
18
class _FastaReader:
    """
    Helper around an already-open pyfaidx handle.

    Provides interval extraction, sequence-length look-up, ID listing and
    iteration over every record of the underlying handle.
    """

    def __init__(self, handler):
        # Mapping-like object: indexable by sequence ID, exposes keys()/close().
        self.handler = handler

    def get_seq(self, seq_id: str, start: int = None, end: int = None) -> str:
        """
        Return the interval ``[start, end)`` of ``seq_id`` (0-based,
        half-open).

        When both ``start`` and ``end`` are ``None`` the whole sequence is
        returned.
        """
        record = self.handler[seq_id]
        whole_sequence = start is None and end is None
        region = record if whole_sequence else record[start:end]
        return str(region)

    def get_length(self, seq_id: str) -> int:
        """Return the length of the sequence stored under ``seq_id``."""
        record = self.handler[seq_id]
        return len(record)

    def list_seq_ids(self):
        """Return every sequence ID known to the handle, as a list."""
        return [seq_id for seq_id in self.handler.keys()]

    def __iter__(self) -> Iterator[Tuple[str, str]]:
        """Yield ``(seq_id, sequence)`` pairs for every sequence."""
        yield from (
            (seq_id, str(self.handler[seq_id]))
            for seq_id in self.handler.keys()
        )

    def close(self):
        """Close the underlying file handle."""
        self.handler.close()
@@ -0,0 +1,238 @@
1
+ from functools import lru_cache
2
+ from typing import Optional, Union, List, Tuple
3
+
4
+ import pandas as pd
5
+ from pydantic import BaseModel
6
+
7
+
8
class Gff3ColumnName:
    """
    Names of the nine GFF3 columns, declared in file order.
    """

    # Sequence ID (chromosome, contig, ...); must match the reference
    # sequence, e.g. chr1.
    SEQ_ID = "seq_id"
    # Origin of the feature (prediction program, database name, ...),
    # e.g. Ensembl.
    SOURCE = "source"
    # Feature type (an SO ontology term), e.g. gene, mRNA, exon, CDS.
    TYPE = "type"
    # Start position (1-based, inclusive).
    START = "start"
    # End position (1-based, inclusive).
    END = "end"
    # Score (a float, or "." when there is no value).
    SCORE = "score"
    # Strand ("+", "-", or "." when unknown).
    STRAND = "strand"
    # Only meaningful for CDS: 0, 1 or 2 (reading-frame offset relative to
    # the start); other features use ".".
    PHASE = "phase"
    # Attribute field: ";"-separated key=value pairs that should contain at
    # least ID or Parent, e.g. ID=mRNA0001;Parent=gene0001;Name=BRCA1-201.
    ATTRIBUTES = "attributes"

    @classmethod
    def fetch_values(cls):
        """
        Return the column-name constants in declaration order.

        Walks the class namespace and keeps every public, non-callable
        attribute, skipping this method itself.
        """
        values = []
        for name, value in vars(cls).items():
            if name.startswith('_'):
                continue
            if callable(value) or name == 'fetch_values':
                continue
            values.append(value)
        return values
54
+
55
class Gff3Options:
    """
    GFF3 file parser.

    Loads a GFF3 file into a pandas DataFrame (one row per feature, plus
    derived ``ID`` and ``Parent`` columns) and offers paginated queries and
    parent/child look-ups on top of it.
    """

    class PageRequest(BaseModel):
        """
        Paginated query request consumed by ``Gff3Options.fetch_page``.

        Example::

            request = Gff3Options.PageRequest(
                seq_id=["chr1", "chr2"],
                type=["gene", "exon"],
                start=200,
                end=400,
                order_by=["seq_id", "start"],
                ascending=[True, False],
                page=1,
                size=2,
            )
            rows, total = gff3_options.fetch_page(request)
        """
        # One sequence ID or a list of them; None disables the filter.
        seq_id: Optional[Union[str, List[str], None]] = None
        # One feature type or a list of them; None disables the filter.
        type: Optional[Union[str, List[str], None]] = None
        # Keep features whose start is >= this value.
        start: Optional[int] = None
        # Keep features whose end is <= this value.
        end: Optional[int] = None
        # Column name(s) to sort by.
        order_by: Optional[Union[str, List[str]]] = Gff3ColumnName.START
        # Sort direction, one flag for all columns or one per column.
        ascending: Optional[Union[bool, List[bool]]] = True
        # 1-based page number.
        page: int = 1
        # Number of rows per page.
        size: int = 20

    def __init__(self, gff3_path):
        """Parse ``gff3_path`` once and keep the resulting DataFrame."""
        self.df = self.read_gff3(gff3_path)

    def read_gff3(self, gff3_path):
        """
        Read a GFF3 file into a DataFrame.

        The nine columns are named after ``Gff3ColumnName``; two extra
        columns, ``ID`` and ``Parent``, are extracted from the attributes
        column (``None`` when the key is absent).

        Deliberately not wrapped in ``lru_cache``: the cache key would
        include ``self``, so it could never be shared between instances and
        would only keep each instance and its DataFrame alive.

        :param gff3_path: anything ``pandas.read_csv`` accepts as a source
        :return: the parsed DataFrame
        """
        df = pd.read_csv(
            gff3_path,
            # NOTE(review): exactly one leading line is skipped, presumably
            # the "##gff-version 3" pragma. Files with more header or comment
            # lines are not handled here - confirm against real inputs.
            skiprows=1,
            header=None,
            sep="\t",
            names=Gff3ColumnName.fetch_values(),
        )

        def extract_attr(attrs, key):
            # A missing attributes cell is read as NaN (a float), which has
            # no .split(); treat it as "key not present".
            if not isinstance(attrs, str):
                return None
            prefix = f"{key}="
            for item in attrs.split(";"):
                if item.startswith(prefix):
                    return item.split("=", 1)[1]
            return None

        attributes = df[Gff3ColumnName.ATTRIBUTES]
        df["ID"] = attributes.apply(lambda x: extract_attr(x, "ID"))
        df["Parent"] = attributes.apply(lambda x: extract_attr(x, "Parent"))
        return df

    def fetch_page(self, page_request: PageRequest) -> Tuple[List[dict], int]:
        """
        Run a filtered, ordered, paginated query over the GFF3 data.

        :param page_request: filter, ordering and paging parameters
        :return: ``(rows of the requested page, total matching row count)``
        """
        return _GFF3Query(self.df).filter(
            seq_id=page_request.seq_id,
            seq_type=page_request.type,
            start=page_request.start,
            end=page_request.end
        ).order(
            order_by=page_request.order_by,
            ascending=page_request.ascending
        ).paginate(
            page=page_request.page,
            size=page_request.size
        )

    def fetch_by_gene_id(self, gene_id, is_contain_descendants=True):
        """
        Look up the features whose ``Parent`` is ``gene_id``.

        :param gene_id: ID of the parent feature
        :param is_contain_descendants: when true, every returned child gets
            a ``feature_list`` entry holding all of its own descendants
            (an empty list when it has none)
        :return: list of row dicts, one per direct child; ``[]`` when the
            gene has no children
        """
        children_df = self._find_children(gene_id)
        if children_df.empty:
            return []

        children = children_df.to_dict(orient="records")
        if not is_contain_descendants:
            return children

        for child in children:
            descendants_df = self._find_descendants(child.get("ID"))
            child['feature_list'] = descendants_df.to_dict(orient="records")
        return children

    def _find_children(self,
                       parent_id: str,
                       seq_id: Optional[str] = None,
                       feature_type: Optional[str] = None) -> pd.DataFrame:
        """
        Return the rows whose ``Parent`` equals ``parent_id``.

        Optionally narrowed to one ``seq_id`` and/or one ``feature_type``.
        Not memoised with ``lru_cache``: on a method it would pin ``self``
        and hand the same mutable DataFrame to every caller.
        """
        df = self.df
        result = df[df["Parent"] == parent_id]
        if seq_id:
            result = result[result["seq_id"] == seq_id]
        if feature_type:
            result = result[result["type"] == feature_type]
        return result

    def _find_descendants(self,
                          parent_id: str,
                          seq_id: Optional[str] = None,
                          feature_type: Optional[str] = None) -> pd.DataFrame:
        """
        Return every descendant of ``parent_id``, expanded level by level
        (iteratively, not recursively).

        Optionally narrowed to one ``seq_id`` and/or one ``feature_type``.
        An empty DataFrame with the usual columns is returned when there are
        no descendants.
        """
        df = self.df
        levels = []
        visited = set()
        # A missing parent ID must not be searched for: isin([None]) would
        # match every row that has no Parent, i.e. all top-level features.
        frontier = [parent_id] if pd.notna(parent_id) else []
        while frontier:
            visited.update(frontier)
            children = df[df["Parent"].isin(frontier)]
            if children.empty:
                break
            levels.append(children)
            # Next round: this level's IDs become the parents. Rows without
            # an ID are dropped and already-expanded IDs are skipped so that
            # cyclic Parent links cannot loop forever.
            frontier = [
                child_id
                for child_id in children["ID"].dropna().unique()
                if child_id not in visited
            ]

        if not levels:
            return pd.DataFrame(columns=df.columns)

        result = pd.concat(levels, ignore_index=True)
        if seq_id:
            result = result[result["seq_id"] == seq_id]
        if feature_type:
            result = result[result["type"] == feature_type]
        return result
181
+
182
class _GFF3Query:
    """
    Chainable filter -> order -> paginate helper over a GFF3 DataFrame.

    ``filter`` and ``order`` return ``self`` so calls can be chained;
    ``paginate`` ends the chain and returns the page rows plus the total
    number of matching rows.
    """

    def __init__(self, df: pd.DataFrame):
        # No defensive copy: every step below derives a new frame (boolean
        # indexing, sort_values, iloc) and never mutates the input, so
        # copying the whole table for each query would be wasted work.
        self.df = df
        self.filtered = self.df

    def filter(
            self,
            seq_id: Union[str, List[str], None] = None,
            seq_type: Union[str, List[str], None] = None,
            start: int = None,
            end: int = None,
    ):
        """
        Narrow the current selection.

        :param seq_id: one sequence ID or a list of them; ``None`` = any
        :param seq_type: one feature type or a list of them; ``None`` = any
        :param start: keep rows whose start column is >= this value
        :param end: keep rows whose end column is <= this value
        :return: ``self``
        """
        rows = self.filtered

        # A list is matched by membership, a scalar by equality.
        for column, wanted in (
                (Gff3ColumnName.SEQ_ID, seq_id),
                (Gff3ColumnName.TYPE, seq_type),
        ):
            if wanted is None:
                continue
            if isinstance(wanted, list):
                rows = rows[rows[column].isin(wanted)]
            else:
                rows = rows[rows[column] == wanted]

        if start is not None:
            rows = rows[rows[Gff3ColumnName.START] >= start]
        if end is not None:
            rows = rows[rows[Gff3ColumnName.END] <= end]

        self.filtered = rows
        return self

    def order(self, order_by: Union[str, List[str]] = Gff3ColumnName.START, ascending: Union[bool, List[bool]] = True):
        """
        Sort the current selection by one or several columns.

        Column names that do not exist in the frame are ignored together
        with their direction flag. ``order_by=None`` leaves the order
        untouched and ``ascending=None`` means ascending.

        :param order_by: column name or list of column names
        :param ascending: one flag for all columns, or one flag per column
        :return: ``self``
        :raises ValueError: when ``ascending`` is a list whose length
            differs from ``order_by``
        """
        if order_by is None:
            return self
        if isinstance(order_by, str):
            order_by = [order_by]
        if ascending is None:
            ascending = True
        if isinstance(ascending, bool):
            ascending = [ascending] * len(order_by)
        if len(ascending) != len(order_by):
            raise ValueError(
                f"Length of ascending ({len(ascending)}) != length of order_by ({len(order_by)})"
            )

        # Filter columns and directions as pairs so they stay aligned when
        # an unknown column is dropped.
        known_columns = self.filtered.columns
        pairs = [
            (column, direction)
            for column, direction in zip(order_by, ascending)
            if column in known_columns
        ]
        if pairs:
            self.filtered = self.filtered.sort_values(
                by=[column for column, _ in pairs],
                ascending=[direction for _, direction in pairs],
            )

        return self

    def paginate(self, page: int = 1, size: int = 20) -> Tuple[List[dict], int]:
        """
        Slice out one page of the current selection.

        :param page: 1-based page number
        :param size: rows per page
        :return: ``(rows of the page as dicts, total number of rows)``; the
            row list is empty when ``page`` or ``size`` is smaller than 1
        """
        total_count = len(self.filtered)
        # A non-positive page or size would produce negative slice bounds,
        # which iloc interprets as "count from the end" and returns the
        # wrong rows.
        if page < 1 or size < 1:
            return [], total_count
        offset = (page - 1) * size
        page_df = self.filtered.iloc[offset: offset + size]
        return page_df.to_dict(orient="records"), total_count
233
+
234
+
235
if __name__ == '__main__':
    import sys

    # Manual smoke test. Usage: python gff3.py [GFF3_PATH [GENE_ID]]
    # Both arguments are optional and fall back to the original sample
    # values, so running the module without arguments behaves as before.
    cli_args = sys.argv[1:]
    gff3_file = cli_args[0] if len(cli_args) > 0 else "/Users/zhangyang/Downloads/Ccs.final.gff"
    gene = cli_args[1] if len(cli_args) > 1 else 'Ccs01G000100'

    gff3_options = Gff3Options(gff3_file)
    result = gff3_options.fetch_by_gene_id(gene, is_contain_descendants=True)
    print(result)
File without changes
File without changes
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.3
2
+ Name: xcmap_bio
3
+ Version: 0.0.1
4
+ Summary: Biological tools
5
+ Author: zhangyang
6
+ Author-email: bryanyo1017@126.com
7
+ Requires-Python: >=3.13
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.13
10
+ Requires-Dist: pandas (>=2.3.2,<3.0.0)
11
+ Requires-Dist: pyfaidx (>=0.9.0.3,<0.10.0.0)
12
+ Requires-Dist: pysam (>=0.23.3,<0.24.0)
13
+ Requires-Dist: xcmap (>=0.1.5,<0.2.0)
14
+ Description-Content-Type: text/markdown
15
+
16
+
@@ -0,0 +1,9 @@
1
+ xcmap_bio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ xcmap_bio/genome/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ xcmap_bio/genome/fasta.py,sha256=xeHndi1akC4DVD-vm2aNpRdKUadUXl9pnPOatAIBSLk,1634
4
+ xcmap_bio/genome/gff3.py,sha256=jhq8qgdaryuRTO0Q9pOOAXxUhcE57JEue6_mJK2a4FI,8182
5
+ xcmap_bio/homology/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ xcmap_bio/transcriptone/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ xcmap_bio-0.0.1.dist-info/METADATA,sha256=zMzzQYUcU5PIhC-y8SRnuxnivRueSEoat-L8bk22JoM,458
8
+ xcmap_bio-0.0.1.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
9
+ xcmap_bio-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.1.2
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any