pyplogo 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyplogo-0.1.0/PKG-INFO +27 -0
- pyplogo-0.1.0/README.md +16 -0
- pyplogo-0.1.0/pyplogo/__init__.py +59 -0
- pyplogo-0.1.0/pyplogo/data/__init__.py +8 -0
- pyplogo-0.1.0/pyplogo/data/aa_properties.py +46 -0
- pyplogo-0.1.0/pyplogo/extractors/__init__.py +8 -0
- pyplogo-0.1.0/pyplogo/extractors/structure_based.py +325 -0
- pyplogo-0.1.0/pyplogo/utils/__init__.py +23 -0
- pyplogo-0.1.0/pyplogo/utils/formatters.py +136 -0
- pyplogo-0.1.0/pyplogo/utils/parsers.py +86 -0
- pyplogo-0.1.0/pyplogo/visualizers/__init__.py +50 -0
- pyplogo-0.1.0/pyplogo/visualizers/secondary_structure.py +1222 -0
- pyplogo-0.1.0/pyplogo/visualizers/themes.py +1755 -0
- pyplogo-0.1.0/pyplogo.egg-info/PKG-INFO +27 -0
- pyplogo-0.1.0/pyplogo.egg-info/SOURCES.txt +19 -0
- pyplogo-0.1.0/pyplogo.egg-info/dependency_links.txt +1 -0
- pyplogo-0.1.0/pyplogo.egg-info/requires.txt +3 -0
- pyplogo-0.1.0/pyplogo.egg-info/top_level.txt +1 -0
- pyplogo-0.1.0/pyproject.toml +17 -0
- pyplogo-0.1.0/setup.cfg +4 -0
- pyplogo-0.1.0/tests/test.py +63 -0
pyplogo-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyplogo
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A professional tool for protein secondary structure analysis and visualization. Offers a complete solution from structure extraction, multi-theme customization to high-quality publication-ready plotting.
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: biopython>=1.86
|
|
9
|
+
Requires-Dist: matplotlib>=3.10.8
|
|
10
|
+
Requires-Dist: numpy>=2.2.6
|
|
11
|
+
|
|
12
|
+
\# pyplogo-toolkit
|
|
13
|
+
|
|
14
|
+
PyPLogo工具包:专业的蛋白质二级结构分析可视化工具。提供从结构提取、多主题定制到高质量出版级绘图的完整解决方案。
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
PyPLogo Toolkit: A professional tool for protein secondary structure analysis and visualization. Offers a complete solution from structure extraction, multi-theme customization to high-quality publication-ready plotting.
|
|
19
|
+
|
|
20
|
+
<img width="1190" height="790" alt="output-1" src="https://github.com/user-attachments/assets/32ea6516-c587-44ea-9a30-b1a5a79923e1" />
|
|
21
|
+
|
|
22
|
+
<img width="2987" height="1335" alt="output-2" src="https://github.com/user-attachments/assets/bd4a6dc9-a04a-4a95-8114-5c8adfb8266b" />
|
|
23
|
+
|
|
24
|
+
<img width="1190" height="790" alt="output" src="https://github.com/user-attachments/assets/4b34c491-f050-4336-9c99-5d2c9da98381" />
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
|
pyplogo-0.1.0/README.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
\# pyplogo-toolkit
|
|
2
|
+
|
|
3
|
+
PyPLogo工具包:专业的蛋白质二级结构分析可视化工具。提供从结构提取、多主题定制到高质量出版级绘图的完整解决方案。
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
PyPLogo Toolkit: A professional tool for protein secondary structure analysis and visualization. Offers a complete solution from structure extraction, multi-theme customization to high-quality publication-ready plotting.
|
|
8
|
+
|
|
9
|
+
<img width="1190" height="790" alt="output-1" src="https://github.com/user-attachments/assets/32ea6516-c587-44ea-9a30-b1a5a79923e1" />
|
|
10
|
+
|
|
11
|
+
<img width="2987" height="1335" alt="output-2" src="https://github.com/user-attachments/assets/bd4a6dc9-a04a-4a95-8114-5c8adfb8266b" />
|
|
12
|
+
|
|
13
|
+
<img width="1190" height="790" alt="output" src="https://github.com/user-attachments/assets/4b34c491-f050-4336-9c99-5d2c9da98381" />
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PyPLogo - Protein Secondary Structure Visualization Tool
|
|
3
|
+
A Python package for extracting and visualizing protein secondary structures
|
|
4
|
+
from PDB/CIF files using DSSP with publication-quality output.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
__author__ = "Your Name"
|
|
9
|
+
__email__ = "your.email@example.com"
|
|
10
|
+
|
|
11
|
+
# Import main classes for easier access
|
|
12
|
+
from .visualizers.secondary_structure import SecondaryStructureVisualizer
|
|
13
|
+
from .visualizers.themes import ScientificTheme, NatureTheme, \
|
|
14
|
+
DesertTheme, ArcticTheme, TropicalTheme, GemstoneTheme, VintageTheme, \
|
|
15
|
+
SpaceTheme, SpringTheme, AutumnTheme, CoralReefTheme, AuroraTheme, \
|
|
16
|
+
MetalTheme, CandyTheme, OasisTheme, StarlightTheme, SakuraTheme, \
|
|
17
|
+
VolcanoTheme, JadeTheme, TwilightTheme, OnyxTheme, LavenderTheme, \
|
|
18
|
+
HoneyTheme, MintTheme, CoralTheme, AmethystTheme, LemonTheme, \
|
|
19
|
+
SapphireTheme, RubyTheme, EmeraldTheme, TopazTheme, MalachiteTheme, \
|
|
20
|
+
OpalTheme, PearlTheme
|
|
21
|
+
|
|
22
|
+
# Define public API
|
|
23
|
+
__all__ = [
|
|
24
|
+
'SecondaryStructureVisualizer',
|
|
25
|
+
'ScientificTheme',
|
|
26
|
+
'NatureTheme',
|
|
27
|
+
'DesertTheme',
|
|
28
|
+
'ArcticTheme',
|
|
29
|
+
'TropicalTheme',
|
|
30
|
+
'GemstoneTheme',
|
|
31
|
+
'VintageTheme',
|
|
32
|
+
'SpaceTheme',
|
|
33
|
+
'SpringTheme',
|
|
34
|
+
'AutumnTheme',
|
|
35
|
+
'CoralReefTheme',
|
|
36
|
+
'AuroraTheme',
|
|
37
|
+
'MetalTheme',
|
|
38
|
+
'CandyTheme',
|
|
39
|
+
'OasisTheme',
|
|
40
|
+
'StarlightTheme',
|
|
41
|
+
'SakuraTheme',
|
|
42
|
+
'VolcanoTheme',
|
|
43
|
+
'JadeTheme',
|
|
44
|
+
'TwilightTheme',
|
|
45
|
+
'OnyxTheme',
|
|
46
|
+
'LavenderTheme',
|
|
47
|
+
'HoneyTheme',
|
|
48
|
+
'MintTheme',
|
|
49
|
+
'CoralTheme',
|
|
50
|
+
'AmethystTheme',
|
|
51
|
+
'LemonTheme',
|
|
52
|
+
'SapphireTheme',
|
|
53
|
+
'RubyTheme',
|
|
54
|
+
'EmeraldTheme',
|
|
55
|
+
'TopazTheme',
|
|
56
|
+
'MalachiteTheme',
|
|
57
|
+
'OpalTheme',
|
|
58
|
+
'PearlTheme'
|
|
59
|
+
]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Amino acid properties and characteristics
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# Lookup table keyed by one-letter amino-acid code. Each entry records the
# residue's full name, the property class used to pick a colour from
# AA_COLORS, and its weight value.
# NOTE(review): weights look like molecular weights of the free amino acids
# in g/mol -- confirm the unit before using them in calculations.
AA_PROPERTIES = {
    code: {'name': name, 'type': kind, 'weight': weight}
    for code, name, kind, weight in (
        ('A', 'Alanine', 'hydrophobic', 89.1),
        ('C', 'Cysteine', 'polar', 121.2),
        ('D', 'Aspartic Acid', 'acidic', 133.1),
        ('E', 'Glutamic Acid', 'acidic', 147.1),
        ('F', 'Phenylalanine', 'hydrophobic', 165.2),
        ('G', 'Glycine', 'special', 75.1),
        ('H', 'Histidine', 'basic', 155.2),
        ('I', 'Isoleucine', 'hydrophobic', 131.2),
        ('K', 'Lysine', 'basic', 146.2),
        ('L', 'Leucine', 'hydrophobic', 131.2),
        ('M', 'Methionine', 'hydrophobic', 149.2),
        ('N', 'Asparagine', 'polar', 132.1),
        ('P', 'Proline', 'special', 115.1),
        ('Q', 'Glutamine', 'polar', 146.2),
        ('R', 'Arginine', 'basic', 174.2),
        ('S', 'Serine', 'polar', 105.1),
        ('T', 'Threonine', 'polar', 119.1),
        ('V', 'Valine', 'hydrophobic', 117.1),
        ('W', 'Tryptophan', 'hydrophobic', 204.2),
        ('Y', 'Tyrosine', 'polar', 181.2),
    )
}

# Hex colour assigned to each amino-acid property class (the 'type' values
# used in AA_PROPERTIES).
AA_COLORS = dict(
    hydrophobic='#FF6B6B',
    polar='#4ECDC4',
    acidic='#FFE66D',
    basic='#45B7D1',
    special='#96CEB4',
)

# Hex colour assigned to each secondary-structure code.
SS_COLORS = dict(
    H='#FF6B6B',  # alpha-helix: red
    E='#FFE66D',  # beta-strand: yellow
    C='#45B7D1',  # coil: blue
    G='#FF8E72',  # 3-10 helix: orange
    I='#C44D58',  # pi-helix: dark red
    T='#4ECDC4',  # turn: teal
    S='#96CEB4',  # bend: green
)
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import warnings
|
|
3
|
+
import tempfile
|
|
4
|
+
import numpy as np
|
|
5
|
+
from Bio.PDB import PDBParser, MMCIFParser, DSSP
|
|
6
|
+
from Bio.PDB.PDBExceptions import PDBConstructionWarning
|
|
7
|
+
from Bio import Align
|
|
8
|
+
from Bio.Align import substitution_matrices
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import List, Tuple
|
|
11
|
+
|
|
12
|
+
@dataclass
class StructureData:
    """Result record returned by a structure-extraction run."""

    # Amino-acid sequence the record describes.
    sequence: str
    # Secondary-structure string associated with ``sequence``.
    secondary_structure: str
    # Path of the structure file the data was read from.
    source_file: str
    # Label naming how the record was produced (e.g. "dssp").
    method: str
    # Disulfide bonds as pairs of residue numbers; every instance gets its
    # own empty list by default.
    disulfide_bonds: List[Tuple[int, int]] = field(default_factory=list)
|
|
20
|
+
|
|
21
|
+
class StructureExtractor:
    """Extract secondary structure and disulfide bonds from PDB/CIF files using DSSP"""

    # One-letter codes kept when reading DSSP output; anything else is skipped.
    _STANDARD_AA_LETTERS = frozenset('ACDEFGHIKLMNPQRSTVWY')

    # Three-letter residue names treated as standard amino acids.
    _STANDARD_RESNAMES = frozenset((
        'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY',
        'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER',
        'THR', 'TRP', 'TYR', 'VAL',
    ))

    # Two cysteine SG atoms closer than this (in the coordinate units of the
    # structure file) are reported as a disulfide bond.
    _DISULFIDE_MAX_DISTANCE = 3.0

    def __init__(self):
        """Initialize structure extractor.

        Builds the DSSP-code mapping and points ``LIBCIFPP_DATA_DIR`` at the
        ``data/dssp_dicts`` directory that sits next to this package module.
        """
        self.ss_mapping = {
            'H': 'H', 'G': 'G', 'I': 'I',  # Helices
            'E': 'E', 'B': 'E',            # Strands
            'T': 'T', 'S': 'S',            # Turns and bends
            ' ': 'C', '-': 'C'             # Coil
        }

        # Resolve the dictionary directory relative to this file.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        dict_dir = os.path.abspath(os.path.join(base_dir, '..', 'data', 'dssp_dicts'))
        os.environ['LIBCIFPP_DATA_DIR'] = dict_dir
        print(f"设置 DSSP 字典路径: {dict_dir}")

    def _run_dssp(self, model, file_path):
        """Run DSSP, trying several invocation strategies in order.

        Attempts, in order: the ``DSSP`` wrapper with the ``mkdssp``
        executable, the ``DSSP`` wrapper with its default executable, then
        ``dssp_dict_from_pdb_file`` with ``mkdssp`` and with its default.

        Returns:
            Either a ``DSSP`` object or, for the fallback path, the plain
            dictionary produced by ``dssp_dict_from_pdb_file``.

        Raises:
            RuntimeError: if every strategy fails; the last error is chained.
        """
        def dict_fallback(**kwargs):
            # Imported lazily so a missing symbol only affects the fallback.
            from Bio.PDB.DSSP import dssp_dict_from_pdb_file
            # The helper returns (dict, ordered_keys); downstream code only
            # understands the dict, so unpack it here.
            dssp_dict, _keys = dssp_dict_from_pdb_file(file_path, **kwargs)
            return dssp_dict

        attempts = (
            lambda: DSSP(model, file_path, dssp='mkdssp'),
            lambda: DSSP(model, file_path),
            lambda: dict_fallback(DSSP='mkdssp'),
            lambda: dict_fallback(),
        )

        last_error = None
        for attempt in attempts:
            try:
                return attempt()
            except Exception as exc:  # deliberately broad: any failure moves on to the next strategy
                last_error = exc

        raise RuntimeError(
            f"DSSP failed to produce an output: {str(last_error)}"
        ) from last_error

    def _finalize_result(self, model, dssp, chain_id, file_path, input_sequence):
        """Build the StructureData for one chain from a parsed model and DSSP output.

        Shared tail of ``from_pdb`` and ``from_cif``: extracts sequence and
        secondary structure, attaches disulfide bonds and, when
        ``input_sequence`` is given, re-maps the secondary structure onto it.
        """
        result = self._extract_from_dssp(dssp, chain_id, file_path)
        result.disulfide_bonds = self._extract_disulfide_bonds(model, chain_id)

        if not input_sequence:
            return result

        matched_ss = self.match_sequence(
            result.sequence,
            result.secondary_structure,
            input_sequence
        )
        return StructureData(
            sequence=input_sequence,
            secondary_structure=matched_ss,
            source_file=file_path,
            method="dssp (matched)",
            disulfide_bonds=result.disulfide_bonds  # keep the bonds found in the structure
        )

    def from_pdb(self, file_path: str, chain_id: str = 'A', input_sequence: str = None) -> "StructureData":
        """Extract secondary structure and disulfide bonds from a PDB file.

        Args:
            file_path: Path to the PDB file.
            chain_id: Identifier of the chain to extract.
            input_sequence: Optional sequence to map the secondary structure onto.

        Returns:
            StructureData whose ``source_file`` is the absolute path of the
            input file.

        Raises:
            ValueError: if reading, parsing, DSSP or extraction fails; the
                original exception is chained as the cause.
        """
        repaired_path = None
        try:
            source_path = os.path.abspath(file_path)

            # Peek at the first record to see whether a HEADER is present.
            with open(source_path, 'r') as handle:
                first_line = handle.readline()

            working_path = source_path
            if not first_line.startswith('HEADER'):
                print("检测到PDB文件缺少HEADER记录,正在修复...")
                repaired_path = self._fix_pdb_header(source_path)
                working_path = repaired_path

            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PDBConstructionWarning)
                parser = PDBParser(QUIET=True)
                structure = parser.get_structure('protein', working_path)
                model = structure[0]  # only the first model is used

            dssp_result = self._run_dssp(model, working_path)

            # Report the user's file, not the temporary repaired copy that is
            # removed below.
            return self._finalize_result(
                model, dssp_result, chain_id, source_path, input_sequence
            )

        except Exception as e:
            raise ValueError(f"Error processing PDB file: {e}") from e
        finally:
            # The repaired copy is created with delete=False; remove it so
            # repeated calls do not accumulate temporary files.
            if repaired_path is not None:
                try:
                    os.remove(repaired_path)
                except OSError:
                    pass

    def _fix_pdb_header(self, original_file):
        """Write a copy of a PDB file with a HEADER record prepended.

        Returns:
            Path of the newly created temporary ``.pdb`` file. The file is
            not deleted automatically; the caller owns it.
        """
        with open(original_file, 'r') as source:
            content = source.read()

        header = "HEADER PROTEIN 01-JAN-70 FIXED_PDB\n"

        with tempfile.NamedTemporaryFile(mode='w', suffix='.pdb', delete=False) as temp_file:
            temp_file.write(header + content)

        return temp_file.name

    def from_cif(self, file_path: str, chain_id: str = 'A', input_sequence: str = None) -> "StructureData":
        """Extract secondary structure and disulfide bonds from a CIF file.

        Args:
            file_path: Path to the mmCIF file.
            chain_id: Identifier of the chain to extract.
            input_sequence: Optional sequence to map the secondary structure onto.

        Raises:
            ValueError: if parsing, DSSP or extraction fails; the original
                exception is chained as the cause.
        """
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", PDBConstructionWarning)
                parser = MMCIFParser(QUIET=True)
                structure = parser.get_structure('protein', file_path)
                model = structure[0]  # only the first model is used

            dssp = self._run_dssp(model, file_path)
            return self._finalize_result(
                model, dssp, chain_id, file_path, input_sequence
            )

        except Exception as e:
            raise ValueError(f"Error processing CIF file: {e}") from e

    def _extract_from_dssp(self, dssp, chain_id: str, file_path: str) -> "StructureData":
        """Collect sequence and mapped secondary structure for one chain.

        Args:
            dssp: A ``DSSP`` object, or the plain dict returned by the
                ``dssp_dict_from_pdb_file`` fallback.
            chain_id: Chain whose residues are collected.
            file_path: Stored as ``source_file`` on the result.

        Raises:
            ValueError: if no standard residue was found for ``chain_id``.
        """
        if isinstance(dssp, dict):
            # dssp_dict_from_pdb_file values are laid out (aa, ss, acc, ...).
            aa_index, ss_index = 0, 1
            entries = dssp.items()
        else:
            # DSSP object values are laid out (dssp_index, aa, ss, ...).
            aa_index, ss_index = 1, 2
            entries = ((key, dssp[key]) for key in dssp.keys())

        sequence = []
        secondary_structure = []
        for key, value in entries:
            if key[0] != chain_id:
                continue
            aa = value[aa_index]
            # Skip non-standard amino acids.
            if aa not in self._STANDARD_AA_LETTERS:
                continue
            sequence.append(aa)
            # Codes without a mapping are treated as coil.
            secondary_structure.append(self.ss_mapping.get(value[ss_index], 'C'))

        if not sequence:
            raise ValueError(f"No data extracted for chain {chain_id}. "
                             f"Check if chain exists and contains standard amino acids.")

        return StructureData(
            sequence=''.join(sequence),
            secondary_structure=''.join(secondary_structure),
            source_file=file_path,
            method="dssp"
        )

    def _extract_disulfide_bonds(self, model, chain_id: str) -> List[Tuple[int, int]]:
        """Find intra-chain disulfide bonds from SG-SG distances.

        Returns:
            Pairs of residue sequence numbers (as numbered in the structure
            file) whose SG atoms are closer than ``_DISULFIDE_MAX_DISTANCE``.
            Empty when the chain is not present in ``model``.
        """
        chain = next((c for c in model if c.id == chain_id), None)
        if chain is None:
            return []

        # Resolve each cysteine's SG atom once instead of once per pair.
        sulfurs = []
        for residue in chain:
            if residue.get_resname().strip() != 'CYS' or not self._is_amino_acid(residue):
                continue
            sg_atom = None
            for atom in residue:
                if atom.get_name() == 'SG':
                    sg_atom = atom
            if sg_atom is not None:
                sulfurs.append((residue.get_id()[1], sg_atom))

        disulfide_bonds = []
        for index, (res_id1, sg1) in enumerate(sulfurs):
            for res_id2, sg2 in sulfurs[index + 1:]:
                # Subtracting two atoms yields their distance.
                if sg1 - sg2 < self._DISULFIDE_MAX_DISTANCE:
                    disulfide_bonds.append((res_id1, res_id2))

        return disulfide_bonds

    def _is_amino_acid(self, residue):
        """Return True if ``residue`` is named as one of the 20 standard amino acids.

        Objects without a usable ``get_resname()`` are reported as False.
        """
        try:
            return residue.get_resname().strip() in self._STANDARD_RESNAMES
        except AttributeError:
            return False

    def match_sequence(self, structure_sequence, structure_ss, input_sequence):
        """
        Map the structure's secondary structure onto a user-supplied sequence.

        The two sequences are locally aligned (BLOSUM62); every input
        position aligned to a structure residue takes that residue's
        secondary-structure code, all other positions are coil ('C').

        Args:
            structure_sequence: Sequence extracted from the structure file
            structure_ss: Secondary structure extracted from the structure file
            input_sequence: Sequence supplied by the user

        Returns:
            Secondary structure string with the same length as the input
            sequence (or ``structure_ss`` unchanged when the input is empty)
        """
        if not input_sequence:
            return structure_ss

        aligner = Align.PairwiseAligner()
        aligner.mode = 'local'
        aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
        alignments = aligner.align(structure_sequence, input_sequence)

        # Start from all-coil and overwrite the aligned positions.
        matched_ss = ['C'] * len(input_sequence)

        best_alignment = next(iter(alignments), None)
        if best_alignment is None:
            return ''.join(matched_ss)

        # ``aligned`` lists the gap-free (start, end) blocks of the target
        # (structure) and query (input) that are aligned to each other.
        # ``.target`` / ``.query`` are the raw sequences and carry no gaps.
        target_blocks, query_blocks = best_alignment.aligned
        for (t_start, t_end), (q_start, _q_end) in zip(target_blocks, query_blocks):
            t_start, t_end, q_start = int(t_start), int(t_end), int(q_start)
            for offset in range(t_end - t_start):
                if t_start + offset < len(structure_ss):
                    matched_ss[q_start + offset] = structure_ss[t_start + offset]

        return ''.join(matched_ss)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utilities module for PyPLogo
|
|
3
|
+
Contains utility functions for parsing, formatting, and handling protein data
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from .parsers import parse_fasta, validate_sequence, calculate_sequence_stats
|
|
7
|
+
from .formatters import (format_sequence, format_secondary_structure,
|
|
8
|
+
format_confidence_scores, create_sequence_chunks,
|
|
9
|
+
calculate_sequence_position, format_legend,
|
|
10
|
+
normalize_confidence_scores)
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
'parse_fasta',
|
|
14
|
+
'validate_sequence',
|
|
15
|
+
'calculate_sequence_stats',
|
|
16
|
+
'format_sequence',
|
|
17
|
+
'format_secondary_structure',
|
|
18
|
+
'format_confidence_scores',
|
|
19
|
+
'create_sequence_chunks',
|
|
20
|
+
'calculate_sequence_position',
|
|
21
|
+
'format_legend',
|
|
22
|
+
'normalize_confidence_scores'
|
|
23
|
+
]
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utility functions for formatting and chunking protein sequences and structures.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import List, Tuple, Dict, Union
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
def format_sequence(sequence: str, per_line: int = 50) -> List[str]:
    """
    Split an amino acid sequence into consecutive fixed-width lines.

    Args:
        sequence: Amino acid sequence
        per_line: Maximum number of residues on each line

    Returns:
        The sequence pieces in order; the last piece may be shorter.
        An empty (or otherwise falsy) sequence yields an empty list.
    """
    if not sequence:
        return []

    return [
        sequence[start:start + per_line]
        for start in range(0, len(sequence), per_line)
    ]
|
|
29
|
+
|
|
30
|
+
def format_secondary_structure(ss_string: str, line_length: int = 50) -> List[str]:
    """
    Split a secondary structure string into consecutive fixed-width lines.

    Args:
        ss_string: Secondary structure string
        line_length: Maximum number of characters on each line

    Returns:
        The string pieces in order; the last piece may be shorter.
    """
    lines = []
    for start in range(0, len(ss_string), line_length):
        stop = start + line_length
        lines.append(ss_string[start:stop])
    return lines
|
|
43
|
+
|
|
44
|
+
def format_confidence_scores(confidence: np.ndarray,
                             line_length: int = 50) -> List[np.ndarray]:
    """
    Split confidence scores into consecutive fixed-width slices.

    Args:
        confidence: Sliceable collection of confidence scores
        line_length: Maximum number of scores in each slice

    Returns:
        The slices in order; the last slice may be shorter.
    """
    rows = []
    for start in range(0, len(confidence), line_length):
        rows.append(confidence[start:start + line_length])
    return rows
|
|
58
|
+
|
|
59
|
+
def create_sequence_chunks(sequence: str, chunk_size: int = 50) -> List[Tuple[int, int]]:
    """
    Compute the index ranges that split a sequence into chunks.

    Args:
        sequence: Amino acid sequence
        chunk_size: Maximum length of each chunk

    Returns:
        (start, end) pairs, one per chunk, where ``end`` is exclusive and
        never exceeds the sequence length.
    """
    total = len(sequence)
    bounds = []
    for start in range(0, total, chunk_size):
        bounds.append((start, min(start + chunk_size, total)))
    return bounds
|
|
72
|
+
|
|
73
|
+
def calculate_sequence_position(residue_index: int,
                                chunk_size: int = 50,
                                line_index: int = 0) -> Tuple[int, int]:
    """
    Locate a residue within a multi-line layout.

    Args:
        residue_index: Index of residue in sequence
        chunk_size: Number of residues per line
        line_index: Line number assigned to the first line of the layout

    Returns:
        Tuple of (line_number, position_in_line)
    """
    # divmod gives the number of whole lines skipped and the column offset.
    lines_down, column = divmod(residue_index, chunk_size)
    return line_index + lines_down, column
|
|
90
|
+
|
|
91
|
+
def format_legend(ss_types: List[str],
                  color_map: Dict[str, str]) -> str:
    """
    Render a text legend listing each secondary structure type with its color.

    Args:
        ss_types: List of secondary structure types
        color_map: Mapping from SS type to color code

    Returns:
        One "type: color" entry per line; types missing from ``color_map``
        are shown with '#000000'.
    """
    return "\n".join(
        f"{ss_type}: {color_map.get(ss_type, '#000000')}"
        for ss_type in ss_types
    )
|
|
108
|
+
|
|
109
|
+
def normalize_confidence_scores(scores: Union[np.ndarray, List[float]]) -> List[float]:
    """
    Scale confidence scores by their maximum value.

    Args:
        scores: Raw confidence scores as a NumPy array or any iterable of
            numbers; ``None`` is treated as empty.

    Returns:
        A list with each score divided by the largest score. An empty input
        yields an empty list, and a maximum of zero yields all zeros.
        NOTE(review): the result lies in 0-1 only when the scores are
        non-negative -- confirm callers guarantee that.
    """
    if scores is None:
        return []

    # Materialise first and test the list: a NumPy array with more than one
    # element cannot be used directly in a boolean context.
    scores_list = list(scores)
    if not scores_list:
        return []

    max_score = max(scores_list)

    # Avoid dividing by zero when the largest score is zero.
    if max_score == 0:
        return [0.0] * len(scores_list)

    return [score / max_score for score in scores_list]
|