RNApolis 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnapolis/aligner.py +103 -0
- rnapolis/clashfinder.py +4 -4
- rnapolis/component_A.csv +38 -0
- rnapolis/component_C.csv +36 -0
- rnapolis/component_G.csv +39 -0
- rnapolis/component_U.csv +35 -0
- rnapolis/parser_v2.py +305 -1
- rnapolis/tertiary_v2.py +40 -7
- rnapolis/unifier.py +153 -0
- {rnapolis-0.5.0.dist-info → rnapolis-0.6.0.dist-info}/METADATA +3 -2
- rnapolis-0.6.0.dist-info/RECORD +26 -0
- {rnapolis-0.5.0.dist-info → rnapolis-0.6.0.dist-info}/WHEEL +1 -1
- {rnapolis-0.5.0.dist-info → rnapolis-0.6.0.dist-info}/entry_points.txt +2 -0
- rnapolis-0.5.0.dist-info/RECORD +0 -20
- {rnapolis-0.5.0.dist-info → rnapolis-0.6.0.dist-info/licenses}/LICENSE +0 -0
- {rnapolis-0.5.0.dist-info → rnapolis-0.6.0.dist-info}/top_level.txt +0 -0
rnapolis/aligner.py
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
import argparse
|
3
|
+
import os
|
4
|
+
import tempfile
|
5
|
+
|
6
|
+
import pandas as pd
|
7
|
+
|
8
|
+
from rnapolis.parser import is_cif
|
9
|
+
from rnapolis.parser_v2 import parse_cif_atoms, parse_pdb_atoms, write_cif, write_pdb
|
10
|
+
from rnapolis.tertiary_v2 import Structure
|
11
|
+
|
12
|
+
|
13
|
+
def _residue_indices_to_remove(aligned1, aligned2):
    """Translate mismatching alignment columns into per-structure residue indices.

    Parameters:
    -----------
    aligned1, aligned2 : str
        Gapped one-letter sequences of equal alignment length ("-" marks a gap).

    Returns:
    --------
    dict
        ``{"pdb1": [...], "pdb2": [...]}`` with 0-based residue indices, counted
        separately in each sequence and ignoring its gap columns.
    """
    to_remove = {"pdb1": [], "pdb2": []}
    index1 = 0
    index2 = 0

    for c1, c2 in zip(aligned1, aligned2):
        has_residue1 = c1 != "-"
        has_residue2 = c2 != "-"

        if c1 != c2:
            # A gap column has no residue on the gapped side, so only the side
            # that really holds a residue loses one; a plain mismatch removes both.
            if has_residue1:
                to_remove["pdb1"].append(index1)
            if has_residue2:
                to_remove["pdb2"].append(index2)

        # Each sequence advances only on its own non-gap columns, so the
        # indices stay valid for the ungapped residue lists.
        if has_residue1:
            index1 += 1
        if has_residue2:
            index2 += 1

    return to_remove


def main():
    """Main function to run the aligner tool.

    Loads both input files into PyMOL, aligns them, removes from each structure
    the residues that sit in gapped or mismatching alignment columns, and
    writes the trimmed structures into the output directory.
    """
    parser = argparse.ArgumentParser(description="Align two PDB or mmCIF files.")
    parser.add_argument("--output", "-o", help="Output directory", required=True)
    parser.add_argument(
        "--format",
        "-f",
        help="Output format (possible values: PDB, mmCIF, keep. Default: keep)",
        default="keep",
    )
    parser.add_argument("pdb1", help="First PDB or mmCIF file")
    parser.add_argument("pdb2", help="Second PDB or mmCIF file")
    args = parser.parse_args()

    # Imported only after argument parsing, so --help and usage errors are
    # reported without importing PyMOL.
    from pymol import cmd

    cmd.load(args.pdb1, "pdb1")
    cmd.load(args.pdb2, "pdb2")
    cmd.align("pdb1", "pdb2", object="aligned", cycles=0)

    chunks = {"pdb1": [], "pdb2": []}

    with tempfile.NamedTemporaryFile("wt+", suffix=".aln") as aln_file:
        cmd.save(aln_file.name, "aligned")
        aln_file.seek(0)

        for line in aln_file:
            fields = line.split()
            if len(fields) < 2:
                continue
            if line.startswith("pdb1"):
                chunks["pdb1"].append(fields[1])
            elif line.startswith("pdb2"):
                chunks["pdb2"].append(fields[1])

    # Chunks are concatenated without a separator: a separator character would
    # be counted as an alignment column and shift every later index.
    residues_to_remove = _residue_indices_to_remove(
        "".join(chunks["pdb1"]), "".join(chunks["pdb2"])
    )

    if not residues_to_remove["pdb1"] and not residues_to_remove["pdb2"]:
        print("Structures are already aligned")

    inputs = [("pdb1", args.pdb1), ("pdb2", args.pdb2)]
    structures = {}

    for key, path in inputs:
        with open(path) as handle:
            if is_cif(handle):
                atoms = parse_cif_atoms(handle)
            else:
                atoms = parse_pdb_atoms(handle)

        structures[key] = Structure(atoms).residues

    for key, residues in structures.items():
        # Delete from the end so that earlier indices remain valid.
        for i in sorted(residues_to_remove[key], reverse=True):
            del residues[i]

    # Write output
    os.makedirs(args.output, exist_ok=True)

    for key, path in inputs:
        residues = structures[key]
        base, _ = os.path.splitext(os.path.basename(path))

        if not residues:
            print(f"No residues left in {path} after alignment, skipping output")
            continue

        if args.format == "keep":
            output_format = residues[0].atoms.attrs["format"]
        else:
            output_format = args.format

        ext = ".pdb" if output_format == "PDB" else ".cif"

        # Build the frame before opening the file so that a failure here does
        # not leave an empty output file behind.
        df = pd.concat([residue.atoms for residue in residues])
        # NOTE(review): pd.concat does not carry ``attrs`` over in every pandas
        # version; the writers select their column names from attrs["format"],
        # so it is restored explicitly from the first residue.
        if "format" in residues[0].atoms.attrs:
            df.attrs["format"] = residues[0].atoms.attrs["format"]

        with open(os.path.join(args.output, f"{base}{ext}"), "w") as handle:
            if output_format == "PDB":
                write_pdb(df, handle)
            else:
                write_cif(df, handle)


if __name__ == "__main__":
    main()
|
rnapolis/clashfinder.py
CHANGED
@@ -179,20 +179,20 @@ def main():
|
|
179
179
|
for ci, cj in sorted(clashing_chains):
|
180
180
|
if ci == cj:
|
181
181
|
print(
|
182
|
-
f"Clashes found in chain {ci} with maximum occupancy sum equal to {max_occupancy_chains[(ci,cj)]}"
|
182
|
+
f"Clashes found in chain {ci} with maximum occupancy sum equal to {max_occupancy_chains[(ci, cj)]}"
|
183
183
|
)
|
184
184
|
else:
|
185
185
|
print(
|
186
|
-
f"Clashes found between chains {ci} and {cj} with maximum occupancy sum equal to {max_occupancy_chains[(ci,cj)]}"
|
186
|
+
f"Clashes found between chains {ci} and {cj} with maximum occupancy sum equal to {max_occupancy_chains[(ci, cj)]}"
|
187
187
|
)
|
188
188
|
for ri, rj in clashing_chains[(ci, cj)]:
|
189
189
|
if ri == rj:
|
190
190
|
print(
|
191
|
-
f" Clashes found in residue {ri} with maximum occupancy sum equal to {max_occupancy_residues[(ri,rj)]}"
|
191
|
+
f" Clashes found in residue {ri} with maximum occupancy sum equal to {max_occupancy_residues[(ri, rj)]}"
|
192
192
|
)
|
193
193
|
else:
|
194
194
|
print(
|
195
|
-
f" Clashes found between residues {ri} and {rj} with maximum occupancy sum equal to {max_occupancy_residues[(ri,rj)]}"
|
195
|
+
f" Clashes found between residues {ri} and {rj} with maximum occupancy sum equal to {max_occupancy_residues[(ri, rj)]}"
|
196
196
|
)
|
197
197
|
for ai, aj, occupancy in sorted(clashing_chains[(ci, cj)][(ri, rj)]):
|
198
198
|
print(
|
rnapolis/component_A.csv
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
comp_id,atom_id,alt_atom_id,type_symbol,charge,pdbx_align,pdbx_aromatic_flag,pdbx_leaving_atom_flag,pdbx_stereo_config,model_Cartn_x,model_Cartn_y,model_Cartn_z,pdbx_model_Cartn_x_ideal,pdbx_model_Cartn_y_ideal,pdbx_model_Cartn_z_ideal,pdbx_component_atom_id,pdbx_component_comp_id,pdbx_ordinal
|
2
|
+
A,OP3,O3P,O,0,1,N,Y,N,22.586,9.736,-6.030,2.135,-1.141,-5.313,OP3,A,1
|
3
|
+
A,P,P,P,0,1,N,N,N,23.014,10.223,-7.491,1.024,-0.137,-4.723,P,A,2
|
4
|
+
A,OP1,O1P,O,0,1,N,N,N,21.938,9.966,-8.412,1.633,1.190,-4.488,OP1,A,3
|
5
|
+
A,OP2,O2P,O,0,1,N,N,N,24.378,9.686,-7.860,-0.183,0.005,-5.778,OP2,A,4
|
6
|
+
A,O5',O5*,O,0,1,N,N,N,23.144,11.720,-7.092,0.456,-0.720,-3.334,O5',A,5
|
7
|
+
A,C5',C5*,C,0,1,N,N,N,24.013,12.484,-7.839,-0.520,0.209,-2.863,C5',A,6
|
8
|
+
A,C4',C4*,C,0,1,N,N,R,23.996,13.899,-7.276,-1.101,-0.287,-1.538,C4',A,7
|
9
|
+
A,O4',O4*,O,0,1,N,N,N,24.523,13.928,-5.945,-0.064,-0.383,-0.538,O4',A,8
|
10
|
+
A,C3',C3*,C,0,1,N,N,S,24.868,14.877,-8.041,-2.105,0.739,-0.969,C3',A,9
|
11
|
+
A,O3',O3*,O,0,1,N,N,N,24.195,15.389,-9.178,-3.445,0.360,-1.287,O3',A,10
|
12
|
+
A,C2',C2*,C,0,1,N,N,R,25.172,15.942,-7.004,-1.874,0.684,0.558,C2',A,11
|
13
|
+
A,O2',O2*,O,0,1,N,N,N,24.060,16.759,-6.874,-3.065,0.271,1.231,O2',A,12
|
14
|
+
A,C1',C1*,C,0,1,N,N,R,25.387,15.094,-5.814,-0.755,-0.367,0.729,C1',A,13
|
15
|
+
A,N9,N9,N,0,1,Y,N,N,26.745,14.503,-5.630,0.158,0.029,1.803,N9,A,14
|
16
|
+
A,C8,C8,C,0,1,Y,N,N,27.163,13.163,-5.773,1.265,0.813,1.672,C8,A,15
|
17
|
+
A,N7,N7,N,0,1,Y,N,N,28.488,13.043,-5.514,1.843,0.963,2.828,N7,A,16
|
18
|
+
A,C5,C5,C,0,1,Y,N,N,28.887,14.292,-5.222,1.143,0.292,3.773,C5,A,17
|
19
|
+
A,C6,C6,C,0,1,Y,N,N,30.177,14.748,-4.871,1.290,0.091,5.156,C6,A,18
|
20
|
+
A,N6,N6,N,0,1,N,N,N,31.245,13.997,-4.775,2.344,0.664,5.846,N6,A,19
|
21
|
+
A,N1,N1,N,0,1,Y,N,N,30.286,16.119,-4.615,0.391,-0.656,5.787,N1,A,20
|
22
|
+
A,C2,C2,C,0,1,Y,N,N,29.272,16.866,-4.702,-0.617,-1.206,5.136,C2,A,21
|
23
|
+
A,N3,N3,N,0,1,Y,N,N,27.992,16.509,-5.030,-0.792,-1.051,3.841,N3,A,22
|
24
|
+
A,C4,C4,C,0,1,Y,N,N,27.856,15.249,-5.271,0.056,-0.320,3.126,C4,A,23
|
25
|
+
A,HOP3,3HOP,H,0,0,N,N,N,23.296,9.905,-5.422,2.448,-0.755,-6.142,HOP3,A,24
|
26
|
+
A,HOP2,2HOP,H,0,0,N,N,N,25.088,9.855,-7.252,-0.552,-0.879,-5.902,HOP2,A,25
|
27
|
+
A,H5',1H5*,H,0,1,N,N,N,25.039,12.050,-7.885,-1.319,0.301,-3.599,H5',A,26
|
28
|
+
A,H5'',2H5*,H,0,0,N,N,N,23.788,12.450,-8.930,-0.052,1.182,-2.712,H5'',A,27
|
29
|
+
A,H4',H4*,H,0,1,N,N,N,22.923,14.196,-7.338,-1.586,-1.254,-1.677,H4',A,28
|
30
|
+
A,H3',H3*,H,0,1,N,N,N,25.794,14.420,-8.461,-1.890,1.736,-1.353,H3',A,29
|
31
|
+
A,HO3',H3T,H,0,0,N,Y,N,24.741,16.001,-9.657,-4.024,1.035,-0.908,HO3',A,30
|
32
|
+
A,H2',H2*,H,0,1,N,N,N,26.025,16.626,-7.219,-1.543,1.654,0.930,H2',A,31
|
33
|
+
A,HO2',2HO*,H,0,0,N,N,N,24.250,17.425,-6.224,-3.740,0.936,1.037,HO2',A,32
|
34
|
+
A,H1',H1*,H,0,1,N,N,N,25.197,15.783,-4.958,-1.185,-1.346,0.940,H1',A,33
|
35
|
+
A,H8,H8,H,0,1,N,N,N,26.526,12.307,-6.055,1.611,1.246,0.745,H8,A,34
|
36
|
+
A,H61,1H6,H,0,1,N,N,N,32.176,14.326,-4.521,2.432,0.522,6.801,H61,A,35
|
37
|
+
A,H62,2H6,H,0,1,N,N,N,31.340,13.509,-5.666,2.996,1.205,5.374,H62,A,36
|
38
|
+
A,H2,H2,H,0,1,N,N,N,29.524,17.915,-4.473,-1.325,-1.807,5.688,H2,A,37
|
rnapolis/component_C.csv
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
comp_id,atom_id,alt_atom_id,type_symbol,charge,pdbx_align,pdbx_aromatic_flag,pdbx_leaving_atom_flag,pdbx_stereo_config,model_Cartn_x,model_Cartn_y,model_Cartn_z,pdbx_model_Cartn_x_ideal,pdbx_model_Cartn_y_ideal,pdbx_model_Cartn_z_ideal,pdbx_component_atom_id,pdbx_component_comp_id,pdbx_ordinal
|
2
|
+
C,OP3,O3P,O,0,1,N,Y,N,26.803,20.514,-11.017,2.147,-1.021,-4.678,OP3,C,1
|
3
|
+
C,P,P,P,0,1,N,N,N,27.386,20.433,-12.503,1.049,-0.039,-4.028,P,C,2
|
4
|
+
C,OP1,O1P,O,0,1,N,N,N,26.539,21.293,-13.322,1.692,1.237,-3.646,OP1,C,3
|
5
|
+
C,OP2,O2P,O,0,1,N,N,N,27.570,19.015,-12.877,-0.116,0.246,-5.102,OP2,C,4
|
6
|
+
C,O5',O5*,O,0,1,N,N,N,28.830,21.055,-12.361,0.415,-0.733,-2.721,O5',C,5
|
7
|
+
C,C5',C5*,C,0,1,N,N,N,29.051,22.423,-11.973,-0.546,0.181,-2.193,C5',C,6
|
8
|
+
C,C4',C4*,C,0,1,N,N,R,30.525,22.652,-11.850,-1.189,-0.419,-0.942,C4',C,7
|
9
|
+
C,O4',O4*,O,0,1,N,N,N,30.993,22.001,-10.600,-0.190,-0.648,0.076,O4',C,8
|
10
|
+
C,C3',C3*,C,0,1,N,N,S,31.457,22.096,-12.933,-2.178,0.583,-0.307,C3',C,9
|
11
|
+
C,O3',O3*,O,0,1,N,N,N,31.346,22.915,-14.074,-3.518,0.283,-0.703,O3',C,10
|
12
|
+
C,C2',C2*,C,0,1,N,N,R,32.751,22.157,-12.194,-2.001,0.373,1.215,C2',C,11
|
13
|
+
C,O2',O2*,O,0,1,N,N,N,33.186,23.463,-12.031,-3.228,-0.059,1.806,O2',C,12
|
14
|
+
C,C1',C1*,C,0,1,N,N,R,32.361,21.627,-10.851,-0.924,-0.729,1.317,C1',C,13
|
15
|
+
C,N1,N1,N,0,1,N,N,N,32.476,20.131,-10.779,-0.036,-0.470,2.453,N1,C,14
|
16
|
+
C,C2,C2,C,0,1,N,N,N,33.674,19.589,-10.493,0.652,0.683,2.514,C2,C,15
|
17
|
+
C,O2,O2,O,0,1,N,N,N,34.680,20.354,-10.277,0.529,1.504,1.620,O2,C,16
|
18
|
+
C,N3,N3,N,0,1,N,N,N,33.855,18.230,-10.434,1.467,0.945,3.535,N3,C,17
|
19
|
+
C,C4,C4,C,0,1,N,N,N,32.804,17.495,-10.663,1.620,0.070,4.520,C4,C,18
|
20
|
+
C,N4,N4,N,0,1,N,N,N,32.905,16.139,-10.606,2.464,0.350,5.569,N4,C,19
|
21
|
+
C,C5,C5,C,0,1,N,N,N,31.488,18.044,-10.975,0.916,-1.151,4.483,C5,C,20
|
22
|
+
C,C6,C6,C,0,1,N,N,N,31.389,19.360,-11.041,0.087,-1.399,3.442,C6,C,21
|
23
|
+
C,HOP3,3HOP,H,0,0,N,N,N,27.354,19.953,-10.483,2.501,-0.569,-5.456,HOP3,C,22
|
24
|
+
C,HOP2,2HOP,H,0,0,N,N,N,28.121,18.454,-12.343,-0.508,-0.608,-5.323,HOP2,C,23
|
25
|
+
C,H5',1H5*,H,0,1,N,N,N,28.562,23.147,-12.665,-1.315,0.371,-2.941,H5',C,24
|
26
|
+
C,H5'',2H5*,H,0,0,N,N,N,28.496,22.699,-11.045,-0.052,1.118,-1.933,H5'',C,25
|
27
|
+
C,H4',H4*,H,0,1,N,N,N,30.596,23.763,-11.911,-1.699,-1.350,-1.188,H4',C,26
|
28
|
+
C,H3',H3*,H,0,1,N,N,N,31.269,21.074,-13.339,-1.917,1.604,-0.586,H3',C,27
|
29
|
+
C,HO3',H3T,H,0,0,N,Y,N,31.923,22.570,-14.745,-4.088,0.939,-0.278,HO3',C,28
|
30
|
+
C,H2',H2*,H,0,1,N,N,N,33.568,21.607,-12.716,-1.653,1.290,1.689,H2',C,29
|
31
|
+
C,HO2',2HO*,H,0,0,N,N,N,34.010,23.501,-11.560,-3.874,0.644,1.656,HO2',C,30
|
32
|
+
C,H1',H1*,H,0,1,N,N,N,33.051,22.057,-10.088,-1.392,-1.708,1.418,H1',C,31
|
33
|
+
C,H41,1H4,H,0,1,N,N,N,33.824,15.755,-10.388,2.950,1.189,5.590,H41,C,32
|
34
|
+
C,H42,2H4,H,0,1,N,N,N,32.564,15.734,-11.478,2.571,-0.289,6.290,H42,C,33
|
35
|
+
C,H5,H5,H,0,1,N,N,N,30.568,17.464,-11.160,1.030,-1.873,5.278,H5,C,34
|
36
|
+
C,H6,H6,H,0,1,N,N,N,30.417,19.806,-11.309,-0.465,-2.326,3.393,H6,C,35
|
rnapolis/component_G.csv
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
comp_id,atom_id,alt_atom_id,type_symbol,charge,pdbx_align,pdbx_aromatic_flag,pdbx_leaving_atom_flag,pdbx_stereo_config,model_Cartn_x,model_Cartn_y,model_Cartn_z,pdbx_model_Cartn_x_ideal,pdbx_model_Cartn_y_ideal,pdbx_model_Cartn_z_ideal,pdbx_component_atom_id,pdbx_component_comp_id,pdbx_ordinal
|
2
|
+
G,OP3,O3P,O,0,1,N,Y,N,34.313,3.314,-0.422,-1.945,-1.360,5.599,OP3,G,1
|
3
|
+
G,P,P,P,0,1,N,N,N,33.741,4.431,0.367,-0.911,-0.277,5.008,P,G,2
|
4
|
+
G,OP1,O1P,O,0,1,N,N,N,33.537,5.671,-0.451,-1.598,1.022,4.844,OP1,G,3
|
5
|
+
G,OP2,O2P,O,0,1,N,N,N,34.442,4.727,1.661,0.325,-0.105,6.025,OP2,G,4
|
6
|
+
G,O5',O5*,O,0,1,N,N,N,32.289,3.932,0.811,-0.365,-0.780,3.580,O5',G,5
|
7
|
+
G,C5',C5*,C,0,1,N,N,N,32.101,2.551,1.198,0.542,0.217,3.109,C5',G,6
|
8
|
+
G,C4',C4*,C,0,1,N,N,R,30.760,2.450,1.879,1.100,-0.200,1.748,C4',G,7
|
9
|
+
G,O4',O4*,O,0,1,N,N,N,30.797,3.202,3.104,0.033,-0.318,0.782,O4',G,8
|
10
|
+
G,C3',C3*,C,0,1,N,N,S,29.597,3.022,1.070,2.025,0.898,1.182,C3',G,9
|
11
|
+
G,O3',O3*,O,0,1,N,N,N,29.106,2.045,0.152,3.395,0.582,1.439,O3',G,10
|
12
|
+
G,C2',C2*,C,0,1,N,N,R,28.603,3.421,2.118,1.741,0.884,-0.338,C2',G,11
|
13
|
+
G,O2',O2*,O,0,1,N,N,N,27.930,2.319,2.657,2.927,0.560,-1.066,O2',G,12
|
14
|
+
G,C1',C1*,C,0,1,N,N,R,29.487,3.936,3.170,0.675,-0.220,-0.507,C1',G,13
|
15
|
+
G,N9,N9,N,0,1,Y,N,N,29.942,5.378,3.195,-0.297,0.162,-1.534,N9,G,14
|
16
|
+
G,C8,C8,C,0,1,Y,N,N,31.187,5.907,3.065,-1.440,0.880,-1.334,C8,G,15
|
17
|
+
G,N7,N7,N,0,1,Y,N,N,31.237,7.191,3.136,-2.066,1.037,-2.464,N7,G,16
|
18
|
+
G,C5,C5,C,0,1,Y,N,N,29.896,7.536,3.341,-1.364,0.431,-3.453,C5,G,17
|
19
|
+
G,C6,C6,C,0,1,N,N,N,29.331,8.813,3.503,-1.556,0.279,-4.846,C6,G,18
|
20
|
+
G,O6,O6,O,0,1,N,N,N,29.901,9.926,3.495,-2.534,0.755,-5.397,O6,G,19
|
21
|
+
G,N1,N1,N,0,1,N,N,N,27.948,8.749,3.683,-0.626,-0.401,-5.551,N1,G,20
|
22
|
+
G,C2,C2,C,0,1,N,N,N,27.233,7.615,3.707,0.459,-0.934,-4.923,C2,G,21
|
23
|
+
G,N2,N2,N,0,1,N,N,N,25.894,7.743,3.899,1.384,-1.626,-5.664,N2,G,22
|
24
|
+
G,N3,N3,N,0,1,N,N,N,27.758,6.393,3.559,0.649,-0.800,-3.630,N3,G,23
|
25
|
+
G,C4,C4,C,0,1,Y,N,N,29.079,6.431,3.382,-0.226,-0.134,-2.868,C4,G,24
|
26
|
+
G,HOP3,3HOP,H,0,0,N,N,N,34.442,2.528,0.096,-2.247,-1.021,6.453,HOP3,G,25
|
27
|
+
G,HOP2,2HOP,H,0,0,N,N,N,34.571,3.941,2.179,0.745,-0.973,6.104,HOP2,G,26
|
28
|
+
G,H5',1H5*,H,0,1,N,N,N,32.209,1.841,0.344,1.362,0.327,3.820,H5',G,27
|
29
|
+
G,H5'',2H5*,H,0,0,N,N,N,32.936,2.156,1.822,0.018,1.168,3.011,H5'',G,28
|
30
|
+
G,H4',H4*,H,0,1,N,N,N,30.585,1.358,2.025,1.640,-1.144,1.833,H4',G,29
|
31
|
+
G,H3',H3*,H,0,1,N,N,N,29.867,3.891,0.426,1.772,1.868,1.610,H3',G,30
|
32
|
+
G,HO3',H3T,H,0,0,N,Y,N,28.382,2.400,-0.351,3.923,1.300,1.065,HO3',G,31
|
33
|
+
G,H2',H2*,H,0,1,N,N,N,27.827,4.115,1.719,1.346,1.847,-0.662,H2',G,32
|
34
|
+
G,HO2',2HO*,H,0,0,N,N,N,27.299,2.572,3.321,3.573,1.254,-0.871,HO2',G,33
|
35
|
+
G,H1',H1*,H,0,1,N,N,N,28.814,3.801,4.048,1.148,-1.167,-0.769,H1',G,34
|
36
|
+
G,H8,H8,H,0,1,N,N,N,32.110,5.323,2.909,-1.776,1.261,-0.381,H8,G,35
|
37
|
+
G,H1,H1,H,0,1,N,N,N,27.411,9.607,3.808,-0.736,-0.518,-6.508,H1,G,36
|
38
|
+
G,H21,1H2,H,0,1,N,N,N,25.350,6.880,3.917,2.165,-2.007,-5.232,H21,G,37
|
39
|
+
G,H22,2H2,H,0,1,N,N,N,25.507,8.377,3.200,1.256,-1.736,-6.619,H22,G,38
|
rnapolis/component_U.csv
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
comp_id,atom_id,alt_atom_id,type_symbol,charge,pdbx_align,pdbx_aromatic_flag,pdbx_leaving_atom_flag,pdbx_stereo_config,model_Cartn_x,model_Cartn_y,model_Cartn_z,pdbx_model_Cartn_x_ideal,pdbx_model_Cartn_y_ideal,pdbx_model_Cartn_z_ideal,pdbx_component_atom_id,pdbx_component_comp_id,pdbx_ordinal
|
2
|
+
U,OP3,O3P,O,0,1,N,Y,N,29.106,2.045,0.152,-2.122,1.033,-4.690,OP3,U,1
|
3
|
+
U,P,P,P,0,1,N,N,N,28.940,2.442,-1.379,-1.030,0.047,-4.037,P,U,2
|
4
|
+
U,OP1,O1P,O,0,1,N,N,N,28.520,1.217,-2.078,-1.679,-1.228,-3.660,OP1,U,3
|
5
|
+
U,OP2,O2P,O,0,1,N,N,N,30.133,3.129,-1.866,0.138,-0.241,-5.107,OP2,U,4
|
6
|
+
U,O5',O5*,O,0,1,N,N,N,27.784,3.522,-1.387,-0.399,0.736,-2.726,O5',U,5
|
7
|
+
U,C5',C5*,C,0,1,N,N,N,26.432,3.117,-1.047,0.557,-0.182,-2.196,C5',U,6
|
8
|
+
U,C4',C4*,C,0,1,N,N,R,25.647,4.373,-0.834,1.197,0.415,-0.942,C4',U,7
|
9
|
+
U,O4',O4*,O,0,1,N,N,N,26.122,5.093,0.327,0.194,0.645,0.074,O4',U,8
|
10
|
+
U,C3',C3*,C,0,1,N,N,S,25.763,5.465,-1.895,2.181,-0.588,-0.301,C3',U,9
|
11
|
+
U,O3',O3*,O,0,1,N,N,N,25.041,5.077,-3.062,3.524,-0.288,-0.686,O3',U,10
|
12
|
+
U,C2',C2*,C,0,1,N,N,R,25.213,6.663,-1.148,1.995,-0.383,1.218,C2',U,11
|
13
|
+
U,O2',O2*,O,0,1,N,N,N,23.829,6.500,-1.012,3.219,0.046,1.819,O2',U,12
|
14
|
+
U,C1',C1*,C,0,1,N,N,R,25.917,6.524,0.133,0.922,0.723,1.319,C1',U,13
|
15
|
+
U,N1,N1,N,0,1,N,N,N,27.224,7.194,0.137,0.028,0.464,2.451,N1,U,14
|
16
|
+
U,C2,C2,C,0,1,N,N,N,27.201,8.578,0.406,-0.690,-0.671,2.486,C2,U,15
|
17
|
+
U,O2,O2,O,0,1,N,N,N,26.156,9.121,0.619,-0.587,-1.474,1.580,O2,U,16
|
18
|
+
U,N3,N3,N,0,1,N,N,N,28.408,9.189,0.403,-1.515,-0.936,3.517,N3,U,17
|
19
|
+
U,C4,C4,C,0,1,N,N,N,29.660,8.606,0.152,-1.641,-0.055,4.530,C4,U,18
|
20
|
+
U,O4,O4,O,0,1,N,N,N,30.676,9.330,0.195,-2.391,-0.292,5.460,O4,U,19
|
21
|
+
U,C5,C5,C,0,1,N,N,N,29.604,7.215,-0.113,-0.894,1.146,4.502,C5,U,20
|
22
|
+
U,C6,C6,C,0,1,N,N,N,28.447,6.605,-0.111,-0.070,1.384,3.459,C6,U,21
|
23
|
+
U,HOP3,3HOP,H,0,0,N,N,N,29.377,2.835,0.603,-2.475,0.583,-5.470,HOP3,U,22
|
24
|
+
U,HOP2,2HOP,H,0,0,N,N,N,30.404,3.919,-1.414,0.534,0.613,-5.325,HOP2,U,23
|
25
|
+
U,H5',1H5*,H,0,1,N,N,N,25.974,2.434,-1.800,1.329,-0.373,-2.942,H5',U,24
|
26
|
+
U,H5'',2H5*,H,0,0,N,N,N,26.387,2.417,-0.179,0.060,-1.117,-1.940,H5'',U,25
|
27
|
+
U,H4',H4*,H,0,1,N,N,N,24.613,3.957,-0.795,1.712,1.345,-1.185,H4',U,26
|
28
|
+
U,H3',H3*,H,0,1,N,N,N,26.782,5.677,-2.293,1.923,-1.609,-0.583,H3',U,27
|
29
|
+
U,HO3',H3T,H,0,0,N,Y,N,25.113,5.756,-3.722,4.094,-0.926,-0.234,HO3',U,28
|
30
|
+
U,H2',H2*,H,0,1,N,N,N,25.358,7.661,-1.622,1.643,-1.301,1.688,H2',U,29
|
31
|
+
U,HO2',2HO*,H,0,0,N,N,N,23.484,7.251,-0.543,3.865,-0.657,1.671,HO2',U,30
|
32
|
+
U,H1',H1*,H,0,1,N,N,N,25.312,6.996,0.941,1.392,1.700,1.423,H1',U,31
|
33
|
+
U,H3,H3,H,0,1,N,N,N,28.370,10.187,0.610,-2.024,-1.762,3.528,H3,U,32
|
34
|
+
U,H5,H5,H,0,1,N,N,N,30.486,6.589,-0.327,-0.982,1.863,5.305,H5,U,33
|
35
|
+
U,H6,H6,H,0,1,N,N,N,28.506,5.526,-0.332,0.507,2.295,3.421,H6,U,34
|
rnapolis/parser_v2.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1
|
-
|
1
|
+
import io
|
2
|
+
import tempfile
|
3
|
+
from typing import IO, TextIO, Union
|
2
4
|
|
3
5
|
import pandas as pd
|
4
6
|
from mmcif.io.IoAdapterPy import IoAdapterPy
|
7
|
+
from mmcif.io.PdbxReader import DataCategory, DataContainer
|
5
8
|
|
6
9
|
|
7
10
|
def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
|
@@ -200,3 +203,304 @@ def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
|
|
200
203
|
df.attrs["format"] = "mmCIF"
|
201
204
|
|
202
205
|
return df
|
206
|
+
|
207
|
+
|
208
|
+
def _pdb_text(value) -> str:
    """Return *value* as stripped text; None, NaN and the mmCIF "?"/"." markers become ""."""
    if value is None:
        return ""
    if not isinstance(value, str):
        if pd.isna(value):
            return ""
        value = str(value)
    value = value.strip()
    return "" if value in ("?", ".") else value


def _pdb_float(value, default: float) -> float:
    """Convert *value* to float, using *default* when the value is missing."""
    if _pdb_text(value) == "":
        return default
    return float(value)


def _pdb_charge(value) -> str:
    """Return the charge in PDB notation ("1+", "2-"); "" for zero or missing."""
    text = _pdb_text(value)
    if not text:
        return ""
    try:
        number = int(float(text))
    except (ValueError, OverflowError):
        # Not numeric (e.g. already "1+"): keep the text as it is.
        return text
    if number == 0:
        return ""
    return f"{abs(number)}{'+' if number > 0 else '-'}"


def _pdb_fit(text: str, width: int) -> str:
    """Cut or right-pad *text* so that it occupies exactly *width* columns."""
    return text[:width].ljust(width)


def write_pdb(
    df: pd.DataFrame, output: Union[str, TextIO, None] = None
) -> Union[str, None]:
    """
    Write a DataFrame of atom records to PDB format.

    Every record is assembled from fixed-width fields, so an over-long or
    missing value can never shift the columns that follow it.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms
    output : Union[str, TextIO, None], optional
        Output file path or file-like object. If None, returns the PDB content as a string.

    Returns:
    --------
    Union[str, None]
        If output is None, returns the PDB content as a string. Otherwise, returns None.
    """
    # Any format other than "PDB" is read using the mmCIF column names.
    is_pdb = df.attrs.get("format", "PDB") == "PDB"

    lines = []

    for _, row in df.iterrows():
        if is_pdb:
            record_type = row["record_type"]
            serial = row["serial"]
            atom_name = row["name"]
            alt_loc = row.get("altLoc", "")
            res_name = row["resName"]
            chain_id = row["chainID"]
            res_seq = row["resSeq"]
            icode = row["iCode"]
            x, y, z = row["x"], row["y"], row["z"]
            occupancy = row["occupancy"]
            temp_factor = row["tempFactor"]
            element = row["element"]
            charge = row["charge"]
        else:  # mmCIF
            record_type = row.get("group_PDB", "ATOM")
            serial = row["id"]
            atom_name = row.get("auth_atom_id", row.get("label_atom_id", ""))
            alt_loc = row.get("label_alt_id", "")
            res_name = row.get("auth_comp_id", row.get("label_comp_id", ""))
            chain_id = row.get("auth_asym_id", row.get("label_asym_id", ""))
            res_seq = row.get("auth_seq_id", row.get("label_seq_id", 0))
            icode = row.get("pdbx_PDB_ins_code", "")
            x, y, z = row["Cartn_x"], row["Cartn_y"], row["Cartn_z"]
            occupancy = row.get("occupancy", 1.0)
            temp_factor = row.get("B_iso_or_equiv", 0.0)
            element = row.get("type_symbol", "")
            charge = row.get("pdbx_formal_charge", "")

        atom_name = _pdb_text(atom_name)
        # Names that start with a digit or already fill all four columns begin
        # in column 13; shorter names are shifted one column to the right.
        if len(atom_name) >= 4 or (atom_name and atom_name[0].isdigit()):
            name_field = _pdb_fit(atom_name, 4)
        else:
            name_field = " " + _pdb_fit(atom_name, 3)

        line = "".join(
            [
                _pdb_fit(_pdb_text(record_type) or "ATOM", 6),  # columns 1-6
                # A six-digit serial is allowed to spill into column 12.
                _pdb_fit(str(int(serial)).rjust(5), 6),  # columns 7-12
                name_field,  # columns 13-16
                _pdb_fit(_pdb_text(alt_loc), 1),  # column 17
                _pdb_fit(_pdb_text(res_name), 3),  # columns 18-20
                " ",  # column 21
                _pdb_fit(_pdb_text(chain_id), 1),  # column 22
                _pdb_fit(str(int(res_seq)).rjust(4), 4),  # columns 23-26
                _pdb_fit(_pdb_text(icode), 1),  # column 27
                "   ",  # columns 28-30
                _pdb_fit(f"{float(x):8.3f}", 8),  # columns 31-38
                _pdb_fit(f"{float(y):8.3f}", 8),  # columns 39-46
                _pdb_fit(f"{float(z):8.3f}", 8),  # columns 47-54
                _pdb_fit(f"{_pdb_float(occupancy, 1.0):6.2f}", 6),  # columns 55-60
                _pdb_fit(f"{_pdb_float(temp_factor, 0.0):6.2f}", 6),  # columns 61-66
                " " * 10,  # columns 67-76
                _pdb_fit(_pdb_text(element).rjust(2), 2),  # columns 77-78
                _pdb_fit(_pdb_charge(charge), 2),  # columns 79-80
            ]
        )

        lines.append(line.rstrip() + "\n")

    # Add END record
    lines.append("END\n")
    content = "".join(lines)

    # Write to output if provided
    if output is not None:
        if isinstance(output, str):
            with open(output, "w") as f:
                f.write(content)
        else:
            output.write(content)
        return None

    # Return the content as a string
    return content
|
382
|
+
|
383
|
+
|
384
|
+
def _cif_value(value, missing: str = "?") -> str:
    """Return *value* as text; None, NaN and blank strings become *missing*."""
    if value is None:
        return missing
    if not isinstance(value, str):
        if pd.isna(value):
            return missing
        return str(value)
    return value if value.strip() else missing


def _cif_formal_charge(value) -> str:
    """Return the charge as a signed integer string; PDB-style "1+"/"2-" is converted."""
    text = _cif_value(value).strip()
    if len(text) == 2 and text[0].isdigit() and text[1] in "+-":
        return text[0] if text[1] == "+" else f"-{text[0]}"
    return text


def _cif_to_string(data_container) -> str:
    """Serialize *data_container* through a temporary file and return the text."""
    adapter = IoAdapterPy()
    # The adapter is given a file name, so the content is written to a
    # temporary file and read back through the same handle.
    with tempfile.NamedTemporaryFile(mode="w+", suffix=".cif") as temp_file:
        adapter.writeFile(temp_file.name, [data_container])
        temp_file.flush()
        temp_file.seek(0)
        return temp_file.read()


def write_cif(
    df: pd.DataFrame, output: Union[str, TextIO, None] = None
) -> Union[str, None]:
    """
    Write a DataFrame of atom records to mmCIF format.

    Missing values (None, NaN, blank strings) are written with the CIF
    placeholders "?" or "." instead of "", "nan" or "None".

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing atom records, as created by parse_pdb_atoms or parse_cif_atoms
    output : Union[str, TextIO, None], optional
        Output file path or file-like object. If None, returns the mmCIF content as a string.

    Returns:
    --------
    Union[str, None]
        If output is None, returns the mmCIF content as a string. Otherwise, returns None.
    """
    # Get the format of the DataFrame
    format_type = df.attrs.get("format", "PDB")

    # Define the attributes for atom_site category
    if format_type == "mmCIF":
        # Use existing mmCIF attributes
        attributes = list(df.columns)
    else:  # PDB format
        # Map PDB columns to mmCIF attributes
        attributes = [
            "group_PDB",  # record_type
            "id",  # serial
            "type_symbol",  # element
            "label_atom_id",  # name
            "label_alt_id",  # altLoc
            "label_comp_id",  # resName
            "label_asym_id",  # chainID
            "label_entity_id",  # (generated)
            "label_seq_id",  # resSeq
            "pdbx_PDB_ins_code",  # iCode
            "Cartn_x",  # x
            "Cartn_y",  # y
            "Cartn_z",  # z
            "occupancy",  # occupancy
            "B_iso_or_equiv",  # tempFactor
            "pdbx_formal_charge",  # charge
            "auth_seq_id",  # resSeq
            "auth_comp_id",  # resName
            "auth_asym_id",  # chainID
            "auth_atom_id",  # name
            "pdbx_PDB_model_num",  # (generated)
        ]

    # Prepare rows for the atom_site category
    rows = []

    for _, row in df.iterrows():
        if format_type == "mmCIF":
            # Use existing mmCIF data
            row_data = [_cif_value(row.get(attr, "?")) for attr in attributes]
        else:  # PDB format
            # PDB records carry no entity or model column here, so constants are used.
            entity_id = "1"
            model_num = "1"

            atom_name = str(row["name"])
            res_name = str(row["resName"])
            chain_id = str(row["chainID"])
            res_seq = str(int(row["resSeq"]))

            row_data = [
                str(row["record_type"]),  # group_PDB
                str(int(row["serial"])),  # id
                _cif_value(row["element"]),  # type_symbol
                atom_name,  # label_atom_id
                _cif_value(row.get("altLoc", ""), "."),  # label_alt_id
                res_name,  # label_comp_id
                chain_id,  # label_asym_id
                entity_id,  # label_entity_id
                res_seq,  # label_seq_id
                _cif_value(row["iCode"]),  # pdbx_PDB_ins_code
                f"{float(row['x']):.3f}",  # Cartn_x
                f"{float(row['y']):.3f}",  # Cartn_y
                f"{float(row['z']):.3f}",  # Cartn_z
                f"{float(row['occupancy']):.2f}",  # occupancy
                f"{float(row['tempFactor']):.2f}",  # B_iso_or_equiv
                _cif_formal_charge(row.get("charge", "")),  # pdbx_formal_charge
                res_seq,  # auth_seq_id
                res_name,  # auth_comp_id
                chain_id,  # auth_asym_id
                atom_name,  # auth_atom_id
                model_num,  # pdbx_PDB_model_num
            ]

        rows.append(row_data)

    # Create the atom_site category and put it into a new data container
    data_container = DataContainer("data_structure")
    data_container.append(DataCategory("atom_site", attributes, rows))

    # Handle output
    if output is None:
        return _cif_to_string(data_container)

    if isinstance(output, str):
        # Write to a file path
        IoAdapterPy().writeFile(output, [data_container])
        return None

    # Write to a file-like object
    output.write(_cif_to_string(data_container))
    return None
|
rnapolis/tertiary_v2.py
CHANGED
@@ -379,7 +379,7 @@ class Residue:
|
|
379
379
|
self.atoms = residue_df
|
380
380
|
self.format = residue_df.attrs.get("format", "unknown")
|
381
381
|
|
382
|
-
@
|
382
|
+
@property
|
383
383
|
def chain_id(self) -> str:
|
384
384
|
"""Get the chain identifier for this residue."""
|
385
385
|
if self.format == "PDB":
|
@@ -391,7 +391,18 @@ class Residue:
|
|
391
391
|
return self.atoms["label_asym_id"].iloc[0]
|
392
392
|
return ""
|
393
393
|
|
394
|
-
@
|
394
|
+
@chain_id.setter
def chain_id(self, value: str) -> None:
    """Store *value* in the chain identifier column(s) of this residue's atoms."""
    if self.format == "PDB":
        self.atoms["chainID"] = value
        return
    if self.format != "mmCIF":
        return
    # Only columns already present in the frame are overwritten.
    for column in ("auth_asym_id", "label_asym_id"):
        if column in self.atoms.columns:
            self.atoms[column] = value
|
404
|
+
|
405
|
+
@property
|
395
406
|
def residue_number(self) -> int:
|
396
407
|
"""Get the residue sequence number."""
|
397
408
|
if self.format == "PDB":
|
@@ -403,7 +414,18 @@ class Residue:
|
|
403
414
|
return int(self.atoms["label_seq_id"].iloc[0])
|
404
415
|
return 0
|
405
416
|
|
406
|
-
@
|
417
|
+
@residue_number.setter
def residue_number(self, value: int) -> None:
    """Store *value* in the residue number column(s) of this residue's atoms."""
    if self.format == "PDB":
        self.atoms["resSeq"] = value
        return
    if self.format != "mmCIF":
        return
    # Only columns already present in the frame are overwritten.
    for column in ("auth_seq_id", "label_seq_id"):
        if column in self.atoms.columns:
            self.atoms[column] = value
|
427
|
+
|
428
|
+
@property
|
407
429
|
def insertion_code(self) -> Optional[str]:
|
408
430
|
"""Get the insertion code, if any."""
|
409
431
|
if self.format == "PDB":
|
@@ -415,6 +437,15 @@ class Residue:
|
|
415
437
|
return icode if pd.notna(icode) else None
|
416
438
|
return None
|
417
439
|
|
440
|
+
@insertion_code.setter
def insertion_code(self, value: Optional[str]) -> None:
    """Overwrite the insertion code stored on this residue's atoms.

    PDB-format data gets ``iCode`` assigned; mmCIF-format data gets
    ``pdbx_PDB_ins_code`` assigned, but only when that column exists.
    Other formats are left unchanged.
    """
    fmt = self.format
    if fmt == "PDB":
        self.atoms["iCode"] = value
    elif fmt == "mmCIF" and "pdbx_PDB_ins_code" in self.atoms.columns:
        self.atoms["pdbx_PDB_ins_code"] = value
|
448
|
+
|
418
449
|
@cached_property
|
419
450
|
def residue_name(self) -> str:
|
420
451
|
"""Get the residue name (e.g., 'A', 'G', 'C', 'U', etc.)."""
|
@@ -494,10 +525,11 @@ class Residue:
|
|
494
525
|
def __str__(self) -> str:
|
495
526
|
"""String representation of the residue."""
|
496
527
|
# Start with chain ID and residue name
|
497
|
-
|
528
|
+
chain = self.chain_id
|
529
|
+
if chain.isspace() or not chain:
|
498
530
|
builder = f"{self.residue_name}"
|
499
531
|
else:
|
500
|
-
builder = f"{
|
532
|
+
builder = f"{chain}.{self.residue_name}"
|
501
533
|
|
502
534
|
# Add a separator if the residue name ends with a digit
|
503
535
|
if len(self.residue_name) > 0 and self.residue_name[-1] in string.digits:
|
@@ -507,8 +539,9 @@ class Residue:
|
|
507
539
|
builder += f"{self.residue_number}"
|
508
540
|
|
509
541
|
# Add insertion code if present
|
510
|
-
|
511
|
-
|
542
|
+
icode = self.insertion_code
|
543
|
+
if icode is not None:
|
544
|
+
builder += f"^{icode}"
|
512
545
|
|
513
546
|
return builder
|
514
547
|
|
rnapolis/unifier.py
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
import argparse
|
3
|
+
import os
|
4
|
+
import sys
|
5
|
+
from collections import Counter
|
6
|
+
|
7
|
+
import pandas as pd
|
8
|
+
|
9
|
+
from rnapolis.parser import is_cif
|
10
|
+
from rnapolis.parser_v2 import parse_cif_atoms, parse_pdb_atoms, write_cif, write_pdb
|
11
|
+
from rnapolis.tertiary_v2 import Structure
|
12
|
+
|
13
|
+
|
14
|
+
def load_components():
    """Read the bundled component tables for A, C, G and U.

    Each table is loaded with ``pandas.read_csv`` from a file named
    ``component_<letter>.csv`` located in the same directory as this module.

    Returns:
        dict mapping each of "A", "C", "G", "U" to its DataFrame.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    return {
        letter: pd.read_csv(os.path.join(module_dir, f"component_{letter}.csv"))
        for letter in "ACGU"
    }
|
22
|
+
|
23
|
+
|
24
|
+
def _build_atom_tables(components):
    """Derive per-residue-type lookup tables from the component definitions.

    Returns a dict with the same keys as ``components``. Each value is a tuple
    ``(alt_to_standard, heavy_atoms, atom_rank)``: the alternative -> standard
    atom-name mapping, the Series of standard names not starting with "H",
    and each of those names' position in the component table.
    """
    tables = {}
    for name, component in components.items():
        alt_to_standard = dict(zip(component["alt_atom_id"], component["atom_id"]))
        heavy_atoms = component["atom_id"]
        heavy_atoms = heavy_atoms[~heavy_atoms.str.startswith("H")]
        atom_rank = {atom: rank for rank, atom in enumerate(heavy_atoms.tolist())}
        tables[name] = (alt_to_standard, heavy_atoms, atom_rank)
    return tables


def main():
    """Main function to run the unifier tool.

    Parses the command line, loads every input file, and keeps only residues
    whose name is one of the loaded components. For those residues atom names
    are mapped to their standard form, atoms outside the component's
    non-"H" name list are dropped, and the rest are sorted in component order.

    All structures are then compared with the first one. A different residue
    count or a residue-name mismatch aborts with exit status 1. Residue
    positions whose atom count differs from the first structure are removed
    from every structure. Finally, residue identifiers (chain, number,
    insertion code) that disagree between structures are replaced by the most
    common one, and each structure is written to the output directory.
    """
    parser = argparse.ArgumentParser(
        description="Unify content of a set of PDB or mmCIF files."
    )
    parser.add_argument("--output", "-o", help="Output directory", required=True)
    parser.add_argument(
        "--format",
        "-f",
        help="Output format (possible values: PDB, mmCIF, keep. Default: keep)",
        default="keep",
    )
    parser.add_argument("files", nargs="+", help="PDB or mmCIF files to compare")
    args = parser.parse_args()

    components = load_components()
    # Built once here; they depend only on the component, not on the residue.
    atom_tables = _build_atom_tables(components)
    structures = []

    for path in args.files:
        with open(path) as f:
            if is_cif(f):
                atoms = parse_cif_atoms(f)
            else:
                atoms = parse_pdb_atoms(f)

        residues = []

        for residue in Structure(atoms).residues:
            # Exact key lookup. A test against the string "ACGU" would be a
            # substring test and let "", "AC", "CG" or "GU" through, which
            # then fails on the component lookup below.
            if residue.residue_name not in components:
                continue

            alt_to_standard, heavy_atoms, atom_rank = atom_tables[
                residue.residue_name
            ]
            column = "name" if residue.format == "PDB" else "auth_atom_id"

            # Replace alternative name with standard name
            residue.atoms[column] = residue.atoms[column].replace(alt_to_standard)
            # Leave only standard, non-hydrogen atoms
            residue.atoms = residue.atoms[residue.atoms[column].isin(heavy_atoms)]
            # Reorder atoms
            residue.atoms = residue.atoms.sort_values(
                by=[column], key=lambda col: col.map(atom_rank)
            )
            residues.append(residue)

        structures.append((path, residues))

    # nargs="+" guarantees at least one input, so a reference always exists.
    ref_path, ref_residues = structures[0]

    # Accumulated over ALL structures so that a mismatch found in any of them
    # is removed everywhere, keeping residue positions aligned.
    residues_to_remove = set()

    for path, residues in structures:
        # Validity check 1: residue count must be equal
        if len(residues) != len(ref_residues):
            print(
                f"Number of residues in {path} does not match {ref_path}, cannot continue"
            )
            sys.exit(1)

        # Validity check 2: residue names must be equal
        for residue, ref_residue in zip(residues, ref_residues):
            if residue.residue_name != ref_residue.residue_name:
                print(
                    f"Residue {str(residue)} in {path} does not match {str(ref_residue)} in {ref_path}, cannot continue"
                )
                sys.exit(1)

        # Find residues with different number of atoms
        for i, (residue, ref_residue) in enumerate(zip(residues, ref_residues)):
            if len(residue.atoms) != len(ref_residue.atoms):
                print(
                    f"Number of atoms in {str(residue)} in {path} does not match {str(ref_residue)} in {ref_path}, will unify this"
                )
                residues_to_remove.add(i)

    # Remove residues with different number of atoms.
    # Highest index first, so earlier deletions do not shift later ones.
    indices_to_remove = sorted(residues_to_remove, reverse=True)
    for _, residues in structures:
        for i in indices_to_remove:
            del residues[i]

    # Nothing can be written from an empty residue list (neither the format
    # lookup nor pd.concat works), so stop with a message, not a traceback.
    if not structures[0][1]:
        print("No residues left after unification, cannot continue")
        sys.exit(1)

    # Find most common residue identifiers for each residue
    n = len(structures[0][1])
    counters = [Counter() for _ in range(n)]
    for _, residues in structures:
        for i, residue in enumerate(residues):
            counters[i][
                (residue.chain_id, residue.residue_number, residue.insertion_code)
            ] += 1

    # If any residue has different identifiers, use the most common one in all structures
    for i, counter in enumerate(counters):
        (chain_id, residue_number, insertion_code), count = counter.most_common(1)[0]
        if count != len(structures):
            print(
                f"Residue {i + 1} has different identifiers in different structures, will unify this"
            )
            for _, residues in structures:
                residue = residues[i]
                residue.chain_id = chain_id
                residue.residue_number = residue_number
                residue.insertion_code = insertion_code

    # Write output
    os.makedirs(args.output, exist_ok=True)

    for path, residues in structures:
        base, _ = os.path.splitext(os.path.basename(path))

        if args.format == "keep":
            output_format = residues[0].atoms.attrs["format"]
        else:
            output_format = args.format

        ext = ".pdb" if output_format == "PDB" else ".cif"

        # Build the frame before opening the file, so a failure here does not
        # leave an empty output file behind.
        df = pd.concat([residue.atoms for residue in residues])

        with open(os.path.join(args.output, f"{base}{ext}"), "w") as f:
            if output_format == "PDB":
                write_pdb(df, f)
            else:
                write_cif(df, f)
|
150
|
+
|
151
|
+
|
152
|
+
if __name__ == "__main__":
|
153
|
+
main()
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: RNApolis
|
3
|
-
Version: 0.5.0
|
3
|
+
Version: 0.6.0
|
4
4
|
Summary: A Python library containing RNA-related bioinformatics functions and classes
|
5
5
|
Home-page: https://github.com/tzok/rnapolis-py
|
6
6
|
Author: Tomasz Zok
|
@@ -32,6 +32,7 @@ Dynamic: classifier
|
|
32
32
|
Dynamic: description
|
33
33
|
Dynamic: description-content-type
|
34
34
|
Dynamic: home-page
|
35
|
+
Dynamic: license-file
|
35
36
|
Dynamic: project-url
|
36
37
|
Dynamic: requires-dist
|
37
38
|
Dynamic: summary
|
@@ -0,0 +1,26 @@
|
|
1
|
+
rnapolis/aligner.py,sha256=oJ81FrjlEEzqJcYJdZUE1PrPjabIOT7j0idwAHXVQMI,3156
|
2
|
+
rnapolis/annotator.py,sha256=hRRzRmneYxbg2tvwVHMWLfzmJb4szV0JL_6EOC09Gwg,22101
|
3
|
+
rnapolis/clashfinder.py,sha256=AC9_tIx7QIk57sELq_aKfU1u3UMOXbgcccQeGHhMR6c,8517
|
4
|
+
rnapolis/common.py,sha256=LY6Uz96Br8ki_gA8LpfatgtvVbt9jOTkwgagayqTgf8,31251
|
5
|
+
rnapolis/component_A.csv,sha256=koirS-AwUZwoYGItT8yn3wS6Idvmh2FANfTQcOS_xh8,2897
|
6
|
+
rnapolis/component_C.csv,sha256=NtvsAu_YrUgTjzZm3j4poW4IZ99x3dPARB09XVIiMCc,2803
|
7
|
+
rnapolis/component_G.csv,sha256=Z5wl8OnHRyx4XhTyBiWgRZiEvmZXhoxtVRH8bn6Vxf0,2898
|
8
|
+
rnapolis/component_U.csv,sha256=8BUoU1m2YzGmi8_kw1xdpf3pucszHjFEtTex87CuXiE,2645
|
9
|
+
rnapolis/metareader.py,sha256=I1-cXc2YNBPwa3zihAnMTjEsAo79tEKzSmWu5yvN1Pk,2071
|
10
|
+
rnapolis/mmcif_pdbx_v50.dic,sha256=5QFx1ssDaehR4_DQ-tS9VQux262SiLXaqcwmwwejF5c,5744659
|
11
|
+
rnapolis/molecule_filter.py,sha256=jgcpJxx_oXEBX0d30v4k_FdwRouRUPUsEtCYWgLGpD4,7310
|
12
|
+
rnapolis/motif_extractor.py,sha256=Lfn1iEkhkP9eZD3GPEWNAfy00QO7QPCc8wM_XS1ory8,1147
|
13
|
+
rnapolis/parser.py,sha256=3g4mtFvpiEENFcSBBtx_E_x1vJPF9BujWnts0kb9XjE,16340
|
14
|
+
rnapolis/parser_v2.py,sha256=ltesVKBiIKk9JlM02ttTJzLm1g5MHdPzDgQTcl40GP8,16257
|
15
|
+
rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
|
16
|
+
rnapolis/tertiary.py,sha256=6t9ZB4w33-5n_M3sns1RoFXCOTgVAgGH4WDNG5OG9Kg,23426
|
17
|
+
rnapolis/tertiary_v2.py,sha256=I1uyHWIUePNGO5m-suoL4ibtz02qAJUMvYm0BUKUygY,22480
|
18
|
+
rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
|
19
|
+
rnapolis/unifier.py,sha256=bXscX3lxeSxT4K1fm2UEURcU9_0JA0HdTbd8ZoHZFAY,5442
|
20
|
+
rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
|
21
|
+
rnapolis-0.6.0.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
|
22
|
+
rnapolis-0.6.0.dist-info/METADATA,sha256=TcGmjLlYH8jPvWJr48a2ce-UhIIl_dAO_wygm4ZPrKY,54537
|
23
|
+
rnapolis-0.6.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
24
|
+
rnapolis-0.6.0.dist-info/entry_points.txt,sha256=kS_Ji3_6UaomxkOaYpGHh4aZKaIh9CAfzoexbaS3y50,372
|
25
|
+
rnapolis-0.6.0.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
|
26
|
+
rnapolis-0.6.0.dist-info/RECORD,,
|
@@ -1,4 +1,5 @@
|
|
1
1
|
[console_scripts]
|
2
|
+
aligner = rnapolis.aligner:main
|
2
3
|
annotator = rnapolis.annotator:main
|
3
4
|
clashfinder = rnapolis.clashfinder:main
|
4
5
|
metareader = rnapolis.metareader:main
|
@@ -6,3 +7,4 @@ molecule-filter = rnapolis.molecule_filter:main
|
|
6
7
|
motif-extractor = rnapolis.motif_extractor:main
|
7
8
|
rfam-folder = rnapolis.rfam_folder:main
|
8
9
|
transformer = rnapolis.transformer:main
|
10
|
+
unifier = rnapolis.unifier:main
|
rnapolis-0.5.0.dist-info/RECORD
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
rnapolis/annotator.py,sha256=hRRzRmneYxbg2tvwVHMWLfzmJb4szV0JL_6EOC09Gwg,22101
|
2
|
-
rnapolis/clashfinder.py,sha256=i95kp0o6OWNqmJDBr-PbsZd7RY2iJtBDr7QqolJSuAQ,8513
|
3
|
-
rnapolis/common.py,sha256=LY6Uz96Br8ki_gA8LpfatgtvVbt9jOTkwgagayqTgf8,31251
|
4
|
-
rnapolis/metareader.py,sha256=I1-cXc2YNBPwa3zihAnMTjEsAo79tEKzSmWu5yvN1Pk,2071
|
5
|
-
rnapolis/mmcif_pdbx_v50.dic,sha256=5QFx1ssDaehR4_DQ-tS9VQux262SiLXaqcwmwwejF5c,5744659
|
6
|
-
rnapolis/molecule_filter.py,sha256=jgcpJxx_oXEBX0d30v4k_FdwRouRUPUsEtCYWgLGpD4,7310
|
7
|
-
rnapolis/motif_extractor.py,sha256=Lfn1iEkhkP9eZD3GPEWNAfy00QO7QPCc8wM_XS1ory8,1147
|
8
|
-
rnapolis/parser.py,sha256=3g4mtFvpiEENFcSBBtx_E_x1vJPF9BujWnts0kb9XjE,16340
|
9
|
-
rnapolis/parser_v2.py,sha256=L85dRYlh_aOcSvt2ZtRJYFhYa0bwvYgoTQi9kUSqDGQ,5803
|
10
|
-
rnapolis/rfam_folder.py,sha256=SjiiyML_T1__saruFwSMJEoQ7Y55GIU8ktS8ZUn5-fw,11111
|
11
|
-
rnapolis/tertiary.py,sha256=6t9ZB4w33-5n_M3sns1RoFXCOTgVAgGH4WDNG5OG9Kg,23426
|
12
|
-
rnapolis/tertiary_v2.py,sha256=GuTSEtbkMlYks6XA-P8pbLaT4M1cVS1T8gb8zcaGRzQ,21250
|
13
|
-
rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
|
14
|
-
rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
|
15
|
-
rnapolis-0.5.0.dist-info/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
|
16
|
-
rnapolis-0.5.0.dist-info/METADATA,sha256=gq8j-Oln2H84wuzLZNvilJ5m1dPYtvm7vX2cpEunHYg,54515
|
17
|
-
rnapolis-0.5.0.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
|
18
|
-
rnapolis-0.5.0.dist-info/entry_points.txt,sha256=foN2Pn5e-OzEz0fFmNoX6PnFSZFQntOlY8LbognP5F0,308
|
19
|
-
rnapolis-0.5.0.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
|
20
|
-
rnapolis-0.5.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|