RNApolis 0.4.17.tar.gz → 0.5.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {rnapolis-0.4.17/src/RNApolis.egg-info → rnapolis-0.5.0}/PKG-INFO +1 -1
  2. {rnapolis-0.4.17 → rnapolis-0.5.0}/setup.py +1 -1
  3. {rnapolis-0.4.17 → rnapolis-0.5.0/src/RNApolis.egg-info}/PKG-INFO +1 -1
  4. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/RNApolis.egg-info/SOURCES.txt +4 -1
  5. rnapolis-0.5.0/src/rnapolis/parser_v2.py +202 -0
  6. rnapolis-0.5.0/src/rnapolis/tertiary_v2.py +618 -0
  7. rnapolis-0.5.0/tests/test_v2.py +237 -0
  8. {rnapolis-0.4.17 → rnapolis-0.5.0}/LICENSE +0 -0
  9. {rnapolis-0.4.17 → rnapolis-0.5.0}/README.md +0 -0
  10. {rnapolis-0.4.17 → rnapolis-0.5.0}/pyproject.toml +0 -0
  11. {rnapolis-0.4.17 → rnapolis-0.5.0}/setup.cfg +0 -0
  12. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/RNApolis.egg-info/dependency_links.txt +0 -0
  13. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/RNApolis.egg-info/entry_points.txt +0 -0
  14. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/RNApolis.egg-info/requires.txt +0 -0
  15. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/RNApolis.egg-info/top_level.txt +0 -0
  16. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/annotator.py +0 -0
  17. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/clashfinder.py +0 -0
  18. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/common.py +0 -0
  19. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/metareader.py +0 -0
  20. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/mmcif_pdbx_v50.dic +0 -0
  21. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/molecule_filter.py +0 -0
  22. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/motif_extractor.py +0 -0
  23. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/parser.py +0 -0
  24. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/rfam_folder.py +0 -0
  25. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/tertiary.py +0 -0
  26. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/transformer.py +0 -0
  27. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/util.py +0 -0
  28. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_annotator.py +0 -0
  29. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_bugfixes.py +0 -0
  30. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_common.py +0 -0
  31. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_metareader.py +0 -0
  32. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_molecule_filter.py +0 -0
  33. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_parser.py +0 -0
  34. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_quadruplexes.py +0 -0
  35. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_rfam_folder.py +0 -0
  36. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_tertiary.py +0 -0
  37. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_transformer.py +0 -0

{rnapolis-0.4.17/src/RNApolis.egg-info → rnapolis-0.5.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: RNApolis
-Version: 0.4.17
+Version: 0.5.0
 Summary: A Python library containing RNA-related bioinformatics functions and classes
 Home-page: https://github.com/tzok/rnapolis-py
 Author: Tomasz Zok

{rnapolis-0.4.17 → rnapolis-0.5.0}/setup.py
@@ -5,7 +5,7 @@ with open("README.md") as f:
 
 setup(
     name="RNApolis",
-    version="0.4.17",
+    version="0.5.0",
     packages=["rnapolis"],
     package_dir={"": "src"},
     author="Tomasz Zok",

{rnapolis-0.4.17 → rnapolis-0.5.0/src/RNApolis.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: RNApolis
-Version: 0.4.17
+Version: 0.5.0
 Summary: A Python library containing RNA-related bioinformatics functions and classes
 Home-page: https://github.com/tzok/rnapolis-py
 Author: Tomasz Zok

{rnapolis-0.4.17 → rnapolis-0.5.0}/src/RNApolis.egg-info/SOURCES.txt
@@ -16,8 +16,10 @@ src/rnapolis/mmcif_pdbx_v50.dic
 src/rnapolis/molecule_filter.py
 src/rnapolis/motif_extractor.py
 src/rnapolis/parser.py
+src/rnapolis/parser_v2.py
 src/rnapolis/rfam_folder.py
 src/rnapolis/tertiary.py
+src/rnapolis/tertiary_v2.py
 src/rnapolis/transformer.py
 src/rnapolis/util.py
 tests/test_annotator.py
@@ -29,4 +31,5 @@ tests/test_parser.py
 tests/test_quadruplexes.py
 tests/test_rfam_folder.py
 tests/test_tertiary.py
-tests/test_transformer.py
+tests/test_transformer.py
+tests/test_v2.py

rnapolis-0.5.0/src/rnapolis/parser_v2.py
@@ -0,0 +1,202 @@
+from typing import IO, Union
+
+import pandas as pd
+from mmcif.io.IoAdapterPy import IoAdapterPy
+
+
+def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
+    """
+    Parse PDB file content and extract ATOM and HETATM records into a pandas DataFrame.
+
+    Parameters:
+    -----------
+    content : Union[str, IO[str]]
+        Content of a PDB file as a string or file-like object
+
+    Returns:
+    --------
+    pd.DataFrame
+        DataFrame containing parsed ATOM and HETATM records with columns corresponding to PDB format
+    """
+    records = []
+
+    # Handle both string content and file-like objects
+    if isinstance(content, str):
+        lines = content.splitlines()
+    else:
+        # Read all lines from the file-like object
+        content.seek(0)  # Ensure we're at the beginning of the file
+        lines = content.readlines()
+        # Convert bytes to string if needed
+        if isinstance(lines[0], bytes):
+            lines = [line.decode("utf-8") for line in lines]
+
+    for line in lines:
+        record_type = line[:6].strip()
+
+        # Only process ATOM and HETATM records
+        if record_type not in ["ATOM", "HETATM"]:
+            continue
+
+        # Parse fields according to PDB format specification
+        icode = line[26:27].strip()
+        record = {
+            "record_type": record_type,
+            "serial": line[6:11].strip(),
+            "name": line[12:16].strip(),
+            "altLoc": line[16:17].strip(),
+            "resName": line[17:20].strip(),
+            "chainID": line[21:22].strip(),
+            "resSeq": line[22:26].strip(),
+            "iCode": None if not icode else icode,  # Convert empty string to None
+            "x": line[30:38].strip(),
+            "y": line[38:46].strip(),
+            "z": line[46:54].strip(),
+            "occupancy": line[54:60].strip(),
+            "tempFactor": line[60:66].strip(),
+            "element": line[76:78].strip(),
+            "charge": line[78:80].strip(),
+        }
+
+        records.append(record)
+
+    # Create DataFrame from records
+    if not records:
+        # Return empty DataFrame with correct columns if no records found
+        return pd.DataFrame(
+            columns=[
+                "record_type",
+                "serial",
+                "name",
+                "altLoc",
+                "resName",
+                "chainID",
+                "resSeq",
+                "iCode",
+                "x",
+                "y",
+                "z",
+                "occupancy",
+                "tempFactor",
+                "element",
+                "charge",
+            ]
+        )
+
+    df = pd.DataFrame(records)
+
+    # Convert numeric columns to appropriate types
+    numeric_columns = ["serial", "resSeq", "x", "y", "z", "occupancy", "tempFactor"]
+    for col in numeric_columns:
+        df[col] = pd.to_numeric(df[col], errors="coerce")
+
+    # Convert categorical columns
+    categorical_columns = [
+        "record_type",
+        "name",
+        "altLoc",
+        "resName",
+        "chainID",
+        "element",
+        "charge",
+    ]
+    for col in categorical_columns:
+        df[col] = df[col].astype("category")
+
+    # Add format attribute to the DataFrame
+    df.attrs["format"] = "PDB"
+
+    return df
+
+
+def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
+    """
+    Parse mmCIF file content and extract atom_site records into a pandas DataFrame.
+
+    Parameters:
+    -----------
+    content : Union[str, IO[str]]
+        Content of a mmCIF file as a string or file-like object
+
+    Returns:
+    --------
+    pd.DataFrame
+        DataFrame containing parsed atom_site records with columns corresponding to mmCIF format
+    """
+    adapter = IoAdapterPy()
+
+    # Handle both string content and file-like objects
+    if isinstance(content, str):
+        # Create a temporary file to use with the adapter
+        import tempfile
+
+        with tempfile.NamedTemporaryFile(mode="w+", suffix=".cif") as temp_file:
+            temp_file.write(content)
+            temp_file.flush()
+            data = adapter.readFile(temp_file.name)
+    else:
+        # Assume it's a file-like object with a name attribute
+        data = adapter.readFile(content.name)
+
+    # Get the atom_site category
+    category = data[0].getObj("atom_site")
+
+    if not category:
+        # Return empty DataFrame if no atom_site category found
+        return pd.DataFrame()
+
+    # Extract attribute names and data rows
+    attributes = category.getAttributeList()
+    rows = category.getRowList()
+
+    # Create a list of dictionaries for each atom
+    records = []
+    for row in rows:
+        record = dict(zip(attributes, row))
+
+        # Convert "?" or "." in insertion code to None
+        if "pdbx_PDB_ins_code" in record:
+            if record["pdbx_PDB_ins_code"] in ["?", ".", ""]:
+                record["pdbx_PDB_ins_code"] = None
+
+        records.append(record)
+
+    # Create DataFrame from records
+    df = pd.DataFrame(records)
+
+    # Convert numeric columns to appropriate types
+    numeric_columns = [
+        "id",
+        "auth_seq_id",
+        "Cartn_x",
+        "Cartn_y",
+        "Cartn_z",
+        "occupancy",
+        "B_iso_or_equiv",
+        "pdbx_formal_charge",
+    ]
+
+    for col in numeric_columns:
+        if col in df.columns:
+            df[col] = pd.to_numeric(df[col], errors="coerce")
+
+    # Convert categorical columns
+    categorical_columns = [
+        "group_PDB",
+        "type_symbol",
+        "label_atom_id",
+        "label_comp_id",
+        "label_asym_id",
+        "auth_atom_id",
+        "auth_comp_id",
+        "auth_asym_id",
+    ]
+
+    for col in categorical_columns:
+        if col in df.columns:
+            df[col] = df[col].astype("category")
+
+    # Add format attribute to the DataFrame
+    df.attrs["format"] = "mmCIF"
+
+    return df
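
Both functions above return a plain pandas DataFrame whose attrs["format"] field records the source format ("PDB" or "mmCIF"). A minimal usage sketch of the new parser, not part of the diff; the path "example.pdb" is a hypothetical placeholder:

    from rnapolis.parser_v2 import parse_pdb_atoms

    # Hypothetical input path; any PDB file works the same way
    with open("example.pdb") as f:
        atoms = parse_pdb_atoms(f)

    print(atoms.attrs["format"])  # "PDB"
    print(atoms[["name", "resName", "chainID", "resSeq", "x", "y", "z"]].head())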

rnapolis-0.5.0/src/rnapolis/tertiary_v2.py
@@ -0,0 +1,618 @@
+import string
+from functools import cached_property
+from typing import List, Optional
+
+import numpy as np
+import pandas as pd
+
+# Constants
+AVERAGE_OXYGEN_PHOSPHORUS_DISTANCE_COVALENT = 1.6
+
+
+def calculate_torsion_angle(
+    a1: np.ndarray, a2: np.ndarray, a3: np.ndarray, a4: np.ndarray
+) -> float:
+    """
+    Calculate the torsion angle between four points in 3D space.
+
+    Parameters:
+    -----------
+    a1, a2, a3, a4 : np.ndarray
+        3D coordinates of the four atoms
+
+    Returns:
+    --------
+    float
+        Torsion angle in radians
+    """
+    # Calculate vectors between points
+    v1 = a2 - a1
+    v2 = a3 - a2
+    v3 = a4 - a3
+
+    # Calculate normal vectors
+    n1 = np.cross(v1, v2)
+    n2 = np.cross(v2, v3)
+
+    # Normalize normal vectors
+    n1_norm = np.linalg.norm(n1)
+    n2_norm = np.linalg.norm(n2)
+
+    # Check for collinearity
+    if n1_norm < 1e-6 or n2_norm < 1e-6:
+        return float("nan")
+
+    n1 = n1 / n1_norm
+    n2 = n2 / n2_norm
+
+    # Calculate the angle using dot product
+    m1 = np.cross(n1, v2 / np.linalg.norm(v2))
+    x = np.dot(n1, n2)
+    y = np.dot(m1, n2)
+
+    # Return angle in radians
+    angle = np.arctan2(y, x)
+
+    return angle
+
+
+class Structure:
+    """
+    A class representing a molecular structure parsed from PDB or mmCIF format.
+
+    This class takes a DataFrame created by parser_v2 functions and provides
+    methods to access and manipulate the structure data.
+    """
+
+    def __init__(self, atoms: pd.DataFrame):
+        """
+        Initialize a Structure object with atom data.
+
+        Parameters:
+        -----------
+        atoms : pd.DataFrame
+            DataFrame containing atom data, as created by parse_pdb_atoms or parse_cif_atoms
+        """
+        self.atoms = atoms
+        self.format = atoms.attrs.get("format", "unknown")
+
+    @cached_property
+    def residues(self) -> List["Residue"]:
+        """
+        Group atoms by residue and return a list of Residue objects.
+
+        The grouping logic depends on the format of the input data:
+        - For PDB: group by (chainID, resSeq, iCode)
+        - For mmCIF: group by (label_asym_id, label_seq_id) if present,
+          otherwise by (auth_asym_id, auth_seq_id, pdbx_PDB_ins_code)
+
+        Returns:
+        --------
+        List[Residue]
+            List of Residue objects, each representing a single residue
+        """
+        if self.format == "PDB":
+            # Group by chain ID, residue sequence number, and insertion code
+            groupby_cols = ["chainID", "resSeq", "iCode"]
+
+            # Filter out columns that don't exist in the DataFrame
+            groupby_cols = [col for col in groupby_cols if col in self.atoms.columns]
+
+            # Group atoms by residue
+            grouped = self.atoms.groupby(groupby_cols, dropna=False, observed=False)
+
+        elif self.format == "mmCIF":
+            # Prefer auth_* columns if they exist
+            if (
+                "auth_asym_id" in self.atoms.columns
+                and "auth_seq_id" in self.atoms.columns
+            ):
+                groupby_cols = ["auth_asym_id", "auth_seq_id"]
+
+                # Add insertion code if it exists
+                if "pdbx_PDB_ins_code" in self.atoms.columns:
+                    groupby_cols.append("pdbx_PDB_ins_code")
+            else:
+                # Fall back to label_* columns
+                groupby_cols = ["label_asym_id", "label_seq_id"]
+
+                # Add insertion code if it exists
+                if "pdbx_PDB_ins_code" in self.atoms.columns:
+                    groupby_cols.append("pdbx_PDB_ins_code")
+
+            # Group atoms by residue
+            grouped = self.atoms.groupby(groupby_cols, dropna=False, observed=False)
+
+        else:
+            # For unknown formats, return an empty list
+            return []
+
+        # Convert groups to a list of DataFrames
+        residue_dfs = []
+        for _, group in grouped:
+            # Create a copy of the group DataFrame
+            residue_df = group.copy()
+
+            # Preserve the format attribute
+            residue_df.attrs["format"] = self.format
+
+            residue_dfs.append(residue_df)
+
+        # Convert groups to a list of Residue objects
+        residues = []
+        for _, group in grouped:
+            # Create a copy of the group DataFrame
+            residue_df = group.copy()
+
+            # Preserve the format attribute
+            residue_df.attrs["format"] = self.format
+
+            # Create a Residue object
+            residues.append(Residue(residue_df))
+
+        return residues
+
+    @cached_property
+    def connected_residues(self) -> List[List["Residue"]]:
+        """
+        Find segments of connected residues in the structure.
+
+        Returns:
+        --------
+        List[List[Residue]]
+            List of segments, where each segment is a list of connected residues
+        """
+        # Group residues by chain
+        residues_by_chain = {}
+        for residue in self.residues:
+            chain_id = residue.chain_id
+            if chain_id not in residues_by_chain:
+                residues_by_chain[chain_id] = []
+            residues_by_chain[chain_id].append(residue)
+
+        # Sort residues in each chain by residue number
+        for chain_id in residues_by_chain:
+            residues_by_chain[chain_id].sort(
+                key=lambda r: (r.residue_number, r.insertion_code or "")
+            )
+
+        # Find connected segments in each chain
+        segments = []
+        for chain_id, chain_residues in residues_by_chain.items():
+            current_segment = []
+
+            for residue in chain_residues:
+                if not current_segment:
+                    # Start a new segment
+                    current_segment.append(residue)
+                else:
+                    # Check if this residue is connected to the previous one
+                    prev_residue = current_segment[-1]
+                    if prev_residue.is_connected(residue):
+                        current_segment.append(residue)
+                    else:
+                        # End the current segment and start a new one
+                        if (
+                            len(current_segment) > 1
+                        ):  # Only add segments with at least 2 residues
+                            segments.append(current_segment)
+                        current_segment = [residue]
+
+            # Add the last segment if it has at least 2 residues
+            if len(current_segment) > 1:
+                segments.append(current_segment)
+
+        return segments
+
+    @cached_property
+    def torsion_angles(self) -> pd.DataFrame:
+        """
+        Calculate torsion angles for all connected residues in the structure.
+
+        Returns:
+        --------
+        pd.DataFrame
+            DataFrame containing torsion angle values for each residue
+        """
+        # Find connected segments
+        segments = self.connected_residues
+
+        # Prepare data for the DataFrame
+        data = []
+
+        # Define the torsion angles to calculate
+        torsion_definitions = {
+            "alpha": [("O3'", -1), ("P", 0), ("O5'", 0), ("C5'", 0)],
+            "beta": [("P", 0), ("O5'", 0), ("C5'", 0), ("C4'", 0)],
+            "gamma": [("O5'", 0), ("C5'", 0), ("C4'", 0), ("C3'", 0)],
+            "delta": [("C5'", 0), ("C4'", 0), ("C3'", 0), ("O3'", 0)],
+            "epsilon": [("C4'", 0), ("C3'", 0), ("O3'", 0), ("P", 1)],
+            "zeta": [("C3'", 0), ("O3'", 0), ("P", 1), ("O5'", 1)],
+            "chi": None,  # Will be handled separately due to purine/pyrimidine difference
+        }
+
+        # Process each segment
+        for segment in segments:
+            for i, residue in enumerate(segment):
+                # Prepare row data
+                row = {
+                    "chain_id": residue.chain_id,
+                    "residue_number": residue.residue_number,
+                    "insertion_code": residue.insertion_code,
+                    "residue_name": residue.residue_name,
+                }
+
+                # Calculate standard torsion angles
+                for angle_name, atoms_def in torsion_definitions.items():
+                    if angle_name == "chi":
+                        continue  # Skip chi for now
+
+                    if angle_name == "alpha" and i == 0:
+                        continue  # Skip alpha for the second residue
+
+                    if angle_name in ["epsilon", "zeta"] and i == len(segment) - 1:
+                        continue  # Skip epsilon and zeta for the second-to-last residue
+
+                    # Get the atoms for this angle
+                    atoms = []
+                    valid = True
+
+                    for atom_name, offset in atoms_def:
+                        res_idx = i + offset
+                        if 0 <= res_idx < len(segment):
+                            atom = segment[res_idx].find_atom(atom_name)
+                            if atom is not None:
+                                atoms.append(atom.coordinates)
+                            else:
+                                valid = False
+                                break
+                        else:
+                            valid = False
+                            break
+
+                    # Calculate the angle if all atoms were found
+                    if valid and len(atoms) == 4:
+                        angle = calculate_torsion_angle(
+                            atoms[0], atoms[1], atoms[2], atoms[3]
+                        )
+                        row[angle_name] = angle
+                    else:
+                        row[angle_name] = None
+
+                # Calculate chi angle based on residue type
+                # Pyrimidines: O4'-C1'-N1-C2
+                # Purines: O4'-C1'-N9-C4
+                purine_bases = ["A", "G", "DA", "DG"]
+                pyrimidine_bases = ["C", "U", "T", "DC", "DT"]
+
+                o4_prime = residue.find_atom("O4'")
+                c1_prime = residue.find_atom("C1'")
+
+                if o4_prime is not None and c1_prime is not None:
+                    if residue.residue_name in purine_bases:
+                        n9 = residue.find_atom("N9")
+                        c4 = residue.find_atom("C4")
+                        if n9 is not None and c4 is not None:
+                            chi = calculate_torsion_angle(
+                                o4_prime.coordinates,
+                                c1_prime.coordinates,
+                                n9.coordinates,
+                                c4.coordinates,
+                            )
+                            row["chi"] = chi
+                    elif residue.residue_name in pyrimidine_bases:
+                        n1 = residue.find_atom("N1")
+                        c2 = residue.find_atom("C2")
+                        if n1 is not None and c2 is not None:
+                            chi = calculate_torsion_angle(
+                                o4_prime.coordinates,
+                                c1_prime.coordinates,
+                                n1.coordinates,
+                                c2.coordinates,
+                            )
+                            row["chi"] = chi
+
+                data.append(row)
+
+        # Create DataFrame
+        if not data:
+            # Return empty DataFrame with correct columns
+            return pd.DataFrame(
+                columns=[
+                    "chain_id",
+                    "residue_number",
+                    "insertion_code",
+                    "residue_name",
+                    "alpha",
+                    "beta",
+                    "gamma",
+                    "delta",
+                    "epsilon",
+                    "zeta",
+                    "chi",
+                ]
+            )
+
+        df = pd.DataFrame(data)
+
+        # Ensure all angle columns exist
+        for angle in ["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "chi"]:
+            if angle not in df.columns:
+                df[angle] = None
+
+        # Reorder columns to ensure consistent order
+        ordered_columns = [
+            "chain_id",
+            "residue_number",
+            "insertion_code",
+            "residue_name",
+            "alpha",
+            "beta",
+            "gamma",
+            "delta",
+            "epsilon",
+            "zeta",
+            "chi",
+        ]
+        df = df[ordered_columns]
+
+        return df
+
+
+class Residue:
+    """
+    A class representing a single residue in a molecular structure.
+
+    This class encapsulates a DataFrame containing atoms belonging to a single residue
+    and provides methods to access residue properties.
+    """
+
+    def __init__(self, residue_df: pd.DataFrame):
+        """
+        Initialize a Residue object with atom data for a single residue.
+
+        Parameters:
+        -----------
+        residue_df : pd.DataFrame
+            DataFrame containing atom data for a single residue
+        """
+        self.atoms = residue_df
+        self.format = residue_df.attrs.get("format", "unknown")
+
+    @cached_property
+    def chain_id(self) -> str:
+        """Get the chain identifier for this residue."""
+        if self.format == "PDB":
+            return self.atoms["chainID"].iloc[0]
+        elif self.format == "mmCIF":
+            if "auth_asym_id" in self.atoms.columns:
+                return self.atoms["auth_asym_id"].iloc[0]
+            else:
+                return self.atoms["label_asym_id"].iloc[0]
+        return ""
+
+    @cached_property
+    def residue_number(self) -> int:
+        """Get the residue sequence number."""
+        if self.format == "PDB":
+            return int(self.atoms["resSeq"].iloc[0])
+        elif self.format == "mmCIF":
+            if "auth_seq_id" in self.atoms.columns:
+                return int(self.atoms["auth_seq_id"].iloc[0])
+            else:
+                return int(self.atoms["label_seq_id"].iloc[0])
+        return 0
+
+    @cached_property
+    def insertion_code(self) -> Optional[str]:
+        """Get the insertion code, if any."""
+        if self.format == "PDB":
+            icode = self.atoms["iCode"].iloc[0]
+            return icode if pd.notna(icode) else None
+        elif self.format == "mmCIF":
+            if "pdbx_PDB_ins_code" in self.atoms.columns:
+                icode = self.atoms["pdbx_PDB_ins_code"].iloc[0]
+                return icode if pd.notna(icode) else None
+        return None
+
+    @cached_property
+    def residue_name(self) -> str:
+        """Get the residue name (e.g., 'A', 'G', 'C', 'U', etc.)."""
+        if self.format == "PDB":
+            return self.atoms["resName"].iloc[0]
+        elif self.format == "mmCIF":
+            if "auth_comp_id" in self.atoms.columns:
+                return self.atoms["auth_comp_id"].iloc[0]
+            else:
+                return self.atoms["label_comp_id"].iloc[0]
+        return ""
+
+    @cached_property
+    def atoms_list(self) -> List["Atom"]:
+        """Get a list of all atoms in this residue."""
+        return [Atom(self.atoms.iloc[i], self.format) for i in range(len(self.atoms))]
+
+    def find_atom(self, atom_name: str) -> Optional["Atom"]:
+        """
+        Find an atom by name in this residue.
+
+        Parameters:
+        -----------
+        atom_name : str
+            Name of the atom to find
+
+        Returns:
+        --------
+        Optional[Atom]
+            The Atom object, or None if not found
+        """
+        if self.format == "PDB":
+            mask = self.atoms["name"] == atom_name
+            atoms_df = self.atoms[mask]
+            if len(atoms_df) > 0:
+                return Atom(atoms_df.iloc[0], self.format)
+        elif self.format == "mmCIF":
+            if "auth_atom_id" in self.atoms.columns:
+                mask = self.atoms["auth_atom_id"] == atom_name
+                atoms_df = self.atoms[mask]
+                if len(atoms_df) > 0:
+                    return Atom(atoms_df.iloc[0], self.format)
+            else:
+                mask = self.atoms["label_atom_id"] == atom_name
+                atoms_df = self.atoms[mask]
+                if len(atoms_df) > 0:
+                    return Atom(atoms_df.iloc[0], self.format)
+        return None
+
+    def is_connected(self, next_residue_candidate: "Residue") -> bool:
+        """
+        Check if this residue is connected to the next residue candidate.
+
+        The connection is determined by the distance between the O3' atom of this residue
+        and the P atom of the next residue. If the distance is less than 1.5 times the
+        average O-P covalent bond distance, the residues are considered connected.
+
+        Parameters:
+        -----------
+        next_residue_candidate : Residue
+            The residue to check for connection
+
+        Returns:
+        --------
+        bool
+            True if the residues are connected, False otherwise
+        """
+        o3p = self.find_atom("O3'")
+        p = next_residue_candidate.find_atom("P")
+
+        if o3p is not None and p is not None:
+            distance = np.linalg.norm(o3p.coordinates - p.coordinates).item()
+            return distance < 1.5 * AVERAGE_OXYGEN_PHOSPHORUS_DISTANCE_COVALENT
+
+        return False
+
+    def __str__(self) -> str:
+        """String representation of the residue."""
+        # Start with chain ID and residue name
+        if self.chain_id.isspace() or not self.chain_id:
+            builder = f"{self.residue_name}"
+        else:
+            builder = f"{self.chain_id}.{self.residue_name}"
+
+        # Add a separator if the residue name ends with a digit
+        if len(self.residue_name) > 0 and self.residue_name[-1] in string.digits:
+            builder += "/"
+
+        # Add residue number
+        builder += f"{self.residue_number}"
+
+        # Add insertion code if present
+        if self.insertion_code is not None:
+            builder += f"^{self.insertion_code}"
+
+        return builder
+
+    def __repr__(self) -> str:
+        """Detailed string representation of the residue."""
+        return f"Residue({self.__str__()}, {len(self.atoms)} atoms)"
+
+
+class Atom:
+    """
+    A class representing a single atom in a molecular structure.
+
+    This class encapsulates a pandas Series containing data for a single atom
+    and provides methods to access atom properties.
+    """
+
+    def __init__(self, atom_data: pd.Series, format: str):
+        """
+        Initialize an Atom object with atom data.
+
+        Parameters:
+        -----------
+        atom_data : pd.Series
+            Series containing data for a single atom
+        format : str
+            Format of the data ('PDB' or 'mmCIF')
+        """
+        self.data = atom_data
+        self.format = format
+
+    @cached_property
+    def name(self) -> str:
+        """Get the atom name."""
+        if self.format == "PDB":
+            return self.data["name"]
+        elif self.format == "mmCIF":
+            if "auth_atom_id" in self.data:
+                return self.data["auth_atom_id"]
+            else:
+                return self.data["label_atom_id"]
+        return ""
+
+    @cached_property
+    def element(self) -> str:
+        """Get the element symbol."""
+        if self.format == "PDB":
+            return self.data["element"]
+        elif self.format == "mmCIF":
+            if "type_symbol" in self.data:
+                return self.data["type_symbol"]
+        return ""
+
+    @cached_property
+    def coordinates(self) -> np.ndarray:
+        """Get the 3D coordinates of the atom."""
+        if self.format == "PDB":
+            return np.array([self.data["x"], self.data["y"], self.data["z"]])
+        elif self.format == "mmCIF":
+            return np.array(
+                [self.data["Cartn_x"], self.data["Cartn_y"], self.data["Cartn_z"]]
+            )
+        return np.array([0.0, 0.0, 0.0])
+
+    @cached_property
+    def occupancy(self) -> float:
+        """Get the occupancy value."""
+        if self.format == "PDB":
+            return (
+                float(self.data["occupancy"])
+                if pd.notna(self.data["occupancy"])
+                else 1.0
+            )
+        elif self.format == "mmCIF":
+            if "occupancy" in self.data:
+                return (
+                    float(self.data["occupancy"])
+                    if pd.notna(self.data["occupancy"])
+                    else 1.0
+                )
+        return 1.0
+
+    @cached_property
+    def temperature_factor(self) -> float:
+        """Get the temperature factor (B-factor)."""
+        if self.format == "PDB":
+            return (
+                float(self.data["tempFactor"])
+                if pd.notna(self.data["tempFactor"])
+                else 0.0
+            )
+        elif self.format == "mmCIF":
+            if "B_iso_or_equiv" in self.data:
+                return (
+                    float(self.data["B_iso_or_equiv"])
+                    if pd.notna(self.data["B_iso_or_equiv"])
+                    else 0.0
+                )
+        return 0.0
+
+    def __str__(self) -> str:
+        """String representation of the atom."""
+        return f"{self.name} ({self.element})"
+
+    def __repr__(self) -> str:
+        """Detailed string representation of the atom."""
+        coords = self.coordinates
+        return f"Atom({self.name}, {self.element}, [{coords[0]:.3f}, {coords[1]:.3f}, {coords[2]:.3f}])"
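
The new classes consume the DataFrames produced by parser_v2 directly. A minimal sketch of the intended flow, not part of the diff; the path "example.cif" is a hypothetical placeholder:

    from rnapolis.parser_v2 import parse_cif_atoms
    from rnapolis.tertiary_v2 import Structure

    # Hypothetical input path; parse_cif_atoms reads a file-like object via its .name attribute
    with open("example.cif") as f:
        structure = Structure(parse_cif_atoms(f))

    print(len(structure.residues))        # residues grouped by chain, number, insertion code
    print(structure.residues[0])          # formatted by Residue.__str__, e.g. "A.G1"
    angles = structure.torsion_angles     # DataFrame with alpha..zeta and chi, in radians
    print(angles[["chain_id", "residue_number", "chi"]].head())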

rnapolis-0.5.0/tests/test_v2.py
@@ -0,0 +1,237 @@
+import os
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from rnapolis.parser_v2 import parse_cif_atoms, parse_pdb_atoms
+from rnapolis.tertiary_v2 import Structure
+
+
+@pytest.fixture
+def data_dir():
+    """Return the path to the test data directory."""
+    return os.path.join(os.path.dirname(__file__))
+
+
+def test_parse_4qln_formats(data_dir):
+    """Test parsing 4qln in both PDB and mmCIF formats and compare residues and torsion angles."""
+    # Load PDB and mmCIF files
+    pdb_path = os.path.join(data_dir, "4qln.pdb")
+    cif_path = os.path.join(data_dir, "4qln.cif")
+
+    # Skip test if files don't exist
+    if not (os.path.exists(pdb_path) and os.path.exists(cif_path)):
+        pytest.skip(f"Test files not found: {pdb_path} or {cif_path}")
+
+    # Parse both formats
+    with open(pdb_path, "r") as pdb_file:
+        pdb_atoms = parse_pdb_atoms(pdb_file)
+
+    with open(cif_path, "r") as cif_file:
+        cif_atoms = parse_cif_atoms(cif_file)
+
+    # Create structures
+    pdb_structure = Structure(pdb_atoms)
+    cif_structure = Structure(cif_atoms)
+
+    # Get residues
+    pdb_residues = pdb_structure.residues
+    cif_residues = cif_structure.residues
+
+    # Basic checks
+    assert len(pdb_residues) > 0, "No residues found in PDB file"
+    assert len(cif_residues) > 0, "No residues found in mmCIF file"
+
+    # Compare residue counts
+    assert len(pdb_residues) == len(cif_residues), (
+        f"Different number of residues: PDB={len(pdb_residues)}, mmCIF={len(cif_residues)}"
+    )
+
+    # Compare residue identifiers
+    pdb_residue_ids = [
+        (r.chain_id, r.residue_number, r.insertion_code) for r in pdb_residues
+    ]
+    cif_residue_ids = [
+        (r.chain_id, r.residue_number, r.insertion_code) for r in cif_residues
+    ]
+
+    # Sort both lists to ensure consistent ordering
+    pdb_residue_ids.sort()
+    cif_residue_ids.sort()
+
+    # Check if residue identifiers match
+    for i, (pdb_id, cif_id) in enumerate(zip(pdb_residue_ids, cif_residue_ids)):
+        assert pdb_id == cif_id, (
+            f"Residue mismatch at position {i}: PDB={pdb_id}, mmCIF={cif_id}"
+        )
+
+    # Create a mapping from residue ID to residue name for both formats
+    pdb_id_to_name = {
+        (r.chain_id, r.residue_number, r.insertion_code): r.residue_name
+        for r in pdb_residues
+    }
+    cif_id_to_name = {
+        (r.chain_id, r.residue_number, r.insertion_code): r.residue_name
+        for r in cif_residues
+    }
+
+    # Check if residue names match for each residue ID
+    for res_id in pdb_id_to_name:
+        assert res_id in cif_id_to_name, f"Residue ID {res_id} not found in mmCIF"
+        assert pdb_id_to_name[res_id] == cif_id_to_name[res_id], (
+            f"Residue name mismatch for {res_id}: PDB={pdb_id_to_name[res_id]}, mmCIF={cif_id_to_name[res_id]}"
+        )
+
+    # Calculate torsion angles for both structures
+    pdb_torsion_df = pdb_structure.torsion_angles
+    cif_torsion_df = cif_structure.torsion_angles
+
+    # Check if torsion angle DataFrames have the same shape
+    assert pdb_torsion_df.shape == cif_torsion_df.shape, (
+        f"Different torsion angle DataFrame shapes: PDB={pdb_torsion_df.shape}, mmCIF={cif_torsion_df.shape}"
+    )
+
+    # Sort both DataFrames by chain_id, residue_number, and insertion_code for consistent comparison
+    pdb_torsion_df = pdb_torsion_df.sort_values(
+        by=["chain_id", "residue_number", "insertion_code"]
+    ).reset_index(drop=True)
+
+    cif_torsion_df = cif_torsion_df.sort_values(
+        by=["chain_id", "residue_number", "insertion_code"]
+    ).reset_index(drop=True)
+
+    # Compare residue identifiers in torsion angle DataFrames
+    pd.testing.assert_series_equal(
+        pdb_torsion_df["chain_id"],
+        cif_torsion_df["chain_id"],
+        check_names=False,
+        check_dtype=False,
+    )
+    pd.testing.assert_series_equal(
+        pdb_torsion_df["residue_number"],
+        cif_torsion_df["residue_number"],
+        check_names=False,
+        check_dtype=False,
+    )
+    pd.testing.assert_series_equal(
+        pdb_torsion_df["residue_name"],
+        cif_torsion_df["residue_name"],
+        check_names=False,
+        check_dtype=False,
+    )
+
+    # Compare torsion angle values with a tolerance
+    angle_columns = ["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "chi"]
+    for col in angle_columns:
+        # Skip columns that might not exist in both DataFrames
+        if col not in pdb_torsion_df.columns or col not in cif_torsion_df.columns:
+            continue
+
+        # Get non-NaN values that exist in both DataFrames
+        pdb_values = pdb_torsion_df[col]
+        cif_values = cif_torsion_df[col]
+
+        # Check if the same values are NaN in both DataFrames
+        assert pdb_values.isna().equals(cif_values.isna()), (
+            f"Different NaN patterns in {col} angle"
+        )
+
+        # Compare non-NaN values with tolerance
+        mask = ~pdb_values.isna()
+        if mask.any():
+            pdb_non_nan = pdb_values[mask].values
+            cif_non_nan = cif_values[mask].values
+
+            # Allow a small tolerance for floating-point differences
+            np.testing.assert_allclose(
+                pdb_non_nan,
+                cif_non_nan,
+                rtol=1e-5,
+                atol=1e-5,
+                err_msg=f"Torsion angle values for {col} don't match between PDB and mmCIF",
+            )
+
+
+def test_torsion_angle_calculation():
+    """Test the torsion angle calculation function."""
+    # Define four points that form a known torsion angle
+    a1 = np.array([1.0, 0.0, 0.0])
+    a2 = np.array([0.0, 0.0, 0.0])
+    a3 = np.array([0.0, 1.0, 0.0])
+    a4 = np.array([0.0, 1.0, 1.0])
+
+    # Calculate the torsion angle
+    from rnapolis.tertiary_v2 import calculate_torsion_angle
+
+    angle = calculate_torsion_angle(a1, a2, a3, a4)
+
+    # The expected angle is pi/2 radians (90 degrees)
+    assert abs(angle - np.pi / 2) < 1e-6, (
+        f"Expected angle close to pi/2 radians, got {angle}"
+    )
+
+    # Test with collinear points
+    a1 = np.array([0.0, 0.0, 0.0])
+    a2 = np.array([1.0, 0.0, 0.0])
+    a3 = np.array([2.0, 0.0, 0.0])
+    a4 = np.array([3.0, 0.0, 0.0])
+
+    angle = calculate_torsion_angle(a1, a2, a3, a4)
+    assert np.isnan(angle), f"Expected NaN for collinear points, got {angle}"
+
+
+def test_connected_residues_and_torsion_angles(data_dir):
+    """Test finding connected residues and calculating torsion angles."""
+    # Load PDB file
+    pdb_path = os.path.join(data_dir, "4qln.pdb")
+
+    # Skip test if file doesn't exist
+    if not os.path.exists(pdb_path):
+        pytest.skip(f"Test file not found: {pdb_path}")
+
+    # Parse PDB file
+    with open(pdb_path, "r") as pdb_file:
+        pdb_atoms = parse_pdb_atoms(pdb_file)
+
+    # Create structure
+    structure = Structure(pdb_atoms)
+
+    # Find connected residues
+    segments = structure.connected_residues
+
+    # Check that we found at least one segment
+    assert len(segments) > 0, "No connected residue segments found"
+
+    # Check that each segment has at least 2 residues
+    for segment in segments:
+        assert len(segment) >= 2, f"Segment has fewer than 2 residues: {segment}"
+
+    # Calculate torsion angles
+    torsion_df = structure.torsion_angles
+
+    # Check that the DataFrame has the expected columns
+    expected_columns = [
+        "chain_id",
+        "residue_number",
+        "insertion_code",
+        "residue_name",
+        "alpha",
+        "beta",
+        "gamma",
+        "delta",
+        "epsilon",
+        "zeta",
+        "chi",
+    ]
+    for col in expected_columns:
+        assert col in torsion_df.columns, (
+            f"Expected column {col} not found in torsion angles DataFrame"
+        )
+
+    # Check that we have some torsion angle values
+    assert len(torsion_df) > 0, "No torsion angles calculated"
+
+    # Check that at least some angles are not null
+    for angle in ["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "chi"]:
+        assert torsion_df[angle].notna().any(), f"No valid {angle} angles calculated"
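
The tests above expect 4qln.pdb and 4qln.cif to sit next to the test module and skip themselves otherwise; assuming a normal checkout of the repository, they can be run with a standard pytest invocation, e.g. `python -m pytest tests/test_v2.py`.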