RNApolis 0.4.17__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,651 @@
1
+ import string
2
+ from functools import cached_property
3
+ from typing import List, Optional
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ # Constants
+ # Reference O-P covalent bond length. Residue.is_connected treats an O3'/P pair
+ # closer than 1.5 times this value as bonded.
+ # NOTE(review): presumably expressed in angstroms, like the atom coordinates - confirm.
9
+ AVERAGE_OXYGEN_PHOSPHORUS_DISTANCE_COVALENT = 1.6
10
+
11
+
12
def calculate_torsion_angle(
    a1: np.ndarray, a2: np.ndarray, a3: np.ndarray, a4: np.ndarray
) -> float:
    """
    Calculate the signed torsion (dihedral) angle defined by four points.

    The sign follows the IUPAC convention: looking along the central bond
    from ``a2`` towards ``a3``, the angle is positive when the ``a2 -> a1``
    bond has to be rotated clockwise to eclipse the ``a3 -> a4`` bond.

    Parameters:
    -----------
    a1, a2, a3, a4 : np.ndarray
        3D coordinates of the four atoms (any array-like of three numbers)

    Returns:
    --------
    float
        Torsion angle in radians, in the range [-pi, pi]. NaN is returned
        when either triple of consecutive points is (nearly) collinear,
        because the angle is undefined in that case.
    """
    a1, a2, a3, a4 = (np.asarray(p, dtype=float) for p in (a1, a2, a3, a4))

    # Bond vectors along the chain a1 -> a2 -> a3 -> a4
    v1 = a2 - a1
    v2 = a3 - a2
    v3 = a4 - a3

    # Normals of the planes (a1, a2, a3) and (a2, a3, a4)
    n1 = np.cross(v1, v2)
    n2 = np.cross(v2, v3)
    n1_norm = np.linalg.norm(n1)
    n2_norm = np.linalg.norm(n2)

    # A vanishing normal means three consecutive points are collinear
    if n1_norm < 1e-6 or n2_norm < 1e-6:
        return float("nan")

    n1 = n1 / n1_norm
    n2 = n2 / n2_norm

    # n1 is non-zero here, hence v2 is non-zero and can be normalized safely
    v2_unit = v2 / np.linalg.norm(v2)

    # cos(angle) = n1 . n2 and sin(angle) = (n1 x n2) . v2_unit; the latter is
    # proportional to v1 . (v2 x v3), which carries the IUPAC sign.
    x = np.dot(n1, n2)
    y = np.dot(np.cross(n1, n2), v2_unit)

    return float(np.arctan2(y, x))
57
+
58
+
59
class Structure:
    """
    A class representing a molecular structure parsed from PDB or mmCIF format.

    This class takes a DataFrame created by parser_v2 functions and provides
    methods to access and manipulate the structure data.
    """

    # Columns identifying the residue in every row of the torsion angle table
    _RESIDUE_ID_COLUMNS = (
        "chain_id",
        "residue_number",
        "insertion_code",
        "residue_name",
    )

    # Torsion angle columns, in output order
    _TORSION_COLUMNS = ("alpha", "beta", "gamma", "delta", "epsilon", "zeta", "chi")

    # Backbone torsions as (atom name, residue offset) quadruples; offset -1 is
    # the previous residue of the segment and +1 the next one.
    _BACKBONE_TORSIONS = {
        "alpha": (("O3'", -1), ("P", 0), ("O5'", 0), ("C5'", 0)),
        "beta": (("P", 0), ("O5'", 0), ("C5'", 0), ("C4'", 0)),
        "gamma": (("O5'", 0), ("C5'", 0), ("C4'", 0), ("C3'", 0)),
        "delta": (("C5'", 0), ("C4'", 0), ("C3'", 0), ("O3'", 0)),
        "epsilon": (("C4'", 0), ("C3'", 0), ("O3'", 0), ("P", 1)),
        "zeta": (("C3'", 0), ("O3'", 0), ("P", 1), ("O5'", 1)),
    }

    # Base atoms closing the chi torsion (O4'-C1'-N9-C4 for purines,
    # O4'-C1'-N1-C2 for pyrimidines), keyed by residue name.
    _CHI_BASE_ATOMS = {
        "A": ("N9", "C4"),
        "G": ("N9", "C4"),
        "DA": ("N9", "C4"),
        "DG": ("N9", "C4"),
        "C": ("N1", "C2"),
        "U": ("N1", "C2"),
        "T": ("N1", "C2"),
        "DC": ("N1", "C2"),
        "DT": ("N1", "C2"),
    }

    def __init__(self, atoms: pd.DataFrame):
        """
        Initialize a Structure object with atom data.

        Parameters:
        -----------
        atoms : pd.DataFrame
            DataFrame containing atom data, as created by parse_pdb_atoms or
            parse_cif_atoms. The input format is read from
            ``atoms.attrs["format"]`` and defaults to "unknown" when absent.
        """
        self.atoms = atoms
        self.format = atoms.attrs.get("format", "unknown")

    @cached_property
    def residues(self) -> List["Residue"]:
        """
        Group atoms by residue and return a list of Residue objects.

        The grouping logic depends on the format of the input data:
        - For PDB: group by (chainID, resSeq, iCode), using only the columns
          that are present
        - For mmCIF: group by (auth_asym_id, auth_seq_id) when both columns
          exist, otherwise by (label_asym_id, label_seq_id); in either case
          pdbx_PDB_ins_code is added to the key when present

        Returns:
        --------
        List[Residue]
            One Residue per group, in the iteration order of the pandas
            groupby. An empty list is returned for unknown formats.
        """
        columns = self.atoms.columns

        if self.format == "PDB":
            groupby_cols = [
                col for col in ("chainID", "resSeq", "iCode") if col in columns
            ]
        elif self.format == "mmCIF":
            if "auth_asym_id" in columns and "auth_seq_id" in columns:
                groupby_cols = ["auth_asym_id", "auth_seq_id"]
            else:
                groupby_cols = ["label_asym_id", "label_seq_id"]
            if "pdbx_PDB_ins_code" in columns:
                groupby_cols.append("pdbx_PDB_ins_code")
        else:
            return []

        # dropna=False keeps residues whose insertion code is missing (NaN)
        grouped = self.atoms.groupby(groupby_cols, dropna=False, observed=False)

        residues = []
        for _, group in grouped:
            # Work on a copy so that Residue setters never touch self.atoms
            residue_df = group.copy()
            # Set the format explicitly so the Residue can always read it
            residue_df.attrs["format"] = self.format
            residues.append(Residue(residue_df))

        return residues

    @cached_property
    def connected_residues(self) -> List[List["Residue"]]:
        """
        Find segments of connected residues in the structure.

        Residues are grouped by chain, sorted by (residue number, insertion
        code) and split wherever ``Residue.is_connected`` reports a break.

        Returns:
        --------
        List[List[Residue]]
            List of segments, where each segment is a list of at least two
            consecutively connected residues
        """
        residues_by_chain = {}
        for residue in self.residues:
            residues_by_chain.setdefault(residue.chain_id, []).append(residue)

        segments = []
        for chain_residues in residues_by_chain.values():
            chain_residues.sort(
                key=lambda r: (r.residue_number, r.insertion_code or "")
            )

            current_segment = []
            for residue in chain_residues:
                if current_segment and not current_segment[-1].is_connected(residue):
                    # Chain break: keep the finished segment only if it has
                    # at least 2 residues, then start over.
                    if len(current_segment) > 1:
                        segments.append(current_segment)
                    current_segment = []
                current_segment.append(residue)

            if len(current_segment) > 1:
                segments.append(current_segment)

        return segments

    @cached_property
    def torsion_angles(self) -> pd.DataFrame:
        """
        Calculate torsion angles for all connected residues in the structure.

        Returns:
        --------
        pd.DataFrame
            One row per residue of every connected segment, with the columns
            chain_id, residue_number, insertion_code, residue_name, alpha,
            beta, gamma, delta, epsilon, zeta and chi. Angles are in radians;
            angles that cannot be computed are left missing (None/NaN).
        """
        columns = list(self._RESIDUE_ID_COLUMNS + self._TORSION_COLUMNS)
        data = []

        for segment in self.connected_residues:
            last_index = len(segment) - 1

            for i, residue in enumerate(segment):
                row = {
                    "chain_id": residue.chain_id,
                    "residue_number": residue.residue_number,
                    "insertion_code": residue.insertion_code,
                    "residue_name": residue.residue_name,
                }

                for angle_name, atoms_def in self._BACKBONE_TORSIONS.items():
                    # alpha needs O3' of the previous residue, which the first
                    # residue of a segment does not have
                    if angle_name == "alpha" and i == 0:
                        continue
                    # epsilon and zeta need atoms of the next residue, which
                    # the last residue of a segment does not have
                    if angle_name in ("epsilon", "zeta") and i == last_index:
                        continue
                    row[angle_name] = self._backbone_torsion(segment, i, atoms_def)

                chi = self._chi_torsion(residue)
                if chi is not None:
                    row["chi"] = chi

                data.append(row)

        if not data:
            # Return empty DataFrame with correct columns
            return pd.DataFrame(columns=columns)

        df = pd.DataFrame(data)

        # An angle that was never stored in any row has no column yet
        for angle in self._TORSION_COLUMNS:
            if angle not in df.columns:
                df[angle] = None

        return df[columns]

    def _backbone_torsion(self, segment, index, atoms_def) -> Optional[float]:
        """Return one backbone torsion of segment[index], or None if an atom is missing."""
        coordinates = []
        for atom_name, offset in atoms_def:
            res_idx = index + offset
            if not 0 <= res_idx < len(segment):
                return None
            atom = segment[res_idx].find_atom(atom_name)
            if atom is None:
                return None
            coordinates.append(atom.coordinates)
        return calculate_torsion_angle(*coordinates)

    def _chi_torsion(self, residue) -> Optional[float]:
        """Return the chi torsion of a residue, or None for unknown bases or missing atoms."""
        base_atoms = self._CHI_BASE_ATOMS.get(residue.residue_name)
        if base_atoms is None:
            return None
        atoms = [residue.find_atom(name) for name in ("O4'", "C1'") + base_atoms]
        if any(atom is None for atom in atoms):
            return None
        return calculate_torsion_angle(*(atom.coordinates for atom in atoms))
360
+
361
+
362
class Residue:
    """
    A class representing a single residue in a molecular structure.

    This class encapsulates a DataFrame containing atoms belonging to a single residue
    and provides methods to access residue properties.
    """

    def __init__(self, residue_df: pd.DataFrame):
        """
        Initialize a Residue object with atom data for a single residue.

        Parameters:
        -----------
        residue_df : pd.DataFrame
            DataFrame containing atom data for a single residue. The format
            is read from ``residue_df.attrs["format"]`` and defaults to
            "unknown" when absent.
        """
        self.atoms = residue_df
        self.format = residue_df.attrs.get("format", "unknown")

    @property
    def chain_id(self) -> str:
        """Get the chain identifier for this residue ("" for unknown formats)."""
        if self.format == "PDB":
            return self.atoms["chainID"].iloc[0]
        if self.format == "mmCIF":
            if "auth_asym_id" in self.atoms.columns:
                return self.atoms["auth_asym_id"].iloc[0]
            return self.atoms["label_asym_id"].iloc[0]
        return ""

    @chain_id.setter
    def chain_id(self, value: str) -> None:
        """Set the chain identifier for this residue."""
        if self.format == "PDB":
            self.atoms["chainID"] = value
        elif self.format == "mmCIF":
            # Keep the auth_* and label_* identifiers in sync when both exist
            for column in ("auth_asym_id", "label_asym_id"):
                if column in self.atoms.columns:
                    self.atoms[column] = value

    @property
    def residue_number(self) -> int:
        """Get the residue sequence number (0 for unknown formats)."""
        if self.format == "PDB":
            return int(self.atoms["resSeq"].iloc[0])
        if self.format == "mmCIF":
            if "auth_seq_id" in self.atoms.columns:
                return int(self.atoms["auth_seq_id"].iloc[0])
            return int(self.atoms["label_seq_id"].iloc[0])
        return 0

    @residue_number.setter
    def residue_number(self, value: int) -> None:
        """Set the residue sequence number."""
        if self.format == "PDB":
            self.atoms["resSeq"] = value
        elif self.format == "mmCIF":
            # Keep the auth_* and label_* numbers in sync when both exist
            for column in ("auth_seq_id", "label_seq_id"):
                if column in self.atoms.columns:
                    self.atoms[column] = value

    @property
    def insertion_code(self) -> Optional[str]:
        """
        Get the insertion code, if any.

        Returns None when the format is unknown, when the insertion code
        column is absent, or when the stored value is missing (NaN/None).
        """
        if self.format == "PDB":
            column = "iCode"
        elif self.format == "mmCIF":
            column = "pdbx_PDB_ins_code"
        else:
            return None

        # Structure.residues tolerates frames without an insertion code
        # column, so its absence must not raise here either.
        if column not in self.atoms.columns:
            return None

        icode = self.atoms[column].iloc[0]
        return icode if pd.notna(icode) else None

    @insertion_code.setter
    def insertion_code(self, value: Optional[str]) -> None:
        """Set the insertion code."""
        if self.format == "PDB":
            self.atoms["iCode"] = value
        elif self.format == "mmCIF":
            if "pdbx_PDB_ins_code" in self.atoms.columns:
                self.atoms["pdbx_PDB_ins_code"] = value

    @cached_property
    def residue_name(self) -> str:
        """Get the residue name (e.g., 'A', 'G', 'C', 'U', etc.)."""
        if self.format == "PDB":
            return self.atoms["resName"].iloc[0]
        if self.format == "mmCIF":
            if "auth_comp_id" in self.atoms.columns:
                return self.atoms["auth_comp_id"].iloc[0]
            return self.atoms["label_comp_id"].iloc[0]
        return ""

    @cached_property
    def atoms_list(self) -> List["Atom"]:
        """Get a list of all atoms in this residue, one Atom per row."""
        return [Atom(self.atoms.iloc[i], self.format) for i in range(len(self.atoms))]

    def _atom_name_column(self) -> Optional[str]:
        """Return the column holding atom names, or None for unknown formats."""
        if self.format == "PDB":
            return "name"
        if self.format == "mmCIF":
            if "auth_atom_id" in self.atoms.columns:
                return "auth_atom_id"
            return "label_atom_id"
        return None

    def find_atom(self, atom_name: str) -> Optional["Atom"]:
        """
        Find an atom by name in this residue.

        Parameters:
        -----------
        atom_name : str
            Name of the atom to find

        Returns:
        --------
        Optional[Atom]
            The first atom whose name matches exactly, or None if not found
            (or if the format is unknown)
        """
        column = self._atom_name_column()
        if column is None:
            return None

        matches = self.atoms[self.atoms[column] == atom_name]
        if len(matches) > 0:
            return Atom(matches.iloc[0], self.format)
        return None

    def is_connected(self, next_residue_candidate: "Residue") -> bool:
        """
        Check if this residue is connected to the next residue candidate.

        The connection is determined by the distance between the O3' atom of this residue
        and the P atom of the next residue. If the distance is less than 1.5 times the
        average O-P covalent bond distance, the residues are considered connected.

        Parameters:
        -----------
        next_residue_candidate : Residue
            The residue to check for connection

        Returns:
        --------
        bool
            True if the residues are connected, False otherwise (including
            when either of the two atoms is missing)
        """
        o3p = self.find_atom("O3'")
        p = next_residue_candidate.find_atom("P")

        if o3p is None or p is None:
            return False

        distance = np.linalg.norm(o3p.coordinates - p.coordinates).item()
        return distance < 1.5 * AVERAGE_OXYGEN_PHOSPHORUS_DISTANCE_COVALENT

    def __str__(self) -> str:
        """
        String representation of the residue.

        The layout is ``[chain.]name[/]number[^icode]``: the chain prefix is
        dropped for empty or blank chain identifiers, and "/" separates the
        name from the number only when the name ends with a digit.
        """
        chain = self.chain_id
        if not chain or chain.isspace():
            builder = f"{self.residue_name}"
        else:
            builder = f"{chain}.{self.residue_name}"

        # Without the separator a name like "A23" would run into the number
        if len(self.residue_name) > 0 and self.residue_name[-1] in string.digits:
            builder += "/"

        builder += f"{self.residue_number}"

        icode = self.insertion_code
        if icode is not None:
            builder += f"^{icode}"

        return builder

    def __repr__(self) -> str:
        """Detailed string representation of the residue."""
        return f"Residue({self.__str__()}, {len(self.atoms)} atoms)"
551
+
552
+
553
class Atom:
    """
    A single atom of a molecular structure.

    Wraps one row of atom data (a pandas Series) together with the name of
    the format it came from, and exposes the atom's properties through
    format-aware accessors.
    """

    def __init__(self, atom_data: pd.Series, format: str):
        """
        Store the atom record and its source format.

        Parameters:
        -----------
        atom_data : pd.Series
            Series containing data for a single atom
        format : str
            Format of the data ('PDB' or 'mmCIF')
        """
        self.format = format
        self.data = atom_data

    @cached_property
    def name(self) -> str:
        """Atom name; an empty string for unrecognised formats."""
        if self.format == "mmCIF":
            key = "auth_atom_id" if "auth_atom_id" in self.data else "label_atom_id"
            return self.data[key]
        if self.format == "PDB":
            return self.data["name"]
        return ""

    @cached_property
    def element(self) -> str:
        """Element symbol; an empty string when it is not available."""
        if self.format == "PDB":
            return self.data["element"]
        if self.format == "mmCIF" and "type_symbol" in self.data:
            return self.data["type_symbol"]
        return ""

    @cached_property
    def coordinates(self) -> np.ndarray:
        """3D position of the atom; the origin for unrecognised formats."""
        axis_keys = {
            "PDB": ("x", "y", "z"),
            "mmCIF": ("Cartn_x", "Cartn_y", "Cartn_z"),
        }.get(self.format)
        if axis_keys is None:
            return np.array([0.0, 0.0, 0.0])
        return np.array([self.data[key] for key in axis_keys])

    @cached_property
    def occupancy(self) -> float:
        """Occupancy value; 1.0 when it is missing or not available."""
        has_field = self.format == "PDB" or (
            self.format == "mmCIF" and "occupancy" in self.data
        )
        if has_field:
            raw = self.data["occupancy"]
            if pd.notna(raw):
                return float(raw)
        return 1.0

    @cached_property
    def temperature_factor(self) -> float:
        """Temperature factor (B-factor); 0.0 when it is missing or not available."""
        if self.format == "PDB":
            raw = self.data["tempFactor"]
        elif self.format == "mmCIF" and "B_iso_or_equiv" in self.data:
            raw = self.data["B_iso_or_equiv"]
        else:
            return 0.0
        return float(raw) if pd.notna(raw) else 0.0

    def __str__(self) -> str:
        """Short form: atom name followed by the element in parentheses."""
        return f"{self.name} ({self.element})"

    def __repr__(self) -> str:
        """Detailed form: name, element and coordinates with three decimals."""
        x, y, z = self.coordinates
        return f"Atom({self.name}, {self.element}, [{x:.3f}, {y:.3f}, {z:.3f}])"