RNApolis 0.4.17.tar.gz → 0.5.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {rnapolis-0.4.17/src/RNApolis.egg-info → rnapolis-0.5.0}/PKG-INFO +1 -1
  2. {rnapolis-0.4.17 → rnapolis-0.5.0}/setup.py +1 -1
  3. {rnapolis-0.4.17 → rnapolis-0.5.0/src/RNApolis.egg-info}/PKG-INFO +1 -1
  4. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/RNApolis.egg-info/SOURCES.txt +4 -1
  5. rnapolis-0.5.0/src/rnapolis/parser_v2.py +202 -0
  6. rnapolis-0.5.0/src/rnapolis/tertiary_v2.py +618 -0
  7. rnapolis-0.5.0/tests/test_v2.py +237 -0
  8. {rnapolis-0.4.17 → rnapolis-0.5.0}/LICENSE +0 -0
  9. {rnapolis-0.4.17 → rnapolis-0.5.0}/README.md +0 -0
  10. {rnapolis-0.4.17 → rnapolis-0.5.0}/pyproject.toml +0 -0
  11. {rnapolis-0.4.17 → rnapolis-0.5.0}/setup.cfg +0 -0
  12. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/RNApolis.egg-info/dependency_links.txt +0 -0
  13. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/RNApolis.egg-info/entry_points.txt +0 -0
  14. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/RNApolis.egg-info/requires.txt +0 -0
  15. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/RNApolis.egg-info/top_level.txt +0 -0
  16. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/annotator.py +0 -0
  17. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/clashfinder.py +0 -0
  18. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/common.py +0 -0
  19. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/metareader.py +0 -0
  20. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/mmcif_pdbx_v50.dic +0 -0
  21. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/molecule_filter.py +0 -0
  22. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/motif_extractor.py +0 -0
  23. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/parser.py +0 -0
  24. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/rfam_folder.py +0 -0
  25. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/tertiary.py +0 -0
  26. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/transformer.py +0 -0
  27. {rnapolis-0.4.17 → rnapolis-0.5.0}/src/rnapolis/util.py +0 -0
  28. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_annotator.py +0 -0
  29. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_bugfixes.py +0 -0
  30. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_common.py +0 -0
  31. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_metareader.py +0 -0
  32. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_molecule_filter.py +0 -0
  33. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_parser.py +0 -0
  34. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_quadruplexes.py +0 -0
  35. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_rfam_folder.py +0 -0
  36. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_tertiary.py +0 -0
  37. {rnapolis-0.4.17 → rnapolis-0.5.0}/tests/test_transformer.py +0 -0

{rnapolis-0.4.17/src/RNApolis.egg-info → rnapolis-0.5.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: RNApolis
-Version: 0.4.17
+Version: 0.5.0
 Summary: A Python library containing RNA-related bioinformatics functions and classes
 Home-page: https://github.com/tzok/rnapolis-py
 Author: Tomasz Zok

{rnapolis-0.4.17 → rnapolis-0.5.0}/setup.py
@@ -5,7 +5,7 @@ with open("README.md") as f:
 
 setup(
     name="RNApolis",
-    version="0.4.17",
+    version="0.5.0",
     packages=["rnapolis"],
     package_dir={"": "src"},
     author="Tomasz Zok",

{rnapolis-0.4.17 → rnapolis-0.5.0/src/RNApolis.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: RNApolis
-Version: 0.4.17
+Version: 0.5.0
 Summary: A Python library containing RNA-related bioinformatics functions and classes
 Home-page: https://github.com/tzok/rnapolis-py
 Author: Tomasz Zok

{rnapolis-0.4.17 → rnapolis-0.5.0}/src/RNApolis.egg-info/SOURCES.txt
@@ -16,8 +16,10 @@ src/rnapolis/mmcif_pdbx_v50.dic
 src/rnapolis/molecule_filter.py
 src/rnapolis/motif_extractor.py
 src/rnapolis/parser.py
+src/rnapolis/parser_v2.py
 src/rnapolis/rfam_folder.py
 src/rnapolis/tertiary.py
+src/rnapolis/tertiary_v2.py
 src/rnapolis/transformer.py
 src/rnapolis/util.py
 tests/test_annotator.py
@@ -29,4 +31,5 @@ tests/test_parser.py
 tests/test_quadruplexes.py
 tests/test_rfam_folder.py
 tests/test_tertiary.py
-tests/test_transformer.py
+tests/test_transformer.py
+tests/test_v2.py

rnapolis-0.5.0/src/rnapolis/parser_v2.py
@@ -0,0 +1,202 @@
+from typing import IO, Union
+
+import pandas as pd
+from mmcif.io.IoAdapterPy import IoAdapterPy
+
+
+def parse_pdb_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
+    """
+    Parse PDB file content and extract ATOM and HETATM records into a pandas DataFrame.
+
+    Parameters:
+    -----------
+    content : Union[str, IO[str]]
+        Content of a PDB file as a string or file-like object
+
+    Returns:
+    --------
+    pd.DataFrame
+        DataFrame containing parsed ATOM and HETATM records with columns corresponding to PDB format
+    """
+    records = []
+
+    # Handle both string content and file-like objects
+    if isinstance(content, str):
+        lines = content.splitlines()
+    else:
+        # Read all lines from the file-like object
+        content.seek(0)  # Ensure we're at the beginning of the file
+        lines = content.readlines()
+        # Convert bytes to string if needed
+        if isinstance(lines[0], bytes):
+            lines = [line.decode("utf-8") for line in lines]
+
+    for line in lines:
+        record_type = line[:6].strip()
+
+        # Only process ATOM and HETATM records
+        if record_type not in ["ATOM", "HETATM"]:
+            continue
+
+        # Parse fields according to PDB format specification
+        icode = line[26:27].strip()
+        record = {
+            "record_type": record_type,
+            "serial": line[6:11].strip(),
+            "name": line[12:16].strip(),
+            "altLoc": line[16:17].strip(),
+            "resName": line[17:20].strip(),
+            "chainID": line[21:22].strip(),
+            "resSeq": line[22:26].strip(),
+            "iCode": None if not icode else icode,  # Convert empty string to None
+            "x": line[30:38].strip(),
+            "y": line[38:46].strip(),
+            "z": line[46:54].strip(),
+            "occupancy": line[54:60].strip(),
+            "tempFactor": line[60:66].strip(),
+            "element": line[76:78].strip(),
+            "charge": line[78:80].strip(),
+        }
+
+        records.append(record)
+
+    # Create DataFrame from records
+    if not records:
+        # Return empty DataFrame with correct columns if no records found
+        return pd.DataFrame(
+            columns=[
+                "record_type",
+                "serial",
+                "name",
+                "altLoc",
+                "resName",
+                "chainID",
+                "resSeq",
+                "iCode",
+                "x",
+                "y",
+                "z",
+                "occupancy",
+                "tempFactor",
+                "element",
+                "charge",
+            ]
+        )
+
+    df = pd.DataFrame(records)
+
+    # Convert numeric columns to appropriate types
+    numeric_columns = ["serial", "resSeq", "x", "y", "z", "occupancy", "tempFactor"]
+    for col in numeric_columns:
+        df[col] = pd.to_numeric(df[col], errors="coerce")
+
+    # Convert categorical columns
+    categorical_columns = [
+        "record_type",
+        "name",
+        "altLoc",
+        "resName",
+        "chainID",
+        "element",
+        "charge",
+    ]
+    for col in categorical_columns:
+        df[col] = df[col].astype("category")
+
+    # Add format attribute to the DataFrame
+    df.attrs["format"] = "PDB"
+
+    return df
+
+
+def parse_cif_atoms(content: Union[str, IO[str]]) -> pd.DataFrame:
+    """
+    Parse mmCIF file content and extract atom_site records into a pandas DataFrame.
+
+    Parameters:
+    -----------
+    content : Union[str, IO[str]]
+        Content of a mmCIF file as a string or file-like object
+
+    Returns:
+    --------
+    pd.DataFrame
+        DataFrame containing parsed atom_site records with columns corresponding to mmCIF format
+    """
+    adapter = IoAdapterPy()
+
+    # Handle both string content and file-like objects
+    if isinstance(content, str):
+        # Create a temporary file to use with the adapter
+        import tempfile
+
+        with tempfile.NamedTemporaryFile(mode="w+", suffix=".cif") as temp_file:
+            temp_file.write(content)
+            temp_file.flush()
+            data = adapter.readFile(temp_file.name)
+    else:
+        # Assume it's a file-like object with a name attribute
+        data = adapter.readFile(content.name)
+
+    # Get the atom_site category
+    category = data[0].getObj("atom_site")
+
+    if not category:
+        # Return empty DataFrame if no atom_site category found
+        return pd.DataFrame()
+
+    # Extract attribute names and data rows
+    attributes = category.getAttributeList()
+    rows = category.getRowList()
+
+    # Create a list of dictionaries for each atom
+    records = []
+    for row in rows:
+        record = dict(zip(attributes, row))
+
+        # Convert "?" or "." in insertion code to None
+        if "pdbx_PDB_ins_code" in record:
+            if record["pdbx_PDB_ins_code"] in ["?", ".", ""]:
+                record["pdbx_PDB_ins_code"] = None
+
+        records.append(record)
+
+    # Create DataFrame from records
+    df = pd.DataFrame(records)
+
+    # Convert numeric columns to appropriate types
+    numeric_columns = [
+        "id",
+        "auth_seq_id",
+        "Cartn_x",
+        "Cartn_y",
+        "Cartn_z",
+        "occupancy",
+        "B_iso_or_equiv",
+        "pdbx_formal_charge",
+    ]
+
+    for col in numeric_columns:
+        if col in df.columns:
+            df[col] = pd.to_numeric(df[col], errors="coerce")
+
+    # Convert categorical columns
+    categorical_columns = [
+        "group_PDB",
+        "type_symbol",
+        "label_atom_id",
+        "label_comp_id",
+        "label_asym_id",
+        "auth_atom_id",
+        "auth_comp_id",
+        "auth_asym_id",
+    ]
+
+    for col in categorical_columns:
+        if col in df.columns:
+            df[col] = df[col].astype("category")
+
+    # Add format attribute to the DataFrame
+    df.attrs["format"] = "mmCIF"
+
+    return df
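
Both functions above return a plain pandas DataFrame whose attrs["format"] field records the source format ("PDB" or "mmCIF"). A minimal usage sketch of the new parser, not part of the diff; the path "example.pdb" is a hypothetical placeholder:

    from rnapolis.parser_v2 import parse_pdb_atoms

    # Hypothetical input path; any PDB file works the same way
    with open("example.pdb") as f:
        atoms = parse_pdb_atoms(f)

    print(atoms.attrs["format"])  # "PDB"
    print(atoms[["name", "resName", "chainID", "resSeq", "x", "y", "z"]].head())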

rnapolis-0.5.0/src/rnapolis/tertiary_v2.py
@@ -0,0 +1,618 @@
+import string
+from functools import cached_property
+from typing import List, Optional
+
+import numpy as np
+import pandas as pd
+
+# Constants
+AVERAGE_OXYGEN_PHOSPHORUS_DISTANCE_COVALENT = 1.6
+
+
+def calculate_torsion_angle(
+    a1: np.ndarray, a2: np.ndarray, a3: np.ndarray, a4: np.ndarray
+) -> float:
+    """
+    Calculate the torsion angle between four points in 3D space.
+
+    Parameters:
+    -----------
+    a1, a2, a3, a4 : np.ndarray
+        3D coordinates of the four atoms
+
+    Returns:
+    --------
+    float
+        Torsion angle in radians
+    """
+    # Calculate vectors between points
+    v1 = a2 - a1
+    v2 = a3 - a2
+    v3 = a4 - a3
+
+    # Calculate normal vectors
+    n1 = np.cross(v1, v2)
+    n2 = np.cross(v2, v3)
+
+    # Normalize normal vectors
+    n1_norm = np.linalg.norm(n1)
+    n2_norm = np.linalg.norm(n2)
+
+    # Check for collinearity
+    if n1_norm < 1e-6 or n2_norm < 1e-6:
+        return float("nan")
+
+    n1 = n1 / n1_norm
+    n2 = n2 / n2_norm
+
+    # Calculate the angle using dot product
+    m1 = np.cross(n1, v2 / np.linalg.norm(v2))
+    x = np.dot(n1, n2)
+    y = np.dot(m1, n2)
+
+    # Return angle in radians
+    angle = np.arctan2(y, x)
+
+    return angle
+
+
+class Structure:
+    """
+    A class representing a molecular structure parsed from PDB or mmCIF format.
+
+    This class takes a DataFrame created by parser_v2 functions and provides
+    methods to access and manipulate the structure data.
+    """
+
+    def __init__(self, atoms: pd.DataFrame):
+        """
+        Initialize a Structure object with atom data.
+
+        Parameters:
+        -----------
+        atoms : pd.DataFrame
+            DataFrame containing atom data, as created by parse_pdb_atoms or parse_cif_atoms
+        """
+        self.atoms = atoms
+        self.format = atoms.attrs.get("format", "unknown")
+
+    @cached_property
+    def residues(self) -> List["Residue"]:
+        """
+        Group atoms by residue and return a list of Residue objects.
+
+        The grouping logic depends on the format of the input data:
+        - For PDB: group by (chainID, resSeq, iCode)
+        - For mmCIF: group by (label_asym_id, label_seq_id) if present,
+          otherwise by (auth_asym_id, auth_seq_id, pdbx_PDB_ins_code)
+
+        Returns:
+        --------
+        List[Residue]
+            List of Residue objects, each representing a single residue
+        """
+        if self.format == "PDB":
+            # Group by chain ID, residue sequence number, and insertion code
+            groupby_cols = ["chainID", "resSeq", "iCode"]
+
+            # Filter out columns that don't exist in the DataFrame
+            groupby_cols = [col for col in groupby_cols if col in self.atoms.columns]
+
+            # Group atoms by residue
+            grouped = self.atoms.groupby(groupby_cols, dropna=False, observed=False)
+
+        elif self.format == "mmCIF":
+            # Prefer auth_* columns if they exist
+            if (
+                "auth_asym_id" in self.atoms.columns
+                and "auth_seq_id" in self.atoms.columns
+            ):
+                groupby_cols = ["auth_asym_id", "auth_seq_id"]
+
+                # Add insertion code if it exists
+                if "pdbx_PDB_ins_code" in self.atoms.columns:
+                    groupby_cols.append("pdbx_PDB_ins_code")
+            else:
+                # Fall back to label_* columns
+                groupby_cols = ["label_asym_id", "label_seq_id"]
+
+                # Add insertion code if it exists
+                if "pdbx_PDB_ins_code" in self.atoms.columns:
+                    groupby_cols.append("pdbx_PDB_ins_code")
+
+            # Group atoms by residue
+            grouped = self.atoms.groupby(groupby_cols, dropna=False, observed=False)
+
+        else:
+            # For unknown formats, return an empty list
+            return []
+
+        # Convert groups to a list of DataFrames
+        residue_dfs = []
+        for _, group in grouped:
+            # Create a copy of the group DataFrame
+            residue_df = group.copy()
+
+            # Preserve the format attribute
+            residue_df.attrs["format"] = self.format
+
+            residue_dfs.append(residue_df)
+
+        # Convert groups to a list of Residue objects
+        residues = []
+        for _, group in grouped:
+            # Create a copy of the group DataFrame
+            residue_df = group.copy()
+
+            # Preserve the format attribute
+            residue_df.attrs["format"] = self.format
+
+            # Create a Residue object
+            residues.append(Residue(residue_df))
+
+        return residues
+
+    @cached_property
+    def connected_residues(self) -> List[List["Residue"]]:
+        """
+        Find segments of connected residues in the structure.
+
+        Returns:
+        --------
+        List[List[Residue]]
+            List of segments, where each segment is a list of connected residues
+        """
+        # Group residues by chain
+        residues_by_chain = {}
+        for residue in self.residues:
+            chain_id = residue.chain_id
+            if chain_id not in residues_by_chain:
+                residues_by_chain[chain_id] = []
+            residues_by_chain[chain_id].append(residue)
+
+        # Sort residues in each chain by residue number
+        for chain_id in residues_by_chain:
+            residues_by_chain[chain_id].sort(
+                key=lambda r: (r.residue_number, r.insertion_code or "")
+            )
+
+        # Find connected segments in each chain
+        segments = []
+        for chain_id, chain_residues in residues_by_chain.items():
+            current_segment = []
+
+            for residue in chain_residues:
+                if not current_segment:
+                    # Start a new segment
+                    current_segment.append(residue)
+                else:
+                    # Check if this residue is connected to the previous one
+                    prev_residue = current_segment[-1]
+                    if prev_residue.is_connected(residue):
+                        current_segment.append(residue)
+                    else:
+                        # End the current segment and start a new one
+                        if (
+                            len(current_segment) > 1
+                        ):  # Only add segments with at least 2 residues
+                            segments.append(current_segment)
+                        current_segment = [residue]
+
+            # Add the last segment if it has at least 2 residues
+            if len(current_segment) > 1:
+                segments.append(current_segment)
+
+        return segments
+
+    @cached_property
+    def torsion_angles(self) -> pd.DataFrame:
+        """
+        Calculate torsion angles for all connected residues in the structure.
+
+        Returns:
+        --------
+        pd.DataFrame
+            DataFrame containing torsion angle values for each residue
+        """
+        # Find connected segments
+        segments = self.connected_residues
+
+        # Prepare data for the DataFrame
+        data = []
+
+        # Define the torsion angles to calculate
+        torsion_definitions = {
+            "alpha": [("O3'", -1), ("P", 0), ("O5'", 0), ("C5'", 0)],
+            "beta": [("P", 0), ("O5'", 0), ("C5'", 0), ("C4'", 0)],
+            "gamma": [("O5'", 0), ("C5'", 0), ("C4'", 0), ("C3'", 0)],
+            "delta": [("C5'", 0), ("C4'", 0), ("C3'", 0), ("O3'", 0)],
+            "epsilon": [("C4'", 0), ("C3'", 0), ("O3'", 0), ("P", 1)],
+            "zeta": [("C3'", 0), ("O3'", 0), ("P", 1), ("O5'", 1)],
+            "chi": None,  # Will be handled separately due to purine/pyrimidine difference
+        }
+
+        # Process each segment
+        for segment in segments:
+            for i, residue in enumerate(segment):
+                # Prepare row data
+                row = {
+                    "chain_id": residue.chain_id,
+                    "residue_number": residue.residue_number,
+                    "insertion_code": residue.insertion_code,
+                    "residue_name": residue.residue_name,
+                }
+
+                # Calculate standard torsion angles
+                for angle_name, atoms_def in torsion_definitions.items():
+                    if angle_name == "chi":
+                        continue  # Skip chi for now
+
+                    if angle_name == "alpha" and i == 0:
+                        continue  # Skip alpha for the second residue
+
+                    if angle_name in ["epsilon", "zeta"] and i == len(segment) - 1:
+                        continue  # Skip epsilon and zeta for the second-to-last residue
+
+                    # Get the atoms for this angle
+                    atoms = []
+                    valid = True
+
+                    for atom_name, offset in atoms_def:
+                        res_idx = i + offset
+                        if 0 <= res_idx < len(segment):
+                            atom = segment[res_idx].find_atom(atom_name)
+                            if atom is not None:
+                                atoms.append(atom.coordinates)
+                            else:
+                                valid = False
+                                break
+                        else:
+                            valid = False
+                            break
+
+                    # Calculate the angle if all atoms were found
+                    if valid and len(atoms) == 4:
+                        angle = calculate_torsion_angle(
+                            atoms[0], atoms[1], atoms[2], atoms[3]
+                        )
+                        row[angle_name] = angle
+                    else:
+                        row[angle_name] = None
+
+                # Calculate chi angle based on residue type
+                # Pyrimidines: O4'-C1'-N1-C2
+                # Purines: O4'-C1'-N9-C4
+                purine_bases = ["A", "G", "DA", "DG"]
+                pyrimidine_bases = ["C", "U", "T", "DC", "DT"]
+
+                o4_prime = residue.find_atom("O4'")
+                c1_prime = residue.find_atom("C1'")
+
+                if o4_prime is not None and c1_prime is not None:
+                    if residue.residue_name in purine_bases:
+                        n9 = residue.find_atom("N9")
+                        c4 = residue.find_atom("C4")
+                        if n9 is not None and c4 is not None:
+                            chi = calculate_torsion_angle(
+                                o4_prime.coordinates,
+                                c1_prime.coordinates,
+                                n9.coordinates,
+                                c4.coordinates,
+                            )
+                            row["chi"] = chi
+                    elif residue.residue_name in pyrimidine_bases:
+                        n1 = residue.find_atom("N1")
+                        c2 = residue.find_atom("C2")
+                        if n1 is not None and c2 is not None:
+                            chi = calculate_torsion_angle(
+                                o4_prime.coordinates,
+                                c1_prime.coordinates,
+                                n1.coordinates,
+                                c2.coordinates,
+                            )
+                            row["chi"] = chi
+
+                data.append(row)
+
+        # Create DataFrame
+        if not data:
+            # Return empty DataFrame with correct columns
+            return pd.DataFrame(
+                columns=[
+                    "chain_id",
+                    "residue_number",
+                    "insertion_code",
+                    "residue_name",
+                    "alpha",
+                    "beta",
+                    "gamma",
+                    "delta",
+                    "epsilon",
+                    "zeta",
+                    "chi",
+                ]
+            )
+
+        df = pd.DataFrame(data)
+
+        # Ensure all angle columns exist
+        for angle in ["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "chi"]:
+            if angle not in df.columns:
+                df[angle] = None
+
+        # Reorder columns to ensure consistent order
+        ordered_columns = [
+            "chain_id",
+            "residue_number",
+            "insertion_code",
+            "residue_name",
+            "alpha",
+            "beta",
+            "gamma",
+            "delta",
+            "epsilon",
+            "zeta",
+            "chi",
+        ]
+        df = df[ordered_columns]
+
+        return df
+
+
+class Residue:
+    """
+    A class representing a single residue in a molecular structure.
+
+    This class encapsulates a DataFrame containing atoms belonging to a single residue
+    and provides methods to access residue properties.
+    """
+
+    def __init__(self, residue_df: pd.DataFrame):
+        """
+        Initialize a Residue object with atom data for a single residue.
+
+        Parameters:
+        -----------
+        residue_df : pd.DataFrame
+            DataFrame containing atom data for a single residue
+        """
+        self.atoms = residue_df
+        self.format = residue_df.attrs.get("format", "unknown")
+
+    @cached_property
+    def chain_id(self) -> str:
+        """Get the chain identifier for this residue."""
+        if self.format == "PDB":
+            return self.atoms["chainID"].iloc[0]
+        elif self.format == "mmCIF":
+            if "auth_asym_id" in self.atoms.columns:
+                return self.atoms["auth_asym_id"].iloc[0]
+            else:
+                return self.atoms["label_asym_id"].iloc[0]
+        return ""
+
+    @cached_property
+    def residue_number(self) -> int:
+        """Get the residue sequence number."""
+        if self.format == "PDB":
+            return int(self.atoms["resSeq"].iloc[0])
+        elif self.format == "mmCIF":
+            if "auth_seq_id" in self.atoms.columns:
+                return int(self.atoms["auth_seq_id"].iloc[0])
+            else:
+                return int(self.atoms["label_seq_id"].iloc[0])
+        return 0
+
+    @cached_property
+    def insertion_code(self) -> Optional[str]:
+        """Get the insertion code, if any."""
+        if self.format == "PDB":
+            icode = self.atoms["iCode"].iloc[0]
+            return icode if pd.notna(icode) else None
+        elif self.format == "mmCIF":
+            if "pdbx_PDB_ins_code" in self.atoms.columns:
+                icode = self.atoms["pdbx_PDB_ins_code"].iloc[0]
+                return icode if pd.notna(icode) else None
+        return None
+
+    @cached_property
+    def residue_name(self) -> str:
+        """Get the residue name (e.g., 'A', 'G', 'C', 'U', etc.)."""
+        if self.format == "PDB":
+            return self.atoms["resName"].iloc[0]
+        elif self.format == "mmCIF":
+            if "auth_comp_id" in self.atoms.columns:
+                return self.atoms["auth_comp_id"].iloc[0]
+            else:
+                return self.atoms["label_comp_id"].iloc[0]
+        return ""
+
+    @cached_property
+    def atoms_list(self) -> List["Atom"]:
+        """Get a list of all atoms in this residue."""
+        return [Atom(self.atoms.iloc[i], self.format) for i in range(len(self.atoms))]
+
+    def find_atom(self, atom_name: str) -> Optional["Atom"]:
+        """
+        Find an atom by name in this residue.
+
+        Parameters:
+        -----------
+        atom_name : str
+            Name of the atom to find
+
+        Returns:
+        --------
+        Optional[Atom]
+            The Atom object, or None if not found
+        """
+        if self.format == "PDB":
+            mask = self.atoms["name"] == atom_name
+            atoms_df = self.atoms[mask]
+            if len(atoms_df) > 0:
+                return Atom(atoms_df.iloc[0], self.format)
+        elif self.format == "mmCIF":
+            if "auth_atom_id" in self.atoms.columns:
+                mask = self.atoms["auth_atom_id"] == atom_name
+                atoms_df = self.atoms[mask]
+                if len(atoms_df) > 0:
+                    return Atom(atoms_df.iloc[0], self.format)
+            else:
+                mask = self.atoms["label_atom_id"] == atom_name
+                atoms_df = self.atoms[mask]
+                if len(atoms_df) > 0:
+                    return Atom(atoms_df.iloc[0], self.format)
+        return None
+
+    def is_connected(self, next_residue_candidate: "Residue") -> bool:
+        """
+        Check if this residue is connected to the next residue candidate.
+
+        The connection is determined by the distance between the O3' atom of this residue
+        and the P atom of the next residue. If the distance is less than 1.5 times the
+        average O-P covalent bond distance, the residues are considered connected.
+
+        Parameters:
+        -----------
+        next_residue_candidate : Residue
+            The residue to check for connection
+
+        Returns:
+        --------
+        bool
+            True if the residues are connected, False otherwise
+        """
+        o3p = self.find_atom("O3'")
+        p = next_residue_candidate.find_atom("P")
+
+        if o3p is not None and p is not None:
+            distance = np.linalg.norm(o3p.coordinates - p.coordinates).item()
+            return distance < 1.5 * AVERAGE_OXYGEN_PHOSPHORUS_DISTANCE_COVALENT
+
+        return False
+
+    def __str__(self) -> str:
+        """String representation of the residue."""
+        # Start with chain ID and residue name
+        if self.chain_id.isspace() or not self.chain_id:
+            builder = f"{self.residue_name}"
+        else:
+            builder = f"{self.chain_id}.{self.residue_name}"
+
+        # Add a separator if the residue name ends with a digit
+        if len(self.residue_name) > 0 and self.residue_name[-1] in string.digits:
+            builder += "/"
+
+        # Add residue number
+        builder += f"{self.residue_number}"
+
+        # Add insertion code if present
+        if self.insertion_code is not None:
+            builder += f"^{self.insertion_code}"
+
+        return builder
+
+    def __repr__(self) -> str:
+        """Detailed string representation of the residue."""
+        return f"Residue({self.__str__()}, {len(self.atoms)} atoms)"
+
+
+class Atom:
+    """
+    A class representing a single atom in a molecular structure.
+
+    This class encapsulates a pandas Series containing data for a single atom
+    and provides methods to access atom properties.
+    """
+
+    def __init__(self, atom_data: pd.Series, format: str):
+        """
+        Initialize an Atom object with atom data.
+
+        Parameters:
+        -----------
+        atom_data : pd.Series
+            Series containing data for a single atom
+        format : str
+            Format of the data ('PDB' or 'mmCIF')
+        """
+        self.data = atom_data
+        self.format = format
+
+    @cached_property
+    def name(self) -> str:
+        """Get the atom name."""
+        if self.format == "PDB":
+            return self.data["name"]
+        elif self.format == "mmCIF":
+            if "auth_atom_id" in self.data:
+                return self.data["auth_atom_id"]
+            else:
+                return self.data["label_atom_id"]
+        return ""
+
+    @cached_property
+    def element(self) -> str:
+        """Get the element symbol."""
+        if self.format == "PDB":
+            return self.data["element"]
+        elif self.format == "mmCIF":
+            if "type_symbol" in self.data:
+                return self.data["type_symbol"]
+        return ""
+
+    @cached_property
+    def coordinates(self) -> np.ndarray:
+        """Get the 3D coordinates of the atom."""
+        if self.format == "PDB":
+            return np.array([self.data["x"], self.data["y"], self.data["z"]])
+        elif self.format == "mmCIF":
+            return np.array(
+                [self.data["Cartn_x"], self.data["Cartn_y"], self.data["Cartn_z"]]
+            )
+        return np.array([0.0, 0.0, 0.0])
+
+    @cached_property
+    def occupancy(self) -> float:
+        """Get the occupancy value."""
+        if self.format == "PDB":
+            return (
+                float(self.data["occupancy"])
+                if pd.notna(self.data["occupancy"])
+                else 1.0
+            )
+        elif self.format == "mmCIF":
+            if "occupancy" in self.data:
+                return (
+                    float(self.data["occupancy"])
+                    if pd.notna(self.data["occupancy"])
+                    else 1.0
+                )
+        return 1.0
+
+    @cached_property
+    def temperature_factor(self) -> float:
+        """Get the temperature factor (B-factor)."""
+        if self.format == "PDB":
+            return (
+                float(self.data["tempFactor"])
+                if pd.notna(self.data["tempFactor"])
+                else 0.0
+            )
+        elif self.format == "mmCIF":
+            if "B_iso_or_equiv" in self.data:
+                return (
+                    float(self.data["B_iso_or_equiv"])
+                    if pd.notna(self.data["B_iso_or_equiv"])
+                    else 0.0
+                )
+        return 0.0
+
+    def __str__(self) -> str:
+        """String representation of the atom."""
+        return f"{self.name} ({self.element})"
+
+    def __repr__(self) -> str:
+        """Detailed string representation of the atom."""
+        coords = self.coordinates
+        return f"Atom({self.name}, {self.element}, [{coords[0]:.3f}, {coords[1]:.3f}, {coords[2]:.3f}])"
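
The new classes consume the DataFrames produced by parser_v2 directly. A minimal sketch of the intended flow, not part of the diff; the path "example.cif" is a hypothetical placeholder:

    from rnapolis.parser_v2 import parse_cif_atoms
    from rnapolis.tertiary_v2 import Structure

    # Hypothetical input path; parse_cif_atoms reads a file-like object via its .name attribute
    with open("example.cif") as f:
        structure = Structure(parse_cif_atoms(f))

    print(len(structure.residues))        # residues grouped by chain, number, insertion code
    print(structure.residues[0])          # formatted by Residue.__str__, e.g. "A.G1"
    angles = structure.torsion_angles     # DataFrame with alpha..zeta and chi, in radians
    print(angles[["chain_id", "residue_number", "chi"]].head())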

rnapolis-0.5.0/tests/test_v2.py
@@ -0,0 +1,237 @@
+import os
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from rnapolis.parser_v2 import parse_cif_atoms, parse_pdb_atoms
+from rnapolis.tertiary_v2 import Structure
+
+
+@pytest.fixture
+def data_dir():
+    """Return the path to the test data directory."""
+    return os.path.join(os.path.dirname(__file__))
+
+
+def test_parse_4qln_formats(data_dir):
+    """Test parsing 4qln in both PDB and mmCIF formats and compare residues and torsion angles."""
+    # Load PDB and mmCIF files
+    pdb_path = os.path.join(data_dir, "4qln.pdb")
+    cif_path = os.path.join(data_dir, "4qln.cif")
+
+    # Skip test if files don't exist
+    if not (os.path.exists(pdb_path) and os.path.exists(cif_path)):
+        pytest.skip(f"Test files not found: {pdb_path} or {cif_path}")
+
+    # Parse both formats
+    with open(pdb_path, "r") as pdb_file:
+        pdb_atoms = parse_pdb_atoms(pdb_file)
+
+    with open(cif_path, "r") as cif_file:
+        cif_atoms = parse_cif_atoms(cif_file)
+
+    # Create structures
+    pdb_structure = Structure(pdb_atoms)
+    cif_structure = Structure(cif_atoms)
+
+    # Get residues
+    pdb_residues = pdb_structure.residues
+    cif_residues = cif_structure.residues
+
+    # Basic checks
+    assert len(pdb_residues) > 0, "No residues found in PDB file"
+    assert len(cif_residues) > 0, "No residues found in mmCIF file"
+
+    # Compare residue counts
+    assert len(pdb_residues) == len(cif_residues), (
+        f"Different number of residues: PDB={len(pdb_residues)}, mmCIF={len(cif_residues)}"
+    )
+
+    # Compare residue identifiers
+    pdb_residue_ids = [
+        (r.chain_id, r.residue_number, r.insertion_code) for r in pdb_residues
+    ]
+    cif_residue_ids = [
+        (r.chain_id, r.residue_number, r.insertion_code) for r in cif_residues
+    ]
+
+    # Sort both lists to ensure consistent ordering
+    pdb_residue_ids.sort()
+    cif_residue_ids.sort()
+
+    # Check if residue identifiers match
+    for i, (pdb_id, cif_id) in enumerate(zip(pdb_residue_ids, cif_residue_ids)):
+        assert pdb_id == cif_id, (
+            f"Residue mismatch at position {i}: PDB={pdb_id}, mmCIF={cif_id}"
+        )
+
+    # Create a mapping from residue ID to residue name for both formats
+    pdb_id_to_name = {
+        (r.chain_id, r.residue_number, r.insertion_code): r.residue_name
+        for r in pdb_residues
+    }
+    cif_id_to_name = {
+        (r.chain_id, r.residue_number, r.insertion_code): r.residue_name
+        for r in cif_residues
+    }
+
+    # Check if residue names match for each residue ID
+    for res_id in pdb_id_to_name:
+        assert res_id in cif_id_to_name, f"Residue ID {res_id} not found in mmCIF"
+        assert pdb_id_to_name[res_id] == cif_id_to_name[res_id], (
+            f"Residue name mismatch for {res_id}: PDB={pdb_id_to_name[res_id]}, mmCIF={cif_id_to_name[res_id]}"
+        )
+
+    # Calculate torsion angles for both structures
+    pdb_torsion_df = pdb_structure.torsion_angles
+    cif_torsion_df = cif_structure.torsion_angles
+
+    # Check if torsion angle DataFrames have the same shape
+    assert pdb_torsion_df.shape == cif_torsion_df.shape, (
+        f"Different torsion angle DataFrame shapes: PDB={pdb_torsion_df.shape}, mmCIF={cif_torsion_df.shape}"
+    )
+
+    # Sort both DataFrames by chain_id, residue_number, and insertion_code for consistent comparison
+    pdb_torsion_df = pdb_torsion_df.sort_values(
+        by=["chain_id", "residue_number", "insertion_code"]
+    ).reset_index(drop=True)
+
+    cif_torsion_df = cif_torsion_df.sort_values(
+        by=["chain_id", "residue_number", "insertion_code"]
+    ).reset_index(drop=True)
+
+    # Compare residue identifiers in torsion angle DataFrames
+    pd.testing.assert_series_equal(
+        pdb_torsion_df["chain_id"],
+        cif_torsion_df["chain_id"],
+        check_names=False,
+        check_dtype=False,
+    )
+    pd.testing.assert_series_equal(
+        pdb_torsion_df["residue_number"],
+        cif_torsion_df["residue_number"],
+        check_names=False,
+        check_dtype=False,
+    )
+    pd.testing.assert_series_equal(
+        pdb_torsion_df["residue_name"],
+        cif_torsion_df["residue_name"],
+        check_names=False,
+        check_dtype=False,
+    )
+
+    # Compare torsion angle values with a tolerance
+    angle_columns = ["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "chi"]
+    for col in angle_columns:
+        # Skip columns that might not exist in both DataFrames
+        if col not in pdb_torsion_df.columns or col not in cif_torsion_df.columns:
+            continue
+
+        # Get non-NaN values that exist in both DataFrames
+        pdb_values = pdb_torsion_df[col]
+        cif_values = cif_torsion_df[col]
+
+        # Check if the same values are NaN in both DataFrames
+        assert pdb_values.isna().equals(cif_values.isna()), (
+            f"Different NaN patterns in {col} angle"
+        )
+
+        # Compare non-NaN values with tolerance
+        mask = ~pdb_values.isna()
+        if mask.any():
+            pdb_non_nan = pdb_values[mask].values
+            cif_non_nan = cif_values[mask].values
+
+            # Allow a small tolerance for floating-point differences
+            np.testing.assert_allclose(
+                pdb_non_nan,
+                cif_non_nan,
+                rtol=1e-5,
+                atol=1e-5,
+                err_msg=f"Torsion angle values for {col} don't match between PDB and mmCIF",
+            )
+
+
+def test_torsion_angle_calculation():
+    """Test the torsion angle calculation function."""
+    # Define four points that form a known torsion angle
+    a1 = np.array([1.0, 0.0, 0.0])
+    a2 = np.array([0.0, 0.0, 0.0])
+    a3 = np.array([0.0, 1.0, 0.0])
+    a4 = np.array([0.0, 1.0, 1.0])
+
+    # Calculate the torsion angle
+    from rnapolis.tertiary_v2 import calculate_torsion_angle
+
+    angle = calculate_torsion_angle(a1, a2, a3, a4)
+
+    # The expected angle is pi/2 radians (90 degrees)
+    assert abs(angle - np.pi / 2) < 1e-6, (
+        f"Expected angle close to pi/2 radians, got {angle}"
+    )
+
+    # Test with collinear points
+    a1 = np.array([0.0, 0.0, 0.0])
+    a2 = np.array([1.0, 0.0, 0.0])
+    a3 = np.array([2.0, 0.0, 0.0])
+    a4 = np.array([3.0, 0.0, 0.0])
+
+    angle = calculate_torsion_angle(a1, a2, a3, a4)
+    assert np.isnan(angle), f"Expected NaN for collinear points, got {angle}"
+
+
+def test_connected_residues_and_torsion_angles(data_dir):
+    """Test finding connected residues and calculating torsion angles."""
+    # Load PDB file
+    pdb_path = os.path.join(data_dir, "4qln.pdb")
+
+    # Skip test if file doesn't exist
+    if not os.path.exists(pdb_path):
+        pytest.skip(f"Test file not found: {pdb_path}")
+
+    # Parse PDB file
+    with open(pdb_path, "r") as pdb_file:
+        pdb_atoms = parse_pdb_atoms(pdb_file)
+
+    # Create structure
+    structure = Structure(pdb_atoms)
+
+    # Find connected residues
+    segments = structure.connected_residues
+
+    # Check that we found at least one segment
+    assert len(segments) > 0, "No connected residue segments found"
+
+    # Check that each segment has at least 2 residues
+    for segment in segments:
+        assert len(segment) >= 2, f"Segment has fewer than 2 residues: {segment}"
+
+    # Calculate torsion angles
+    torsion_df = structure.torsion_angles
+
+    # Check that the DataFrame has the expected columns
+    expected_columns = [
+        "chain_id",
+        "residue_number",
+        "insertion_code",
+        "residue_name",
+        "alpha",
+        "beta",
+        "gamma",
+        "delta",
+        "epsilon",
+        "zeta",
+        "chi",
+    ]
+    for col in expected_columns:
+        assert col in torsion_df.columns, (
+            f"Expected column {col} not found in torsion angles DataFrame"
+        )
+
+    # Check that we have some torsion angle values
+    assert len(torsion_df) > 0, "No torsion angles calculated"
+
+    # Check that at least some angles are not null
+    for angle in ["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "chi"]:
+        assert torsion_df[angle].notna().any(), f"No valid {angle} angles calculated"
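
The tests above expect 4qln.pdb and 4qln.cif to sit next to the test module and skip themselves otherwise; assuming a normal checkout of the repository, they can be run with a standard pytest invocation, e.g. `python -m pytest tests/test_v2.py`.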