PyPI - gemmi-protools - Versions diffs - 0.1.0__py3-none-any.whl - Mend

gemmi-protools 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gemmi-protools might be problematic. Click here for more details.

Files changed (16) hide show

gemmi_protools/__init__.py +8 -0
gemmi_protools/align.py +183 -0
gemmi_protools/cif_opts.py +167 -0
gemmi_protools/convert.py +96 -0
gemmi_protools/dockq.py +139 -0
gemmi_protools/parse_pdb_header.py +387 -0
gemmi_protools/parser.py +279 -0
gemmi_protools/pdb_opts.py +177 -0
gemmi_protools/ppi.py +74 -0
gemmi_protools/reader.py +371 -0
gemmi_protools/struct_info.py +91 -0
gemmi_protools-0.1.0.dist-info/METADATA +19 -0
gemmi_protools-0.1.0.dist-info/RECORD +16 -0
gemmi_protools-0.1.0.dist-info/WHEEL +5 -0
gemmi_protools-0.1.0.dist-info/licenses/LICENSE +21 -0
gemmi_protools-0.1.0.dist-info/top_level.txt +1 -0

gemmi_protools/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""
+@Author: Luo Jiejian
+"""
+from .reader import StructureParser
+from .convert import gemmi2bio, bio2gemmi
+from .align import StructureAligner
+from .ppi import ppi_interface_residues
+from .dockq import dockq_score, dockq_score_interface

gemmi_protools/align.py ADDED Viewed

@@ -0,0 +1,183 @@
+"""
+@Author: Luo Jiejian
+"""
+import os
+import pathlib
+import re
+import shutil
+import subprocess
+import tempfile
+import uuid
+from copy import deepcopy
+from typing import Union, Dict, Any, List, Optional
+import numpy as np
+from Bio.PDB import Superimposer
+from typeguard import typechecked
+from .convert import gemmi2bio, bio2gemmi
+from .reader import StructureParser
+class StructureAligner(object):
+    @typechecked
+    def __init__(self, query_path: Union[str, pathlib.Path], ref_path: Union[str, pathlib.Path]):
+        self._query_st = StructureParser()
+        self._query_st.load_from_file(query_path)
+        self._query_st.set_default_model()
+        self._ref_st = StructureParser()
+        self._ref_st.load_from_file(ref_path)
+        self._ref_st.set_default_model()
+        self.values = dict()
+        self.rot_mat = None
+        self.is_aligned = False
+        self.by_query = None
+        self.by_ref = None
+        self.query_path = query_path
+        self.ref_path = ref_path
+    @property
+    def __mmalign_path(self):
+        _path = shutil.which("MMAlign") or shutil.which("MMalign")
+        if _path is None:
+            raise RuntimeError("Executable program MMAlign is not found. "
+                               "Download from https://zhanggroup.org/MM-align/ ."
+                               "Build it and add MMAlign to environment PATH")
+        else:
+            return _path
+    @staticmethod
+    @typechecked
+    def __parser_rotation_matrix(matrix_file: Union[str, pathlib.Path]):
+        rotation_matrix = []
+        translation_vector = []
+        with open(matrix_file, 'r') as file:
+            lines = file.readlines()
+            values = lines[2:5]
+            for cur_line in values:
+                tmp = re.split(pattern=r"\s+", string=cur_line.strip())
+                assert len(tmp) == 5
+                rotation_matrix.append(tmp[2:])
+                translation_vector.append(tmp[1])
+        return dict(R=np.array(rotation_matrix).astype(np.float32),
+                    T=np.array(translation_vector).astype(np.float32))
+    @staticmethod
+    @typechecked
+    def __parse_terminal_outputs(output_string: str) -> Dict[str, Any]:
+        lines = re.split(pattern=r"\n", string=output_string)
+        # chain mapping
+        patterns = dict(query_chain_ids=r"Structure_1.+\.pdb:([\w:]+)",
+                        ref_chain_ids=r"Structure_2.+\.pdb:([\w:]+)",
+                        query_total_length=r"Length of Structure_1.*?(\d+).*residues",
+                        ref_total_length=r"Length of Structure_2.*?(\d+).*residues",
+                        aligned_length=r"Aligned length=.*?(\d+)",
+                        rmsd=r"RMSD=.*?([\d.]+)",
+                        tmscore_by_query=r"TM-score=.*?([\d.]+).+Structure_1",
+                        tmscore_by_ref=r"TM-score=.*?([\d.]+).+Structure_2",
+                        aligned_seq_start=r"denotes other aligned residues",
+                        )
+        values = dict()
+        for idx, line in enumerate(lines):
+            current_keys = list(patterns.keys())
+            for key in current_keys:
+                tmp = re.search(patterns[key], line)
+                if tmp:
+                    if key in ['query_chain_ids', 'ref_chain_ids']:
+                        values[key] = re.split(pattern=":", string=tmp.groups()[0])
+                        del patterns[key]
+                    elif key in ['query_total_length', 'ref_total_length', 'aligned_length']:
+                        values[key] = int(tmp.groups()[0])
+                        del patterns[key]
+                    elif key in ['rmsd', 'tmscore_by_query', 'tmscore_by_ref']:
+                        values[key] = float(tmp.groups()[0])
+                        del patterns[key]
+                    elif key == "aligned_seq_start":
+                        # idx + 1 and idx + 3 for aligned sequences 1 and 2
+                        seq_1 = lines[idx + 1]
+                        seq_2 = lines[idx + 3]
+                        sp1 = re.split(pattern=r"\*", string=seq_1)
+                        sp2 = re.split(pattern=r"\*", string=seq_2)
+                        values["query_sequences"] = sp1[:-1] if "*" in seq_1 else sp1
+                        values["ref_sequences"] = sp2[:-1] if "*" in seq_2 else sp2
+                        del patterns[key]
+        return values
+    @typechecked
+    def make_alignment(self, query_chains: Optional[List[str]] = None,
+                       ref_chains: Optional[List[str]] = None, timeout=300.0):
+        """
+        :param query_chains: list, None for all chains
+        :param ref_chains: list, None for all chains
+        :param timeout: default 300
+        :return:
+        """
+        program_path = self.__mmalign_path
+        # clone
+        q_st = deepcopy(self._query_st)
+        r_st = deepcopy(self._ref_st)
+        tmp_dir = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
+        os.makedirs(tmp_dir)
+        if isinstance(query_chains, list):
+            q_st.pick_chains(query_chains)
+        if isinstance(ref_chains, list):
+            r_st.pick_chains(ref_chains)
+        _tmp_a = os.path.join(tmp_dir, "a.pdb")
+        q_st.to_pdb(_tmp_a)
+        _tmp_b = os.path.join(tmp_dir, "b.pdb")
+        r_st.to_pdb(_tmp_b)
+        matrix_file = os.path.join(tmp_dir, "m.txt")
+        _command = "%s %s %s -m %s" % (program_path, _tmp_a, _tmp_b, matrix_file)
+        try:
+            result = subprocess.run(_command, shell=True, check=True,
+                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                                    timeout=timeout)
+        except Exception as e:
+            print("%s: between files %s and %s; between chains: %s and %s" % (
+                str(e), self.query_path, self.ref_path,
+                str(q_st.chain_ids), str(r_st.chain_ids))
+                  )
+        else:
+            self.values = self.__parse_terminal_outputs(result.stdout.decode())
+            self.rot_mat = self.__parser_rotation_matrix(matrix_file)
+            self.is_aligned = True
+            self.by_query = q_st.chain_ids if query_chains is None else query_chains
+            self.by_ref = r_st.chain_ids if ref_chains is None else ref_chains
+        finally:
+            if os.path.isdir(tmp_dir):
+                shutil.rmtree(tmp_dir)
+    @typechecked
+    def save_aligned_query(self, out_file: str):
+        """
+        :param out_file: .cif file
+        :return:
+        """
+        if not self.is_aligned:
+            raise RuntimeError("structure not aligned, run make_alignment first")
+        super_imposer = Superimposer()
+        super_imposer.rotran = (self.rot_mat["R"].T, self.rot_mat["T"])
+        bio_s = gemmi2bio(self._query_st.STRUCT)
+        super_imposer.apply(bio_s)
+        query_st_aligned = bio2gemmi(bio_s)
+        block = query_st_aligned.make_mmcif_block()
+        block.write_file(out_file)

gemmi_protools/cif_opts.py ADDED Viewed

@@ -0,0 +1,167 @@
+"""
+@Author: Luo Jiejian
+"""
+import pathlib
+from typing import Union, Dict, Any
+import gemmi
+import pandas as pd
+from typeguard import typechecked
+from .struct_info import Entity
+@typechecked
+def _is_cif(path: Union[str, pathlib.Path]) -> bool:
+    if isinstance(path, str):
+        path = pathlib.Path(path)
+    if path.suffixes:
+        if path.suffixes[-1] == ".cif":
+            return True
+        elif "".join(path.suffixes[-2:]) == ".cif.gz":
+            return True
+        else:
+            return False
+    else:
+        return False
+@typechecked
+def _value_mapper_from_block(block: gemmi.cif.Block, category: str, column1: str, column2: str,
+                             expand_column1: bool = False) -> Dict[str, Any]:
+    """
+    mapper from column1 to column2
+    :param block:
+    :param category:
+    :param column1:
+    :param column2:
+    :param expand_column1: bool, if True, values joint by comma in column1 with be split
+    :return:
+    Only return a mapper when both column1 and column2 in category
+    """
+    loop = block.find_mmcif_category(category)
+    tags = list(loop.tags)
+    results = dict()
+    if column1 in tags:
+        values1 = loop.column(tags.index(column1))
+        v1 = [values1.str(i) for i in range(len(values1))]
+        if column2 in tags:
+            values2 = loop.column(tags.index(column2))
+            v2 = [values2.str(i) for i in range(len(values2))]
+        else:
+            v2 = ["?"] * len(v1)
+        outputs = dict(zip(v1, v2))
+        if expand_column1:
+            outputs_ex = dict()
+            for key, val in outputs.items():
+                tmp = key.split(",")
+                for sk in tmp:
+                    nk = sk.strip()
+                    if nk:
+                        outputs_ex[nk] = val
+            results = outputs_ex
+        else:
+            results = outputs
+    return results
+@typechecked
+def _get_cif_resolution(block: gemmi.cif.Block) -> float:
+    resolution = 0.0
+    for key in ["_reflns.d_resolution_high",
+                "_refine.ls_d_res_high",
+                "_refine_hist.d_res_high",
+                "_em_3d_reconstruction.resolution",
+                ]:
+        v = block.find_value(key)
+        try:
+            vf = float(v)
+        except (TypeError, ValueError):
+            continue
+        else:
+            resolution = vf
+            break
+    return resolution
+@typechecked
+def _cif_entity_info(block: gemmi.cif.Block) -> Entity:
+    entity2description = _value_mapper_from_block(block, category="_entity.",
+                                                  column1="_entity.id",
+                                                  column2="_entity.pdbx_description")
+    polymer2entity = _value_mapper_from_block(block, category="_entity_poly.",
+                                              column1="_entity_poly.pdbx_strand_id",
+                                              column2="_entity_poly.entity_id",
+                                              expand_column1=True)
+    entity2species = _value_mapper_from_block(block, category="_entity_src_gen.",
+                                              column1="_entity_src_gen.entity_id",
+                                              column2="_entity_src_gen.pdbx_gene_src_scientific_name")
+    entity2species.update(_value_mapper_from_block(block, category="_pdbx_entity_src_syn.",
+                                                   column1="_pdbx_entity_src_syn.entity_id",
+                                                   column2="_pdbx_entity_src_syn.organism_scientific")
+                          )
+    entity2species.update(_value_mapper_from_block(block, category="_entity_src_nat.",
+                                                   column1="_entity_src_nat.entity_id",
+                                                   column2="_entity_src_nat.pdbx_organism_scientific")
+                          )
+    entity2taxid = _value_mapper_from_block(block, category="_entity_src_gen.",
+                                            column1="_entity_src_gen.entity_id",
+                                            column2="_entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id")
+    entity2taxid.update(_value_mapper_from_block(block, category="_pdbx_entity_src_syn.",
+                                                 column1="_pdbx_entity_src_syn.entity_id",
+                                                 column2="_pdbx_entity_src_syn.ncbi_taxonomy_id")
+                        )
+    entity2taxid.update(_value_mapper_from_block(block, category="_entity_src_nat.",
+                                                 column1="_entity_src_nat.entity_id",
+                                                 column2="_entity_src_nat.pdbx_ncbi_taxonomy_id")
+                        )
+    vals = dict(eid2desc=entity2description,
+                eid2specie=entity2species,
+                eid2taxid=entity2taxid,
+                polymer2eid=polymer2entity
+                )
+    return Entity(**vals)
+@typechecked
+def _cif_block_for_output(structure: gemmi.Structure, entity: Entity) -> gemmi.cif.Block:
+    """
+    Make the mmCIF block of a structure and add the entity information to it
+    :param structure:
+    :param entity: read through the keys "eid2desc", "eid2specie" and "eid2taxid"
+    :return:
+    the block, with "_entity.pdbx_description" filled in (when the "_entity.id" column exists)
+    and a new "_entity_src_gen." loop
+    """
+    block = structure.make_mmcif_block()
+    # current content of the _entity category as a DataFrame, one column per tag
+    ta = block.find_mmcif_category(category="_entity.")
+    da = pd.DataFrame(list(ta), columns=list(ta.tags))
+    if "_entity.id" in da.columns:
+        # description by entity id, "?" when the id is unknown or the description is blank
+        da["_entity.pdbx_description"] = da["_entity.id"].apply(
+            lambda i: entity["eid2desc"].get(i, "?").strip() or "?")
+        # every value is passed through gemmi.cif.quote before it is written back
+        rows = []
+        for ar in da.to_numpy().tolist():
+            rows.append([gemmi.cif.quote(i) for i in ar])
+        # add the description column to the loop if the block does not have it yet
+        if "_entity.pdbx_description" not in list(ta.tags):
+            ta.loop.add_columns(["_entity.pdbx_description"], "?")
+        # fetch the category again after the column change, then replace all of its rows
+        ta = block.find_mmcif_category(category="_entity.")
+        for _ in range(len(ta)):
+            ta.remove_row(0)
+        for row in rows:
+            ta.append_row(row)
+    # one row per entity id of eid2specie, taxonomy id is "?" when missing in eid2taxid
+    loop = block.init_loop("_entity_src_gen.", ["entity_id",
+                                                "pdbx_gene_src_scientific_name",
+                                                "pdbx_gene_src_ncbi_taxonomy_id"])
+    for k in entity["eid2specie"].keys():
+        loop.add_row([gemmi.cif.quote(k),
+                      gemmi.cif.quote(entity["eid2specie"].get(k, "?")),
+                      gemmi.cif.quote(entity["eid2taxid"].get(k, "?"))]
+                     )
+    # NOTE(review): presumably moves the loop created above (last item) to position 16
+    # of the block - confirm why 16 and whether it holds for every structure
+    block.move_item(-1, 16)
+    return block

gemmi_protools/convert.py ADDED Viewed

@@ -0,0 +1,96 @@
+"""
+@Author: Luo Jiejian
+"""
+import gemmi
+import numpy as np
+from Bio.PDB.Structure import Structure as BioStructure
+from Bio.PDB.StructureBuilder import StructureBuilder
+from typeguard import typechecked
+@typechecked
+def gemmi2bio(gemmi_structure: gemmi.Structure) -> BioStructure:
+    """
+    Convert gemmi structure to biopython structure
+    :param gemmi_structure:
+    :return:
+    return biopython structure
+    """
+    structure_builder = StructureBuilder()
+    structure_builder.init_structure(structure_id=gemmi_structure.name)
+    for model_idx, gemmi_model in enumerate(gemmi_structure):
+        structure_builder.init_model(model_idx)
+        for gemmi_chain in gemmi_model:
+            structure_builder.init_chain(gemmi_chain.name)
+            for gemmi_residue in gemmi_chain:
+                if gemmi_residue.het_flag == "H":
+                    if gemmi_residue.name in ["HOH", "WAT"]:
+                        het_flag = "W"
+                    else:
+                        het_flag = "H"
+                else:
+                    het_flag = " "
+                structure_builder.init_residue(resname=gemmi_residue.name, field=het_flag,
+                                               resseq=gemmi_residue.seqid.num, icode=gemmi_residue.seqid.icode)
+                for gemmi_atom in gemmi_residue:
+                    coord = np.array([gemmi_atom.pos.x, gemmi_atom.pos.y, gemmi_atom.pos.z])
+                    structure_builder.init_atom(name=gemmi_atom.name,
+                                                coord=coord,
+                                                b_factor=gemmi_atom.b_iso,
+                                                occupancy=gemmi_atom.occ,
+                                                altloc=gemmi_atom.altloc if gemmi_atom.has_altloc() else ' ',
+                                                fullname=gemmi_atom.name.center(4),
+                                                serial_number=gemmi_atom.serial,
+                                                element=gemmi_atom.element.name.upper())
+    bio_structure = structure_builder.get_structure()
+    return bio_structure
+@typechecked
+def bio2gemmi(bio_structure: BioStructure) -> gemmi.Structure:
+    """
+    Convert biopython structure to gemmi structure
+    :param bio_structure:
+    :return:
+    return gemmi structure
+    """
+    g_structure = gemmi.Structure()
+    g_structure.name = bio_structure.id
+    for bio_model in bio_structure:
+        # bio model start from 0, gemmi model start from 1
+        g_model = gemmi.Model(bio_model.id + 1)
+        for bio_chain in bio_model:
+            g_chain = gemmi.Chain(bio_chain.id)
+            for bio_residue in bio_chain:
+                g_residue = gemmi.Residue()
+                g_residue.name = bio_residue.resname
+                het_flag, r_num, i_code = bio_residue.id
+                g_residue.seqid.num = r_num
+                g_residue.seqid.icode = i_code
+                g_residue.het_flag = "A" if het_flag == " " else "H"
+                for bio_atom in bio_residue:
+                    g_atom = gemmi.Atom()
+                    g_atom.name = bio_atom.name
+                    g_atom.b_iso = bio_atom.bfactor
+                    g_atom.occ = bio_atom.occupancy
+                    g_atom.altloc = "\x00" if bio_atom.altloc == " " else bio_atom.altloc
+                    g_atom.element = gemmi.Element(bio_atom.element)
+                    g_atom.serial = bio_atom.serial_number
+                    px, py, pz = bio_atom.coord
+                    g_atom.pos = gemmi.Position(px, py, pz)
+                    g_residue.add_atom(g_atom)
+                g_chain.add_residue(g_residue)
+            g_model.add_chain(g_chain)
+        g_structure.add_model(g_model)
+    g_structure.setup_entities()
+    g_structure.assign_het_flags()
+    return g_structure

gemmi_protools/dockq.py ADDED Viewed

@@ -0,0 +1,139 @@
+"""
+@Author: Luo Jiejian
+"""
+import json
+import os
+import pathlib
+import shutil
+import subprocess
+import tempfile
+import uuid
+from typing import Optional, Union
+import pandas as pd
+from typeguard import typechecked
+from .reader import StructureParser
+@typechecked
+def _read_model(model_file: Union[str, pathlib.Path]):
+    st = StructureParser()
+    st.load_from_file(model_file)
+    st.set_default_model()
+    return st
+@typechecked
+def dockq_score(query_model: Union[str, pathlib.Path],
+                native_model: Union[str, pathlib.Path],
+                mapping: Optional[str] = None):
+    dockq_program = shutil.which("DockQ")
+    if dockq_program is None:
+        raise RuntimeError("DockQ is need")
+    q_st = _read_model(query_model)
+    n_st = _read_model(native_model)
+    tmp_dir = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
+    os.makedirs(tmp_dir)
+    result_file = os.path.join(tmp_dir, "result.json")
+    q_file = os.path.join(tmp_dir, "q.pdb")
+    n_file = os.path.join(tmp_dir, "n.pdb")
+    q_st.to_pdb(q_file, write_minimal_pdb=True)
+    n_st.to_pdb(n_file, write_minimal_pdb=True)
+    if mapping is None:
+        cid = "".join(n_st.chain_ids)
+        mapping = cid + ":" + cid
+    _command = "%s --mapping %s --json %s %s %s" % (dockq_program, mapping, result_file, q_file, n_file)
+    metrics = ['DockQ', 'F1', 'chain1', 'chain2']
+    try:
+        _ = subprocess.run(_command, shell=True, check=True,
+                           stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                           timeout=300.0)
+    except subprocess.CalledProcessError as e:
+        # Handle errors in the called executable
+        msg = e.stderr.decode()
+        outputs = pd.DataFrame(columns=metrics)
+    except Exception as e:
+        # Handle other exceptions such as file not found or permissions issues
+        msg = str(e)
+        outputs = pd.DataFrame(columns=metrics)
+    else:
+        with open(result_file, "r") as fin:
+            vals = json.load(fin)
+        msg = "Finished"
+        result = []
+        for v in vals["best_result"].values():
+            result.append(v)
+        outputs = pd.DataFrame(result)[metrics]
+    finally:
+        if os.path.isdir(tmp_dir):
+            shutil.rmtree(tmp_dir)
+    return dict(value=outputs,
+                msg=msg,
+                mapping=mapping,
+                model=query_model,
+                native=native_model
+                )
+def dockq_score_interface(query_model: Union[str, pathlib.Path],
+                          native_model: Union[str, pathlib.Path],
+                          chains_a: str,
+                          chains_b: str):
+    ppi_if = chains_a + "@" + chains_b
+    chs_a = list(chains_a)
+    chs_b = list(chains_b)
+    # if multiple chains, merge to one
+    q_st = _read_model(query_model)
+    n_st = _read_model(native_model)
+    for c in chs_a + chs_b:
+        if c not in q_st.chain_ids:
+            raise RuntimeError("Chain %s is not in the query model: %s" % (c, query_model))
+    for c in chs_a + chs_b:
+        if c not in n_st.chain_ids:
+            raise RuntimeError("Chain %s is not in the native model: %s" % (c, native_model))
+    if len(chs_a) > 1:
+        q_st.merge_chains(chs_a)
+        n_st.merge_chains(chs_a)
+    if len(chs_b) > 1:
+        q_st.merge_chains(chs_b)
+        n_st.merge_chains(chs_b)
+    tmp_dir = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
+    os.makedirs(tmp_dir)
+    q_file = os.path.join(tmp_dir, "qm.pdb")
+    n_file = os.path.join(tmp_dir, "nm.pdb")
+    q_st.to_pdb(q_file, write_minimal_pdb=True)
+    n_st.to_pdb(n_file, write_minimal_pdb=True)
+    chs = chs_a[0] + chs_b[0]
+    result = dockq_score(q_file, n_file, mapping="%s:%s" % (chs, chs))
+    if len(result["value"]) > 0:
+        q_score = round(result["value"].iloc[0]["DockQ"], 4)
+        f1 = round(result["value"].iloc[0]["F1"], 4)
+    else:
+        q_score = ""
+        f1 = ""
+    if os.path.isdir(tmp_dir):
+        shutil.rmtree(tmp_dir)
+    return dict(DockQ=q_score,
+                F1=f1,
+                interface=ppi_if,
+                model=query_model,
+                native=native_model
+                )