PyPI - gemmi-protools - Versions diffs - 0.1.0__py3-none-any.whl - Mend

gemmi-protools 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gemmi-protools might be problematic. Click here for more details.

Files changed (16) hide show

gemmi_protools/__init__.py +8 -0
gemmi_protools/align.py +183 -0
gemmi_protools/cif_opts.py +167 -0
gemmi_protools/convert.py +96 -0
gemmi_protools/dockq.py +139 -0
gemmi_protools/parse_pdb_header.py +387 -0
gemmi_protools/parser.py +279 -0
gemmi_protools/pdb_opts.py +177 -0
gemmi_protools/ppi.py +74 -0
gemmi_protools/reader.py +371 -0
gemmi_protools/struct_info.py +91 -0
gemmi_protools-0.1.0.dist-info/METADATA +19 -0
gemmi_protools-0.1.0.dist-info/RECORD +16 -0
gemmi_protools-0.1.0.dist-info/WHEEL +5 -0
gemmi_protools-0.1.0.dist-info/licenses/LICENSE +21 -0
gemmi_protools-0.1.0.dist-info/top_level.txt +1 -0

gemmi_protools/pdb_opts.py ADDED Viewed

@@ -0,0 +1,177 @@
+"""
+@Author: Luo Jiejian
+"""
+import gzip
+import io
+import pathlib
+import re
+from collections import defaultdict
+from typing import Dict, Union, List
+from typeguard import typechecked
+from .parse_pdb_header import _parse_pdb_header_list
+from .struct_info import Entity
+@typechecked
+def _molecule_information(header_dict: Dict) -> Entity:
+    entity2description = dict()
+    entity2species = dict()
+    entity2taxid = dict()
+    for idx in header_dict["compound"].keys():
+        compound = header_dict["compound"][idx]
+        if "chain" in compound:
+            chain = re.sub(pattern=r"\s+", repl="", string=compound["chain"])
+            if chain != "":
+                tmp = chain.split(",")
+                tmp.sort()
+                key = ",".join(tmp)
+                molecule = compound.get("molecule", "")
+                if idx in header_dict["source"]:
+                    source = header_dict["source"][idx]
+                    specie = source.get("organism_scientific", "")
+                    taxid = source.get("organism_taxid", "")
+                else:
+                    specie = ""
+                    taxid = ""
+                entity2description[key] = molecule
+                entity2species[key] = specie
+                entity2taxid[key] = taxid
+    vals = dict(eid2desc=entity2description,
+                eid2specie=entity2species,
+                eid2taxid=entity2taxid,
+                polymer2eid=dict()
+                )
+    return Entity(**vals)
+@typechecked
+def _is_pdb(path: Union[str, pathlib.Path]) -> bool:
+    if isinstance(path, str):
+        path = pathlib.Path(path)
+    if path.suffixes:
+        if path.suffixes[-1] == ".pdb":
+            return True
+        elif "".join(path.suffixes[-2:]) == ".pdb.gz":
+            return True
+        else:
+            return False
+    else:
+        return False
+# added by Ljj
+@typechecked
+def _pdb_entity_info(path: Union[str, pathlib.Path]) -> Entity:
+    if _is_pdb(path):
+        cur_path = pathlib.Path(path)
+        if cur_path.suffixes[-1] == ".pdb":
+            with open(path, "r") as text_io:
+                lines = text_io.readlines()
+        else:
+            with gzip.open(path, "rb") as gz_handle:
+                with io.TextIOWrapper(gz_handle, encoding="utf-8") as text_io:
+                    lines = text_io.readlines()
+    else:
+        raise ValueError("Only support .pdb or .pdb.gz file, but got %s" % path)
+    i = 0
+    for i in range(len(lines)):
+        line = lines[i]
+        record_type = line[0:6]
+        if record_type in ("ATOM  ", "HETATM", "MODEL "):
+            break
+    header = lines[0:i]
+    info = _parse_pdb_header_list(header)
+    return _molecule_information(info)
+@typechecked
+def _get_pdb_resolution(remark_lines: List[str]) -> float:
+    resolutions = []
+    for line in remark_lines:
+        tmp = re.search(r"REMARK.+RESOLUTION.+?([\d\.]+|NOT APPLICABLE)", line)
+        if tmp:
+            v = tmp.groups()[0]
+            try:
+                vf = float(v)
+            except (TypeError, ValueError):
+                continue
+            else:
+                resolutions.append(vf)
+    if resolutions:
+        return min(resolutions)
+    else:
+        return 0.0
+@typechecked
+def _compound_source_string(entity: Entity) -> List[str]:
+    entity2polymer = defaultdict(list)
+    for k, v in entity["polymer2eid"].items():
+        entity2polymer[v].append(k)
+    entity_labels = list(entity2polymer.keys())
+    entity_labels.sort()
+    values = []
+    for i, el in enumerate(entity_labels):
+        values.append(dict(mol_id=str(i + 1),
+                           chain=", ".join(entity2polymer[el]),
+                           molecule=entity["eid2desc"].get(el, "?"),
+                           organism_scientific=entity["eid2specie"].get(el, "?"),
+                           organism_taxid=entity["eid2taxid"].get(el, "?")
+                           )
+                      )
+    outputs = []
+    # compound
+    compound_mol0 = "COMPND    MOL_ID: {mol_id};"
+    compound_mol1 = "COMPND {n_line:>3} MOL_ID: {mol_id};"
+    compound_molecule = "COMPND {n_line:>3} MOLECULE: {molecule};"
+    compound_chain = "COMPND {n_line:>3} CHAIN: {chain};"
+    i = 0
+    for val in values:
+        if i == 0:
+            outputs.append(compound_mol0.format(**val))
+            i += 1
+            for c_str in [compound_molecule, compound_chain]:
+                cur_val = val.copy()
+                cur_val["n_line"] = i
+                outputs.append(c_str.format(**cur_val))
+                i += 1
+        else:
+            for c_str in [compound_mol1, compound_molecule, compound_chain]:
+                cur_val = val.copy()
+                cur_val["n_line"] = i
+                outputs.append(c_str.format(**cur_val))
+                i += 1
+    source_mol0 = "SOURCE    MOL_ID: {mol_id};"
+    source_mol1 = "SOURCE {n_line:>3} MOL_ID: {mol_id};"
+    source_scientific = "SOURCE {n_line:>3} ORGANISM_SCIENTIFIC: {organism_scientific};"
+    source_taxid = "SOURCE {n_line:>3} ORGANISM_TAXID: {organism_taxid};"
+    i = 0
+    for val in values:
+        if i == 0:
+            outputs.append(source_mol0.format(**val))
+            i += 1
+            for c_str in [source_scientific, source_taxid]:
+                cur_val = val.copy()
+                cur_val["n_line"] = i
+                outputs.append(c_str.format(**cur_val))
+                i += 1
+        else:
+            for c_str in [source_mol1, source_scientific, source_taxid]:
+                cur_val = val.copy()
+                cur_val["n_line"] = i
+                outputs.append(c_str.format(**cur_val))
+                i += 1
+    return outputs

gemmi_protools/ppi.py ADDED Viewed

@@ -0,0 +1,74 @@
+"""
+@Author: Luo Jiejian
+"""
+import pathlib
+from typing import Union, List
+import numpy as np
+from scipy.spatial import cKDTree
+from .reader import StructureParser
+def _ppi_atoms(struct, chains):
+    """
+    Load atoms for N and O of backbone and N, O, P, S of side chains, only for PPI searching
+    :param struct:
+    :param chains:
+    :return:
+    """
+    protein_atoms = ['N', 'ND1', 'ND2', 'NE', 'NE1', 'NE2', 'NH1', 'NH2', 'NZ',
+                     'O', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1', 'OH',
+                     'SD', 'SG']
+    xna_atoms = ['N1', 'N2', 'N3', 'N4', 'N6', 'N7', 'N9',
+                 'O2', "O2'", "O3'", 'O4', "O4'", "O5'", 'O6',
+                 'OP1', 'OP2', 'OP3', 'P']
+    pro_chs = []
+    xna_chs = []
+    for c in chains:
+        t = struct.chain_types.get(c, "")
+        if t == "protein":
+            pro_chs.append(c)
+        elif t in ["dna", "rna"]:
+            xna_chs.append(c)
+    pro_coord, pro_id = struct.get_atom_coords(pro_chs, protein_atoms)
+    xna_coord, xna_id = struct.get_atom_coords(xna_chs, xna_atoms)
+    return np.concatenate([pro_coord, xna_coord], axis=0), np.concatenate([pro_id, xna_id], axis=0)
+def ppi_interface_residues(in_file: Union[str, pathlib.Path],
+                           chains_x: List[str],
+                           chains_y: List[str],
+                           threshold: float = 4.0):
+    """
+    identify PPI among protein, DNA, RNA
+    :param in_file:
+    :param chains_x:
+    :param chains_y:
+    :param threshold:
+    :return:
+     PPI residues of chains_x, PPI residues of chains_y
+    """
+    st = StructureParser()
+    st.load_from_file(in_file)
+    st.set_default_model()
+    st.STRUCT.remove_alternative_conformations()
+    st.STRUCT.remove_ligands_and_waters()
+    st.STRUCT.remove_hydrogens()
+    st.STRUCT.remove_empty_chains()
+    st.update_entity()
+    x_coord, x_id = _ppi_atoms(st, chains_x)
+    y_coord, y_id = _ppi_atoms(st, chains_y)
+    kd_tree_x = cKDTree(x_coord)
+    kd_tree_y = cKDTree(y_coord)
+    pairs = kd_tree_x.sparse_distance_matrix(kd_tree_y, threshold, output_type='coo_matrix')
+    x_res = np.unique(x_id[pairs.row][["ch_name", 'res_num', 'res_icode', 'res_name']])
+    y_res = np.unique(y_id[pairs.col][["ch_name", 'res_num', 'res_icode', 'res_name']])
+    return x_res, y_res

gemmi_protools/reader.py ADDED Viewed

@@ -0,0 +1,371 @@
+"""
+@Author: Luo Jiejian
+"""
+import pathlib
+import re
+import string
+import warnings
+from copy import deepcopy
+from typing import Union, Optional, List
+import gemmi
+import numpy as np
+from typeguard import typechecked
+from .cif_opts import _cif_block_for_output, _is_cif
+from .parser import (_assign_digital_entity_names, _ent_from_structure,
+                     pdb_parser, cif_parser, _chain_type, _chain_names2one_letter,
+                     _assert_unique_chain_names_in_models, get_assembly)
+from .pdb_opts import _compound_source_string, _is_pdb
+from .struct_info import Info
+class StructureParser(object):
+    """
+    Enhance Structure reader for .cif, .cif.gz, .pdb or .pdb.gz
+    """
+    def __init__(self, structure: gemmi.Structure = None):
+        if not isinstance(structure, (type(None), gemmi.Structure)):
+            raise ValueError("structure must be gemmi.Structure or None")
+        if structure is None:
+            self.STRUCT = gemmi.Structure()
+        elif isinstance(structure, gemmi.Structure):
+            _assert_unique_chain_names_in_models(structure)
+            self.STRUCT = structure.clone()
+        else:
+            raise ValueError("structure must be gemmi.Structure or None")
+        self.STRUCT.setup_entities()
+        _assign_digital_entity_names(self.STRUCT)
+        self.INFO = Info()
+        self.INFO.from_gemmi_structure_infomap(self.STRUCT.info)
+        self.ENTITY = _ent_from_structure(self.STRUCT)
+        self.update_entity()
+    @typechecked
+    def load_from_file(self, path: Union[str, pathlib.PosixPath]):
+        if _is_pdb(path):
+            struct, entity = pdb_parser(path)
+        elif _is_cif(path):
+            struct, entity = cif_parser(path)
+        else:
+            raise ValueError("Only support .cif, .cif.gz, .pdb or .pdb.gz file, but got %s" % path)
+        _assert_unique_chain_names_in_models(struct)
+        self.STRUCT, self.ENTITY = struct, entity
+        self.INFO.from_gemmi_structure_infomap(self.STRUCT.info)
+        self.update_entity()
+    @typechecked
+    def to_pdb(self, outfile: str, write_minimal_pdb=False):
+        compound_source = _compound_source_string(self.ENTITY)
+        struct = self.STRUCT.clone()
+        struct.raw_remarks = compound_source + struct.raw_remarks
+        if write_minimal_pdb:
+            struct.write_minimal_pdb(outfile)
+        else:
+            struct.write_pdb(outfile)
+    @typechecked
+    def to_cif(self, outfile: str):
+        out_block = _cif_block_for_output(self.STRUCT, self.ENTITY)
+        out_block.write_file(outfile)
+    @property
+    def chain_ids(self):
+        vals = []
+        for m in self.STRUCT:
+            for c in m:
+                vals.append(c.name)
+        vals.sort()
+        return vals
+    @property
+    def model_numbers(self):
+        return [m.num for m in self.STRUCT]
+    @typechecked
+    def set_default_model(self, num: Optional[int] = None):
+        """
+        Set the first model as default
+        :param num:
+        :return:
+        """
+        if len(self.STRUCT) == 0:
+            raise RuntimeError("There is no model in structure")
+        keep_model = None
+        if num is None:
+            # default first model
+            keep_model = self.STRUCT[0]
+        else:
+            for model in self.STRUCT:
+                if model.num == num:
+                    keep_model = model
+                    break
+        if keep_model is None:
+            raise RuntimeError("Model %d not found in structure" % num)
+        # del, reversed order indexes
+        indexes_to_del = [i for i, model in enumerate(self.STRUCT) if model.num != keep_model.num]
+        indexes_to_del.sort(reverse=True)
+        for cur_index in indexes_to_del:
+            del self.STRUCT[cur_index]
+    @property
+    def chain_types(self):
+        return {c: _chain_type(self.STRUCT, c) for c in self.chain_ids}
+    @property
+    def assembly_names(self):
+        return [assem.name for assem in self.STRUCT.assemblies]
+    @property
+    def polymer_sequences(self):
+        cts = self.chain_types
+        out = dict()
+        for model in self.STRUCT:
+            for chain in model:
+                ct = cts.get(chain.name, "other")
+                if ct != "other":
+                    out[chain.name] = re.sub("-", "", chain.get_polymer().make_one_letter_sequence())
+        return out
+    @property
+    def polymer_residue_numbers(self):
+        cts = self.chain_types
+        out = dict()
+        id_type = np.dtype([
+            ("ch_name", "U5"),
+            ("res_num", "i4"),
+            ("res_icode", "U3"),
+            ("res_name", "U5"),
+        ])
+        for model in self.STRUCT:
+            for chain in model:
+                ct = cts.get(chain.name, "other")
+                if ct != "other":
+                    out[chain.name] = np.array([(chain.name, r.seqid.num, r.seqid.icode, r.name)
+                                                for r in chain.get_polymer()], dtype=id_type)
+        return out
+    @property
+    def chain_residues(self):
+        out = dict()
+        for model in self.STRUCT:
+            for chain in model:
+                out[chain.name] = [r.name for r in chain]
+        return out
+    def update_entity(self):
+        """
+        Update ENTITY, .entities .assemblies according to subchains
+        :return:
+        """
+        subchains = []
+        for model in self.STRUCT:
+            for chain in model:
+                subchains.extend([sc.subchain_id() for sc in chain.subchains()])
+        # update .entities
+        new_entities = gemmi.EntityList()
+        ent_names = []  # keep
+        for ent in self.STRUCT.entities:
+            tmp = [i for i in ent.subchains if i in subchains]
+            if tmp:
+                ent.subchains = tmp
+                new_entities.append(ent)
+                ent_names.append(ent.name)
+        self.STRUCT.entities = new_entities
+        # update .ENTITY
+        for super_key in ["eid2desc", "eid2specie", "eid2taxid"]:
+            for eid in list(self.ENTITY[super_key].keys()):
+                if eid not in ent_names:
+                    del self.ENTITY[super_key][eid]
+        for cid, eid in list(self.ENTITY["polymer2eid"].items()):
+            if eid not in ent_names or cid not in self.chain_ids:
+                del self.ENTITY["polymer2eid"][cid]
+        # update .assemblies
+        all_cid = self.chain_ids
+        del_assembly_indexes = []
+        for a_i, assembly in enumerate(self.STRUCT.assemblies):
+            del_gen_indexes = []
+            for g_i, gen in enumerate(assembly.generators):
+                # chains
+                tmp1 = [i for i in gen.chains if i in all_cid]
+                gen.chains = tmp1
+                tmp2 = [i for i in gen.subchains if i in subchains]
+                gen.subchains = tmp2
+                # empty gen
+                if gen.chains == [] and gen.subchains == []:
+                    del_gen_indexes.append(g_i)
+            del_gen_indexes.sort(reverse=True)
+            for dgi in del_gen_indexes:
+                del assembly.generators[dgi]
+            if len(del_gen_indexes) == len(assembly.generators):
+                del_assembly_indexes.append(a_i)
+        del_assembly_indexes.sort(reverse=True)
+        for dai in del_assembly_indexes:
+            del self.STRUCT.assemblies[dai]
+    @typechecked
+    def rename_chain(self, origin_name: str, target_name: str):
+        if origin_name not in self.chain_ids:
+            raise ValueError("chain %s not found" % origin_name)
+        other_chain_names = set(self.chain_ids) - {origin_name}
+        if target_name in other_chain_names:
+            raise ValueError("target chain name %s has existed, change to a different one." % target_name)
+        self.STRUCT.rename_chain(origin_name, target_name)
+        # update .polymer2eid if exist
+        if origin_name in self.ENTITY.polymer2eid:
+            val = self.ENTITY.polymer2eid[origin_name]
+            del self.ENTITY.polymer2eid[origin_name]
+            self.ENTITY.polymer2eid[target_name] = val
+        # update .assemblies.generator.chain if exists, for .pdb loading structure
+        for assembly in self.STRUCT.assemblies:
+            for gen in assembly.generators:
+                tmp = [target_name if c == origin_name else c for c in gen.chains]
+                gen.chains = tmp
+    @typechecked
+    def switch_chain_names(self, chain_name_1: str, chain_name_2: str):
+        if chain_name_1 not in self.chain_ids:
+            raise ValueError("chain_name_2 %s not in structure" % chain_name_1)
+        if chain_name_2 not in self.chain_ids:
+            raise ValueError("chain_name_2 %s not in structure" % chain_name_2)
+        l3 = [i + j + k for i in string.ascii_uppercase for j in string.ascii_uppercase for k in string.ascii_uppercase]
+        l3.sort(reverse=True)
+        current_names = set(self.chain_ids)
+        l3_l = [n for n in l3 if n not in current_names]
+        sw_name = l3_l.pop()
+        self.rename_chain(chain_name_1, sw_name)
+        self.rename_chain(chain_name_2, chain_name_1)
+        self.rename_chain(sw_name, chain_name_2)
+    @typechecked
+    def pick_chains(self, chain_names: List[str]):
+        self.set_default_model()
+        if chain_names:
+            missing = [c for c in chain_names if c not in self.chain_ids]
+            if missing:
+                raise ValueError("Chains %s not found" % ",".join(missing))
+            else:
+                del_chain_names = set(self.chain_ids) - set(chain_names)
+                del_chain_indexes = [i for i, ch in enumerate(self.STRUCT[0]) if ch.name in del_chain_names]
+                del_chain_indexes.sort(reverse=True)
+                for di in del_chain_indexes:
+                    del self.STRUCT[0][di]
+                self.update_entity()
+        else:
+            raise ValueError("No chain is given")
+    @typechecked
+    def make_chain_names_to_one_letter(self, only_uppercase: bool = True):
+        _mapper = _chain_names2one_letter(self.STRUCT, only_uppercase)
+        for origin_name, target_name in _mapper.items():
+            self.rename_chain(origin_name, target_name)
+        return _mapper
+    @typechecked
+    def get_assembly(self, assembly_name: str):
+        if assembly_name not in self.assembly_names:
+            raise ValueError("assembly %s is not found" % assembly_name)
+        struct, polymer2eid = get_assembly(self.STRUCT, assembly_name, gemmi.HowToNameCopiedChain.Short)
+        out = StructureParser(struct)
+        out.ENTITY = deepcopy(self.ENTITY)
+        out.ENTITY.polymer2eid = polymer2eid
+        # update info
+        prefix = "[Assembly %s] " % assembly_name
+        out.INFO.title = prefix + out.INFO.title
+        out.STRUCT.info = out.INFO.to_gemmi_structure_infomap()
+        return out
+    @typechecked
+    def merge_chains(self, chains: List[str]):
+        """
+        Merge a list of chains, target chain id is chains[0]
+        Renumber the new chain from 1
+        [No fix the Entity and some other information of structure]
+        :param chains:
+        :return:
+        GemmiLoader
+        """
+        for c in chains:
+            if c not in self.chain_ids:
+                raise RuntimeError("Chain %s is not in the structure" % c)
+        if len(self.STRUCT) > 1:
+            print("Multiple models in structure, do nothing")
+        elif len(chains) < 2:
+            print("Query chains less than 2, do nothing")
+        else:
+            new_chain = gemmi.Chain(chains[0])
+            residue_index = 1
+            model = self.STRUCT[0]
+            for ch in model:
+                if ch.name in chains:
+                    for res in ch:
+                        nr = deepcopy(res)
+                        nr.seqid.icode = " "
+                        nr.seqid.num = residue_index
+                        new_chain.add_residue(nr)
+                        residue_index += 1
+            for c in chains:
+                self.STRUCT[0].remove_chain(c)
+            self.STRUCT[0].add_chain(new_chain, unique_name=True)
+    def get_atom_coords(self, chains: List[str], atoms: Optional[List[str]] = None):
+        for c in chains:
+            if c not in self.chain_ids:
+                warnings.warn("Chain %s is not in the structure" % c)
+        coord = []
+        atom_id = []
+        id_type = np.dtype([
+            ("ch_name", "U5"),
+            ("res_num", "i4"),
+            ("res_icode", "U3"),
+            ("res_name", "U5"),
+            ("atom_name", "U5")
+        ])
+        model = self.STRUCT[0]
+        for ch in model:
+            if ch.name in chains:
+                for res in ch:
+                    for atom in res:
+                        if atoms is None or atom.name in atoms:
+                            cur_id = (ch.name, res.seqid.num, res.seqid.icode, res.name, atom.name)
+                            cur_pos = atom.pos.tolist()
+                            coord.append(cur_pos)
+                            atom_id.append(cur_id)
+        if coord:
+            return np.array(coord, dtype=np.float32), np.array(atom_id, dtype=id_type)
+        else:
+            return np.empty(shape=(0, 3), dtype=np.float32), np.array(atom_id, dtype=id_type)