PyPI - pyjess - Versions diffs - 0.5.2__pp310-pypy310_pp73-win_amd64.whl → 0.7.0__pp310-pypy310_pp73-win_amd64.whl - Mend

pyjess 0.5.2__pp310-pypy310_pp73-win_amd64.whl → 0.7.0__pp310-pypy310_pp73-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pyjess might be problematic. Click here for more details.

Files changed (21) hide show

pyjess/__main__.py +4 -0
pyjess/_jess.pyi +53 -9
pyjess/_jess.pypy310-pp73-win_amd64.pyd +0 -0
pyjess/_jess.pyx +855 -105
pyjess/cli.py +281 -0
pyjess/tests/__init__.py +2 -0
pyjess/tests/data/1AMY.cif +6259 -0
pyjess/tests/data/1sur.qry +26 -0
pyjess/tests/data/4.1.2.tpl +23 -0
pyjess/tests/data/5ayx.EF.pdb +63 -0
pyjess/tests/test_doctest.py +78 -0
pyjess/tests/test_hit.py +26 -2
pyjess/tests/test_jess.py +124 -3
pyjess/tests/test_molecule.py +146 -0
pyjess/tests/test_template.py +10 -1
{pyjess-0.5.2.dist-info → pyjess-0.7.0.dist-info}/METADATA +76 -15
pyjess-0.7.0.dist-info/RECORD +34 -0
pyjess-0.7.0.dist-info/entry_points.txt +3 -0
pyjess-0.5.2.dist-info/RECORD +0 -26
{pyjess-0.5.2.dist-info → pyjess-0.7.0.dist-info}/WHEEL +0 -0
{pyjess-0.5.2.dist-info → pyjess-0.7.0.dist-info}/licenses/COPYING +0 -0

pyjess/_jess.pyx CHANGED Viewed

@@ -2,6 +2,73 @@
 # cython: language_level=3, linetrace=True, binding=True
 """Bindings to Jess, a 3D template matching software.
+Jess is an algorithm for constraint-based structural template matching
+proposed by Jonathan Barker *et al.*. It can be used to identify
+catalytic residues from a known template inside a protein structure.
+Jess is an evolution of TESS, a geometric hashing algorithm developed by
+Andrew Wallace *et al.*, removing some pre-computation and
+structural requirements from the original algorithm.
+PyJess is a Python module that provides bindings to Jess using
+`Cython <https://cython.org/>`_. It allows creating templates, querying
+them with protein structures, and retrieving the hits using a Python API
+without performing any external I/O. It's also more than 10x faster than
+Jess thanks to algorithmic optimizations added to improve the original Jess
+code while producing consistent results.
+Example:
+    Load templates from a file, either as a file-like object or
+    given a filename::
+        >>> t1 = pyjess.Template.load("1.3.3.tpl")  # load from filename
+        >>> with open("4.1.2.tpl") as f:            # load from a file object
+        ...     t2 = pyjess.Template.load(f)
+    Load molecules from a file, either as a file-like object or given
+    a filename::
+        >>> mol = pyjess.Molecule.load("1AMY.pdb")
+        >>> mol[0]
+        Atom(serial=1, name='N', altloc=' ', residue_name='GLN', ...)
+    Create a `Jess` object storing the templates to support running
+    queries on them. The individual templates can still be accessed by
+    index::
+        >>> jess = pyjess.Jess([t1, t2])
+        >>> jess[0].id
+        '3r6v'
+    Run a query on the Jess object to retrieve all templates matching
+    a `Molecule`, *in no particular order*::
+        >>> hits = jess.query(mol, 2, 2, 2)
+        >>> for hit in hits:
+        ...     print(hit.template.id, hit.rmsd)
+        2om2 1.4386...
+        2om2 1.4877...
+        2om2 1.4376...
+        2om2 1.5284...
+        2om2 1.4863...
+        2om2 1.4369...
+        2om2 1.4790...
+        2om2 1.1414...
+        2om2 1.0755...
+        2om2 1.1973...
+        2om2 1.1353...
+        2om2 1.0711...
+        2om2 1.1494...
+    By default, a template can match a molecule in more than one way,
+    if several sets of atoms match the geometric constraints. Use the
+    ``best_match`` argument of `~Jess.query` to only retrieve the
+    best match per template::
+        >>> hits = jess.query(mol, 2, 2, 2, best_match=True)
+        >>> for hit in hits:
+        ...     print(hit.template.id, hit.rmsd)
+        2om2 1.071...
 References:
     - Barker, J. A., & Thornton, J. M. (2003). *An algorithm for
       constraint-based structural template matching: application to
@@ -18,10 +85,16 @@ References:
 # --- C imports --------------------------------------------------------------
 cimport cython
-from cpython.unicode cimport PyUnicode_FromStringAndSize
+from cpython.exc cimport PyErr_WarnEx
+from cpython.unicode cimport (
+    PyUnicode_FromStringAndSize,
+    PyUnicode_FromFormat,
+    PyUnicode_AsASCIIString,
+)
 from libc.math cimport isnan, exp, INFINITY, NAN
-from libc.stdio cimport FILE, fclose, fdopen, printf
+from libc.stdio cimport FILE, fclose, fdopen, printf, sprintf
+from libc.stdint cimport uintptr_t
 from libc.stdlib cimport calloc, realloc, free, malloc
 from libc.string cimport memcpy, memset, strncpy, strdup
@@ -37,24 +110,20 @@ from jess.jess cimport Jess as _Jess
 from jess.jess cimport JessQuery as _JessQuery
 from jess.molecule cimport Molecule as _Molecule
 from jess.super cimport Superposition as _Superposition
-from jess.template cimport Template as _Template
+from jess.template cimport Template as _Template, IgnoreType as _IgnoreType
 from jess.tess_template cimport TessTemplate as _TessTemplate
 from jess.tess_atom cimport TessAtom as _TessAtom
 # --- Python imports ---------------------------------------------------------
-import contextlib
 import functools
 import io
-import itertools
-import os
-import warnings
 __version__ = PROJECT_VERSION
 # --- Utils ------------------------------------------------------------------
-cdef inline void copy_token(char* dst, const char* src, size_t n) noexcept nogil:
+cdef inline void encode_token(char* dst, const char* src, size_t n) noexcept nogil:
     cdef size_t i
     for i in range(n):
         if src[i] == ord(' ') or src[i] == 0:
@@ -63,12 +132,155 @@ cdef inline void copy_token(char* dst, const char* src, size_t n) noexcept nogil
             dst[i] = src[i]
     dst[n] = 0
-@contextlib.contextmanager
-def nullcontext(return_value=None):
-    yield return_value
+cdef inline void decode_token(char* dst, const char* src, size_t n) noexcept nogil:
+    cdef size_t i
+    for i in range(n):
+        if src[i] == ord('_') or src[i] == 0:
+            dst[i] = ord(' ')
+        else:
+            dst[i] = src[i]
+    dst[n] = 0
+class nullcontext:
+    def __init__(self, return_value=None):
+        self.retval = return_value
+    def __enter__(self):
+        return self.retval
+    def __exit__(self, exc_type, exc_value, traceback):
+        return False
 # --- Classes ----------------------------------------------------------------
+cdef class _MoleculeParser:
+    cdef str id
+    def __init__(self, str id = None):
+        self.id = id
+cdef class _PDBMoleculeParser(_MoleculeParser):
+    cdef bint ignore_endmdl
+    cdef bint skip_hetatm
+    def __init__(self, str id = None, bint ignore_endmdl = False, bint skip_hetatm = False):
+        super().__init__(id=id)
+        self.ignore_endmdl = ignore_endmdl
+        self.skip_hetatm = skip_hetatm
+    def loads(self, text, molecule_type):
+        return self.load(io.StringIO(text), molecule_type)
+    def load(self, file, molecule_type):
+        cdef str  line
+        cdef str  id    = self.id
+        cdef list atoms = []
+        try:
+            handle = open(file)
+        except TypeError:
+            handle = nullcontext(file)
+        with handle as f:
+            for line in f:
+                if line.startswith("HEADER"):
+                    if id is None:
+                        id = line[62:66].strip() or None
+                elif line.startswith("ATOM"):
+                    atoms.append(Atom.loads(line))
+                elif line.startswith("HETATM") and not self.skip_hetatm:
+                    atoms.append(Atom.loads(line))
+                elif line.startswith("ENDMDL"):
+                    if not self.ignore_endmdl:
+                        break
+                elif line.lower().startswith(("data_", "loop_")):
+                    raise ValueError("mmCIF data tags found, file is not in PDB format")
+        return molecule_type(atoms, id=id)
+cdef class _CIFMoleculeParser(_MoleculeParser):
+    cdef object gemmi
+    cdef bint use_author
+    cdef bint skip_hetatm
+    _PRIMARY_COLUMNS = [
+        'id', 'type_symbol', 'label_atom_id', 'label_alt_id', 'label_comp_id',
+        'label_asym_id', 'label_seq_id', '?pdbx_PDB_ins_code', 'Cartn_x',
+        'Cartn_y', 'Cartn_z', 'occupancy', 'B_iso_or_equiv',
+        '?pdbx_formal_charge', '?group_PDB',
+    ]
+    _AUTH_COLUMNS = [
+        'id', 'type_symbol', 'auth_atom_id', 'label_alt_id', 'auth_comp_id',
+        'auth_asym_id', 'auth_seq_id', '?pdbx_PDB_ins_code', 'Cartn_x',
+        'Cartn_y', 'Cartn_z', 'occupancy', 'B_iso_or_equiv',
+        '?pdbx_formal_charge', '?group_PDB',
+    ]
+    def __init__(self, str id = None, bint use_author = False, bint skip_hetatm = False):
+        super().__init__(id=id)
+        self.gemmi = __import__('gemmi')
+        self.use_author = use_author
+        self.skip_hetatm = skip_hetatm
+    def _load_block(self, document, molecule_type):
+        block = document.sole_block()
+        cols = self._AUTH_COLUMNS if self.use_author else self._PRIMARY_COLUMNS
+        table = block.find('_atom_site.', cols)
+        max_residue_number = 0
+        if not table:
+            raise ValueError("missing columns in CIF files")
+        atoms = []
+        for row in table:
+            if row[14] != "ATOM" and (row[14] != "HETATM" or self.skip_hetatm):
+                continue
+            if row[6] == "." and row[14] == "HETATM":
+                PyErr_WarnEx(
+                    UserWarning,
+                    "HETATM line found without residue number. Consider "
+                    "parsing with use_author=True to use author-defined "
+                    "residue numbers, or skip_hetatm=True to disable "
+                    "parsing of HETATM altogether.",
+                    3,
+                )
+                residue_number = max_residue_number
+                max_residue_number += 1
+            else:
+                residue_number = int(row[6])
+                max_residue_number = max(residue_number, max_residue_number)
+            atom = Atom(
+                serial=int(row[0]),
+                element=row[1],
+                name=row[2],
+                altloc=' ' if row[3] == "." else row[3], # FIXME: replace with None?
+                residue_name=row[4],
+                chain_id=row[5],
+                residue_number=residue_number,
+                insertion_code=' ' if not row.has(7) or row[7] == "?" else row[7],
+                x=float(row[8]),
+                y=float(row[9]),
+                z=float(row[10]),
+                occupancy=0.0 if row[11] == '.' else float(row[11]),
+                temperature_factor=float(row[12]),
+                charge=0 if not row.has(13) or row[13] == "?" else int(row[13]),
+            )
+            atoms.append(atom)
+        id = block.name if self.id is None else self.id
+        return molecule_type(atoms, id=id)
+    def loads(self, text, molecule_type):
+        document = self.gemmi.cif.read_string(text)
+        return self._load_block(document, molecule_type)
+    def load(self, file, molecule_type):
+        if hasattr(file, "read"):
+            document = self.gemmi.cif.read_string(file.read())
+        else:
+            document = self.gemmi.cif.read_file(file)
+        return self._load_block(document, molecule_type)
 cdef class Molecule:
     """A molecule structure, as a sequence of `Atom` objects.
@@ -83,20 +295,40 @@ cdef class Molecule:
     cdef str        _id
     @classmethod
-    def loads(cls, text, str id = None, bint ignore_endmdl = False):
+    def loads(
+        cls,
+        text,
+        str format = "pdb",
+        *,
+        str id = None,
+        bint ignore_endmdl = False,
+        bint use_author = False,
+        bint skip_hetatm = False,
+    ):
         """Load a molecule from a PDB string.
         Arguments:
-            file (`str`, `os.PathLike`, or file-like object): Either the path
-                to a file, or a file-like object opened in **text mode**
-                containing a PDB molecule.
+            text (`str`): The serialized molecule to parse into a new
+                object.
+            format (`str`): The format to parse the text. Supported formats
+                are: ``pdb`` for the Protein Data Bank format (the default),
+                ``cif`` for Crystallographic Information File format
+                (additionally requires the `gemmi` module), or ``detect``
+                to attempt auto-detection.
+        Keyword Arguments:
             id (`str`, optional): The identifier of the molecule. If `None`
                 given, the parser will attempt to extract it from the
-                ``HEADER`` line.
+                ``HEADER`` line (for PDB files) or the block name (for CIF
+                files).
             ignore_endmdl (`bool`): Pass `True` to make the parser read all
                 the atoms from the PDB file. By default, the parser only
                 reads the atoms of the first model, and stops at the first
-                ``ENDMDL`` line.
+                ``ENDMDL`` line. *Ignored for CIF files*.
+            use_author (`bool`): Pass `True` to use the author-defined
+                labels while parsing CIF files, e.g. read the chain name
+                from ``_atom_site.auth_asym_id`` rather than
+                ``_atom_site.label_asym_id``. *Ignored for PDB files*.
         Returns:
             `~pyjess.Molecule`: The molecule parsed from the PDB file.
@@ -105,46 +337,264 @@ cdef class Molecule:
             `Molecule.load` to load a PDB molecule from a file-like
             object or from a path.
+        Caution:
+            Parsing from a PDB file retains the heteroatoms (``HETATM`` lines)
+            while parsing from mmCIF usually discards them. This is because
+            mmCIF files store heteroatoms but do not require them to
+            have an associated residue number, which can throw off the way
+            atoms are modeled in Jess.
+        .. versionadded:: 0.7.0
+            The ``format`` argument, and support for CIF parsing.
         """
-        return cls.load(io.StringIO(text), id=id, ignore_endmdl=ignore_endmdl)
+        if format == "detect":
+            format = "cif" if text.lstrip().startswith(("data_", "loop_")) else "pdb"
+        return cls.load(
+            io.StringIO(text),
+            format=format,
+            id=id,
+            ignore_endmdl=ignore_endmdl,
+            skip_hetatm=skip_hetatm,
+        )
     @classmethod
-    def load(cls, file, str id = None, bint ignore_endmdl = False):
+    def load(
+        cls,
+        file,
+        str format = "detect",
+        *,
+        str id = None,
+        bint ignore_endmdl = False,
+        bint use_author = False,
+        bint skip_hetatm = False,
+    ):
         """Load a molecule from a PDB file.
         Arguments:
             file (`str`, `os.PathLike`, or file-like object): Either the path
                 to a file, or a file-like object opened in **text mode**
-                containing a PDB molecule.
+                containing a molecule.
+            format (`str`): The format to parse the file. Supported formats
+                are: ``pdb`` for the Protein Data Bank format, ``cif``
+                for Crystallographic Information File format (additionally
+                requires the `gemmi` module), or ``detect`` to attempt
+                auto-detection (the default).
+        Keyword Arguments:
             id (`str`, optional): The identifier of the molecule. If `None`
                 given, the parser will attempt to extract it from the
-                ``HEADER`` line.
+                ``HEADER`` line (for PDB files) or the block name (for CIF
+                files).
             ignore_endmdl (`bool`): Pass `True` to make the parser read all
                 the atoms from the PDB file. By default, the parser only
                 reads the atoms of the first model, and stops at the first
-                ``ENDMDL`` line.
+                ``ENDMDL`` line. *Ignored for CIF files*.
+            use_author (`bool`): Pass `True` to use the author-defined
+                labels while parsing CIF files, e.g. read the chain name
+                from ``_atom_site.auth_asym_id`` rather than
+                ``_atom_site.label_asym_id``. *Ignored for PDB files*.
+            skip_hetatm (`bool`): Pass `True` to skip parsing of heteroatoms
+                (``HETATM``) in the input file.
         Returns:
             `~pyjess.Molecule`: The molecule parsed from the PDB file.
+        See Also:
+            `Molecule.loads` to load a PDB molecule from a string.
+        Caution:
+            Parsing from a PDB file retains the heteroatoms (``HETATM`` lines)
+            while parsing from mmCIF usually discards them. This is because
+            mmCIF files store heteroatoms but do not require them to
+            have an associated residue number, which can throw off the way
+            atoms are modeled in Jess.
+        .. versionadded:: 0.7.0
+            The ``format`` and ``skip_hetatm`` arguments, and mmCIF support.
         """
-        try:
-            handle = open(file)
-        except TypeError:
-            handle = nullcontext(file)
-        with handle as f:
-            atoms = []
-            for line in f:
-                if line.startswith("HEADER"):
-                    if id is None:
-                        id = line[62:66].strip() or None
-                elif line.startswith(("ATOM", "HETATM")):
-                    atoms.append(Atom.loads(line))
-                elif line.startswith("ENDMDL"):
-                    if not ignore_endmdl:
-                        break
+        cdef _MoleculeParser parser
+        cdef str             peek
+        if format == "detect":
+            try:
+                handle = open(file)
+            except TypeError:
+                handle = nullcontext(file)
+            with handle as f:
+                if f.seekable():
+                    peek = f.read(5)
+                    f.seek(0)
+                else:
+                    f = f.read()
+                    peek = f[5:]
+                if peek.startswith(("data_", "loop_")):
+                    parser = _CIFMoleculeParser(
+                        id=id,
+                        use_author=use_author,
+                        skip_hetatm=skip_hetatm,
+                    )
+                else:
+                    parser = _PDBMoleculeParser(
+                        id=id,
+                        ignore_endmdl=ignore_endmdl,
+                        skip_hetatm=skip_hetatm,
+                    )
+                if isinstance(f, str):
+                    return parser.loads(f, molecule_type=cls)
+                return parser.load(f, molecule_type=cls)
+        if format == "pdb":
+            parser = _PDBMoleculeParser(
+                id=id,
+                ignore_endmdl=ignore_endmdl,
+                skip_hetatm=skip_hetatm
+            )
+        elif format == "cif":
+            parser = _CIFMoleculeParser(
+                id=id,
+                use_author=use_author,
+                skip_hetatm=skip_hetatm,
+            )
+        else:
+            raise ValueError(f"invalid value for `format` argument: {format!r}")
+        return parser.load(file, molecule_type=cls)
+    @classmethod
+    def from_biopython(cls, object structure, str id = None):
+        """Create a new `~pyjess.Molecule` from a `Bio.PDB.Structure`.
+        Arguments:
+            structure (`Bio.PDB.Structure` or `Bio.PDB.Model`): The
+                Biopython object containing the structure data.
+            id (`str` or `None`): The identifier to give to the newly
+                created molecule. If `None` given, will use the value of
+                ``structure.id``.
+        Returns:
+            `~pyjess.Molecule`: A molecule object suitable for using
+            in `Jess.query`.
+        .. versionadded:: 0.7.0
+        """
+        cdef list atoms = []
+        for c in structure.get_chains():
+            for r in c.get_residues():
+                _, residue_number, insertion_code = r.id
+                for a in r.get_atoms():
+                    coord = a.get_coord()
+                    atom = Atom(
+                        name=a.fullname,
+                        x=coord[0],
+                        y=coord[1],
+                        z=coord[2],
+                        altloc=a.altloc,
+                        charge=a.pqr_charge or 0,
+                        occupancy=a.occupancy,
+                        serial=a.serial_number,
+                        residue_name=r.resname,
+                        residue_number=residue_number,
+                        segment=r.segid,
+                        insertion_code=insertion_code,
+                        chain_id=c.id,
+                        temperature_factor=a.bfactor,
+                        element=a.element,
+                    )
+                    atoms.append(atom)
+        return cls(atoms, id=structure.id)
+    @classmethod
+    def from_gemmi(cls, object model, str id=None):
+        """Create a new `~pyjess.Molecule` from a `gemmi.Model`.
+        Arguments:
+            model (`gemmi.Model`): The ``gemmi`` object
+                containing the structure data.
+            id (`str` or `None`): The identifier to give to the newly
+                created molecule.
+        Returns:
+            `~pyjess.Molecule`: A molecule object suitable for using
+            in `Jess.query`.
+        .. versionadded:: 0.7.0
+        """
+        cdef list atoms = []
+        for cra in model.all():
+            a = cra.atom
+            r = cra.residue
+            c = cra.chain
+            atom = Atom(
+                name=a.padded_name(),
+                x=a.pos[0],
+                y=a.pos[1],
+                z=a.pos[2],
+                altloc=' ' if a.altloc == '\0' else a.altloc,
+                charge=a.charge,
+                element=a.element.name.upper(),
+                occupancy=a.occ,
+                temperature_factor=a.b_iso,
+                serial=a.serial,
+                segment=r.segment,
+                residue_name=r.name,
+                residue_number=r.seqid.num,
+                chain_id=c.name,
+                insertion_code=r.seqid.icode,
+            )
+            atoms.append(atom)
         return cls(atoms, id=id)
+    @classmethod
+    def from_biotite(cls, object atom_array, str id=None):
+        """Create a new `~pyjess.Molecule` from a `biotite.structure.AtomArray`.
+        Arguments:
+            atom_array (`biotite.structure.AtomArray`): The ``biotite``
+                object containing the structure data.
+        Returns:
+            `~pyjess.Molecule`: A molecule object suitable for using
+            in `Jess.query`.
+        Caution:
+            If loading data with the `biotite.structure.io.pdb.PDBFile` module,
+            ensure that you are requesting all atoms and all extra fields
+            in `~biotite.structure.io.pdb.PDBFile.get_structure`::
+                pdb_file = PDBFile.read("data/1AMY.pdb")
+                structure = pdb_file.get_structure(
+                    altloc="all",
+                    extra_fields=["atom_id", "b_factor", "occupancy", "charge"],
+                )
+                molecule = Molecule.from_biotite(structure[0])
+        .. versionadded:: 0.7.0
+        """
+        cdef list atoms = []
+        for a in atom_array:
+            atom = Atom(
+                name=str(a.atom_name),
+                x=a.coord[0],
+                y=a.coord[1],
+                z=a.coord[2],
+                altloc=str(getattr(a, 'altloc', ' ')),
+                charge=getattr(a, 'charge', 0),
+                element=str(a.element),
+                occupancy=getattr(a, 'occupancy', 1.0),
+                temperature_factor=a.b_factor,
+                serial=a.atom_id,
+                segment=str(getattr(a, 'segment', '')),
+                residue_name=str(a.res_name),
+                residue_number=a.res_id,
+                chain_id=str(a.chain_id),
+                insertion_code=str(a.ins_code).ljust(1),
+            )
+            atoms.append(atom)
+        return cls(atoms)
     def __cinit__(self):
         self._mol = NULL
@@ -249,17 +699,32 @@ cdef class Molecule:
         return self._id
     cpdef Molecule conserved(self, double cutoff = 0.0):
+        """Get a molecule containing only a subset of conserved atoms.
+        Arguments:
+            cutoff (`float`): The conservation cutoff for atoms. Atoms
+                with a `~Atom.temperature_factor` lower than this value
+                will be removed from the result.
+        Returns:
+            `~pyjess.Molecule`: A new molecule with atoms below the
+            conservation cutoff removed.
+        """
         assert self._mol is not NULL
-        cdef Atom atom
-        return type(self)(
-            id=self.id,
-            atoms=[
-                atom
-                for atom in self
-                if cutoff <= 0.0
-                or atom._atom.tempFactor >= cutoff
-            ]
-        )
+        cdef size_t i
+        cdef list   atoms
+        if cutoff <= 0.0:
+            return self.copy()
+        atoms = []
+        for i in range(self._mol.count):
+            if self._mol.atom[i].tempFactor >= cutoff:
+                atoms.append(self[i])
+        return type(self)(id=self.id, atoms=atoms)
     cpdef Molecule copy(self):
         """Create a copy of this molecule and its atoms.
@@ -329,8 +794,9 @@ cdef class Atom:
                 atom metadata from.
         """
-        cdef bytearray b
-        cdef Atom      atom
+        cdef const unsigned char* s
+        cdef bytearray            b
+        cdef Atom                 atom
         if isinstance(text, str):
             b = bytearray(text, 'utf-8')
@@ -339,14 +805,15 @@ cdef class Atom:
         if not b.endswith(b'\n'):
             b.append(b'\n')
         b.append(b'\0')
+        s = b
         atom = cls.__new__(cls)
-        atom._atom = <_Atom*> malloc(sizeof(_Atom))
-        if atom._atom == NULL:
-            raise MemoryError("Failed to allocate atom")
-        if not jess.atom.Atom_parse(atom._atom, b):
-            raise ValueError(f"Failed to parse atom: {text!r}")
+        with nogil:
+            atom._atom = <_Atom*> malloc(sizeof(_Atom))
+            if atom._atom == NULL:
+                raise MemoryError("Failed to allocate atom")
+            if not jess.atom.Atom_parse(atom._atom, <const char*> s):
+                raise ValueError(f"Failed to parse atom: {text!r}")
         return atom
@@ -364,21 +831,21 @@ cdef class Atom:
         *,
         int serial,
         str name,
-        str altloc,
         str residue_name,
         str chain_id,
         int residue_number,
-        str insertion_code,
         double x,
         double y,
         double z,
         double occupancy = 0.0,
         double temperature_factor = 0.0,
+        str altloc = ' ',
+        str insertion_code = ' ',
         str segment = '',
         str element = '',
         int charge = 0,
     ):
-        """__init__(self, *, serial, name, altloc, residue_name, chain_id, residue_number, insertion_code, x, y, z, occupancy=0.0, temperature_factor=0.0, segment='', element='', charge=0)\n--\n
+        """__init__(self, *, serial, name, residue_name, chain_id, residue_number, x, y, z, occupancy=0.0, temperature_factor=0.0, altloc=' ', insertion_code=' ', segment='', element='', charge=0)\n--\n
         Create a new atom.
@@ -390,11 +857,16 @@ cdef class Atom:
                 long.
         """
+        cdef bytearray _name
+        cdef bytes     _residue_name
+        cdef bytes     _segment
+        cdef bytes     _element
         if len(name) > 4:
             raise ValueError(f"Invalid atom name: {name!r}")
         if len(residue_name) > 3:
             raise ValueError(f"Invalid residue name: {residue_name!r}")
-        if len(segment) > 3:
+        if len(segment) > 4:
             raise ValueError(f"Invalid segment: {segment!r}")
         if len(element) > 2:
             raise ValueError(f"Invalid element: {element!r}")
@@ -405,6 +877,10 @@ cdef class Atom:
         if self._atom is NULL:
             raise MemoryError("Failed to allocate atom")
+        _residue_name = PyUnicode_AsASCIIString(residue_name)
+        _segment = PyUnicode_AsASCIIString(segment)
+        _element = PyUnicode_AsASCIIString(element)
         self._atom.serial = serial
         self._atom.altLoc = ord(altloc)
         self._atom.chainID1 = ord(chain_id[0]) if len(chain_id) > 0 else 0
@@ -417,14 +893,15 @@ cdef class Atom:
         self._atom.occupancy = occupancy
         self._atom.tempFactor = temperature_factor
         self._atom.charge = charge
-        copy_token(self._atom.resName, residue_name.encode('ascii').ljust(3, b'\0'), 3)
-        copy_token(self._atom.segID, segment.encode('ascii').ljust(3, b'\0'), 3)
-        copy_token(self._atom.element, element.encode('ascii').ljust(2, b'\0'), 2)
+        encode_token(self._atom.resName, _residue_name.ljust(3, b'\0'), 3)
+        encode_token(self._atom.segID, _segment.ljust(4, b'\0'), 4)
+        encode_token(self._atom.element, _element.ljust(2, b'\0'), 2)
+        # FIXME: is alignment proper?
         _name = bytearray(name, 'ascii')
         if len(_name) < 4:
             _name.insert(0, ord('_'))
-        copy_token(self._atom.name, _name.ljust(4, b'\0'), 4)
+        encode_token(self._atom.name, _name.ljust(4, b'\0'), 4)
     def __copy__(self):
         return self.copy()
@@ -517,7 +994,7 @@ cdef class Atom:
         """`str`: The segment identifier.
         """
         assert self._atom is not NULL
-        return self._atom.segID[:3].decode('ascii').strip('_')
+        return self._atom.segID[:4].decode('ascii').strip('_')
     @property
     def element(self):
@@ -538,7 +1015,7 @@ cdef class Atom:
         """`str`: The identifier of the chain the atom belongs to.
         """
         assert self._atom is not NULL
-        return "{}{}".format(chr(self._atom.chainID1), chr(self._atom.chainID2)).strip()
+        return PyUnicode_FromFormat("%c%c", self._atom.chainID1, self._atom.chainID2).strip()
     @property
     def occupancy(self):
@@ -563,16 +1040,22 @@ cdef class Atom:
     @property
     def x(self):
+        """`float`: The atom coordinate in the 1st dimension.
+        """
         assert self._atom is not NULL
         return self._atom.x[0]
     @property
     def y(self):
+        """`float`: The atom coordinate in the 2nd dimension.
+        """
         assert self._atom is not NULL
         return self._atom.x[1]
     @property
     def z(self):
+        """`float`: The atom coordinate in the 3rd dimension.
+        """
         assert self._atom is not NULL
         return self._atom.x[2]
@@ -734,18 +1217,19 @@ cdef class TemplateAtom:
                 _name = bytearray(name, 'ascii')
             else:
                 _name = bytearray(name)
+            # FIXME: is alignment proper?
             if len(_name) > 4:
                 raise ValueError(f"Invalid atom name: {name!r}")
-            elif len(_name) < 3:
+            elif len(_name) <= 3:
                 _name.insert(0, ord('_'))
-            copy_token(self._atom.name[m], _name.ljust(4, b'\0'), 4)
+            encode_token(self._atom.name[m], _name.ljust(4, b'\0'), 4)
         # copy residue names
         for m, name in enumerate(residue_names):
             _name = name.encode('ascii') if isinstance(name, str) else name
             if len(_name) > 3:
                 raise ValueError(f"Invalid residue name: {name!r}")
-            copy_token(self._atom.resName[m], _name.ljust(3, b'\0'), 3)
+            encode_token(self._atom.resName[m], _name.ljust(3, b'\0'), 3)
     cdef dict _state(self):
         return {
@@ -819,7 +1303,7 @@ cdef class TemplateAtom:
         assert self._atom is not NULL
         cdef char c1 = jess.tess_atom.TessAtom_chainID1(self._atom)
         cdef char c2 = jess.tess_atom.TessAtom_chainID2(self._atom)
-        return "{}{}".format(chr(c1), chr(c2)).strip()
+        return PyUnicode_FromFormat("%c%c", c1, c2).strip()
     @property
     def x(self):
@@ -893,7 +1377,10 @@ cdef class TemplateAtom:
         .. versionadded:: 0.4.0
         """
-        return type(self)(**self._state())
+        cdef TemplateAtom atom = TemplateAtom.__new__(TemplateAtom)
+        with nogil:
+            atom._atom = jess.tess_atom.TessAtom_copy(self._atom)
+        return atom
 cdef class Template:
@@ -946,12 +1433,13 @@ cdef class Template:
             `~pyjess.Template`: The template parsed from the given file.
         """
+        cdef str  line
+        cdef list atoms = []
         try:
             handle = open(file)
         except TypeError:
             handle = nullcontext(file)
         with handle as f:
-            atoms = []
             for line in f:
                 if line.startswith("ATOM"):
                     atoms.append(TemplateAtom.loads(line))
@@ -1048,7 +1536,14 @@ cdef class Template:
                 self._tess.distance[j][i] = dist
         # compute dimension
-        residues = { self._tess.atom[i].resSeq for i in range(count) }
+        residues = {
+            (
+                self._tess.atom[i].resSeq ,
+                self._tess.atom[i].chainID1,
+                self._tess.atom[i].chainID2,
+            )
+            for i in range(count)
+        }
         self._tess.dim = len(residues)
     def __copy__(self):
@@ -1131,6 +1626,8 @@ cdef class Template:
     @property
     def id(self):
+        """`str` or `None`: An identifier for the template, if any.
+        """
         assert self._tpl is not NULL
         cdef const char* name = self._tpl.name(self._tpl)
@@ -1146,11 +1643,18 @@ cdef class Template:
         return self._tess.dim
     cpdef Template copy(self):
-        return Template(
-            self,
-            self.id
-        )
+        """Create a copy of the template.
+        Returns:
+            `~pyjess.Template`: A new template object with identical
+            attributes and a copy of the `TemplateAtom` it contains.
+        """
+        cdef Template tpl = Template.__new__(Template)
+        with nogil:
+            tpl._tpl = self._tpl.copy(self._tpl)
+            tpl._tess = <_TessTemplate*> &tpl._tpl[1]
+        return tpl
 cdef class Query:
     """A query over templates with a given molecule.
@@ -1166,10 +1670,6 @@ cdef class Query:
             the templates.
         rmsd_threshold (`float`): The RMSD threshold for reporting
             results.
-        max_candidates (`int`): The maximum number of candidate hits
-            to report.
-        ignore_chain (`bool`): Whether to check or ignore the chain of
-            the atoms to match.
         best_match (`bool`): Whether the query will return only the
             best match to each template.
@@ -1177,18 +1677,20 @@ cdef class Query:
     cdef _JessQuery* _jq
     cdef bint        _partial
     cdef int         _candidates
+    cdef uintptr_t   _prev_tpl
+    cdef int         _max_candidates
+    cdef _IgnoreType _ignore_chain
     cdef readonly Jess     jess
     cdef readonly Molecule molecule
-    cdef readonly bint     ignore_chain
     cdef readonly bint     best_match
     cdef readonly double   rmsd_threshold
-    cdef readonly int      max_candidates
     def __cinit__(self):
+        # Start from an empty state: no underlying query handle, no candidate
+        # counted yet, no rewound result waiting to be consumed, and no
+        # previously seen template (its address is tracked as an integer,
+        # with 0 standing for "none").
         self._jq = NULL
         self._candidates = 0
         self._partial = False
+        self._prev_tpl = 0
     def __dealloc__(self):
         jess.jess.JessQuery_free(self._jq)
@@ -1196,11 +1698,48 @@ cdef class Query:
     def __iter__(self):
+        # a `Query` is its own iterator
         return self
+    @property
+    def ignore_chain(self):
+        """`str` or `None`: How atom chains are taken into account.
+        Either ``"residues"``, ``"atoms"``, or `None` when chains are
+        not ignored.
+        """
+        if self._ignore_chain == _IgnoreType.ignoreResidues:
+            return "residues"
+        if self._ignore_chain == _IgnoreType.ignoreAtoms:
+            return "atoms"
+        # `ignoreNone` (like any value outside the two modes above) is
+        # reported as `None`
+        return None
+    @ignore_chain.setter
+    def ignore_chain(self, ignore_chain):
+        # `None` is tested by identity first, so that it never reaches the
+        # string comparisons below
+        if ignore_chain is None:
+            self._ignore_chain = _IgnoreType.ignoreNone
+            return
+        if ignore_chain == "residues":
+            self._ignore_chain = _IgnoreType.ignoreResidues
+            return
+        if ignore_chain == "atoms":
+            self._ignore_chain = _IgnoreType.ignoreAtoms
+            return
+        # anything else is not a known mode
+        raise ValueError(f"invalid value for `ignore_chain`: {ignore_chain!r}")
+    @property
+    def max_candidates(self):
+        """`int` or `None`: The maximum number of candidate hits to report *by template*.
+        `None` is returned when no limit is set.
+        """
+        # the absence of a limit is stored internally as the sentinel -1
+        if self._max_candidates == -1:
+            return None
+        return self._max_candidates
+    @max_candidates.setter
+    def max_candidates(self, max_candidates):
+        # `None` (no limit) is stored as the sentinel -1
+        if max_candidates is None:
+            self._max_candidates = -1
+            return
+        # only non-negative limits are accepted
+        if not (max_candidates >= 0):
+            raise ValueError(f"invalid value for `max_candidates` argument: {max_candidates!r}")
+        self._max_candidates = max_candidates
     cdef bint _advance(self) noexcept nogil:
         if self._partial:
             self._partial = False
             return True
-        return jess.jess.JessQuery_next(self._jq, self.ignore_chain)
+        return jess.jess.JessQuery_next(self._jq, self._ignore_chain)
     cdef bint _rewind(self) noexcept nogil:
         self._partial = True
@@ -1244,10 +1783,11 @@ cdef class Query:
         # search the next hit without the GIL to allow parallel queries.
         with nogil:
-            while self._advance() and self._candidates < self.max_candidates:
+            while self._advance():
                 # load current iteration template, and check that the hit
                 # was obtained with the current template and not with the
                 # previous one
+                self._prev_tpl = <uintptr_t> tpl
                 tpl = jess.jess.JessQuery_template(self._jq)
                 if hit_found and hit_tpl != tpl:
                     self._rewind()
@@ -1274,10 +1814,10 @@ cdef class Query:
                     if nan:
                         with gil:
-                            warnings.warn(
-                                "Jess returned a superposition matrix with NaN values",
+                            PyErr_WarnEx(
                                 UserWarning,
-                                stacklevel=2,
+                                "Jess returned a superposition matrix with NaN values",
+                                2,
                             )
                     else:
                         self._copy_atoms(tpl, hit)
@@ -1286,9 +1826,21 @@ cdef class Query:
                         hit_tpl = tpl
                         hit_found = True
-                # free superposition items that are not used in a hit, and
-                # return hits immediately if we are not in best match mode
-                self._candidates += 1
+                # check if we already made it to the next template,
+                # or if we need to short-circuit the iteration and
+                # force the query to move to the next template as
+                # we found too many candidates already.
+                if <uintptr_t> tpl != self._prev_tpl:
+                    self._candidates = 0
+                else:
+                    self._candidates += 1
+                if self._max_candidates != -1 and self._candidates > self._max_candidates:
+                    self._candidates = 0
+                    jess.jess.JessQuery_nextTemplate(self._jq)
+                # free superposition items (as relevant data was copied in
+                # the Hit if needed) and return hits immediately if we are
+                # not in best match mode
                 jess.super.Superposition_free(sup)
                 if hit_found and not self.best_match:
                     break
@@ -1355,6 +1907,18 @@ cdef class Hit:
         for i, atom in enumerate(state["atoms"]):
             memcpy(&self._atoms[i], atom._atom, sizeof(_Atom))
+    cdef void _transform_atom(self, double* x, const double* src):
+        # Write into `x` the image of the 3D point `src` under the
+        # transformation stored in this hit:
+        #     x = centre[1] + rotation . (src - centre[0])
+        # where `rotation` is read as a row-major 3x3 matrix. `x` is written
+        # while `src` is still being read, so the two buffers are expected
+        # to be distinct.
+        cdef size_t        row
+        cdef size_t        col
+        cdef const double* rot    = self._rotation
+        cdef const double* origin = self._centre[0]
+        cdef const double* target = self._centre[1]
+        for row in range(3):
+            x[row] = target[row]
+            for col in range(3):
+                x[row] += rot[3*row + col] * (src[col] - origin[col])
     @property
     def determinant(self):
         """`float`: The determinant of the rotation matrix.
@@ -1423,15 +1987,11 @@ cdef class Hit:
             if transform:
                 atom._atom = <_Atom*> malloc(sizeof(_Atom))
                 memcpy(atom._atom, &self._atoms[k], sizeof(_Atom))
-                for i in range(3):
-                    atom._atom.x[i] = v[i]
-                    for j in range(3):
-                        atom._atom.x[i] += M[3*i + j] * (self._atoms[k].x[j] - c[j])
+                self._transform_atom(atom._atom.x, self._atoms[k].x)
             else:
                 atom.owned = True
                 atom.owner = self
                 atom._atom = &self._atoms[k]
             atoms.append(atom)
         return atoms
@@ -1467,17 +2027,142 @@ cdef class Hit:
         mol = self._molecule.copy()
         for k in range(mol._mol.count):
             atom = mol._mol.atom[k]
-            for i in range(3):
-                atom.x[i] = v[i]
-                for j in range(3):
-                    atom.x[i] += M[3*i + j] * (self._molecule._mol.atom[k].x[j] - c[j])
+            self._transform_atom(atom.x, self._molecule._mol.atom[k].x)
         return mol
+    cpdef str dumps(self, str format="pdb", bint transform=True):
+        """Write the hit to a string.
+        Arguments:
+            format (`str`): The format in which to write the hit.
+                Currently only supports ``pdb``, which writes the hits
+                in the same format as Jess.
+            transform (`bool`): Whether or not to transform coordinates
+                of the molecule atoms into template frame.
+        Returns:
+            `str`: The text that `~Hit.dump` writes for this hit.
+        Raises:
+            `RuntimeError`: When attempting to dump a `Hit` which was
+                obtained from a `Template` which has no `~Template.id`.
+        .. versionadded:: 0.7.0
+        """
+        # let `dump` do the formatting into an in-memory text stream
+        stream = io.StringIO()
+        self.dump(stream, format=format, transform=transform)
+        return stream.getvalue()
+    cpdef void dump(self, object file, str format="pdb", bint transform=True):
+        """Write the hit to a file.
+        Arguments:
+            file (file-like object): A file opened in *text* mode where the
+                hit will be written.
+            format (`str`): The format in which to write the hit.
+                Currently only supports ``pdb``, which writes the hits
+                in the same format as Jess.
+            transform (`bool`): Whether or not to transform coordinates
+                of the molecule atoms into template frame.
+        Raises:
+            `ValueError`: When ``format`` is not a supported format.
+            `RuntimeError`: When attempting to dump a `Hit` which was
+                obtained from a `Template` which has no `~Template.id`.
+        .. versionadded:: 0.7.0
+        """
+        assert self.template._tpl is not NULL
+        assert self._molecule._mol is not NULL
+        cdef _Atom*    atom
+        cdef size_t    k
+        cdef int       n
+        cdef char[80]  buffer
+        cdef char[5]   name
+        cdef char[5]   resname
+        cdef double[3] x
+        cdef int       count   = self.template._tpl.count(self.template._tpl)
+        # reject unknown formats before anything is written to `file`,
+        # instead of silently falling back to PDB output
+        if format != "pdb":
+            raise ValueError(f"invalid value for `format` argument: {format!r}")
+        if self.template.id is None:
+            raise RuntimeError("cannot dump `Hit` where `self.template.id` is `None`")
+        # header line: molecule identifier, RMSD, template identifier,
+        # determinant and log(E-value)
+        # NOTE(review): assumes `self._molecule.id` is a `str`; a `None`
+        # identifier would make `file.write` fail -- confirm with callers
+        file.write("REMARK ")
+        file.write(self._molecule.id)
+        file.write(f" {self.rmsd:5.3f} ")
+        file.write(self.template.id)
+        file.write(f" Det={self.determinant:4,.1f} log(E)~ {self.log_evalue:4.2f}\n")
+        # one ATOM record per template atom
+        for k in range(count):
+            atom = &self._atoms[k]
+            decode_token(name, atom.name, 4)
+            decode_token(resname, atom.resName, 3)
+            if transform:
+                self._transform_atom(x, atom.x)
+            else:
+                memcpy(x, atom.x, 3*sizeof(double))
+            # The fixed-width fields add up to 67 characters, which fits in
+            # the 80-byte buffer as long as no value overflows its field.
+            # NOTE(review): `sprintf` is unbounded, so an out-of-range
+            # coordinate could write past `buffer`; consider `snprintf`.
+            n = sprintf(
+                buffer,
+                "ATOM  %5i%5s%c%-3s%c%c%4i%-4c%8.3f%8.3f%8.3f%6.2f%6.2f\n",
+                atom.serial,
+                name,
+                atom.altLoc,
+                resname,
+                atom.chainID1,
+                atom.chainID2,
+                atom.resSeq,
+                atom.iCode,
+                x[0],
+                x[1],
+                x[2],
+                atom.occupancy,
+                atom.tempFactor,
+            )
+            file.write(PyUnicode_FromStringAndSize(buffer, n))
+        file.write("ENDMDL\n")
 cdef class Jess:
     """A handle to run Jess over a list of templates.
+    Example:
+        Create a `Jess` object from a list of templates::
+            >>> t1 = Template.load("1.3.3.tpl")
+            >>> t2 = Template.load("4.1.2.tpl")
+            >>> jess = Jess([t1, t2])
+        Once initialized, the `Jess` object cannot be modified further.
+        Use the `~Jess.query` method to query the templates with a
+        molecule::
+            >>> molecule = Molecule.load("1AMY.pdb")
+            >>> query = jess.query(molecule, 2, 2, 2)
+        The returned `Query` object is an iterator that can be
+        advanced through a ``for`` loop, or with the `next` built-in
+        function to get the first hit::
+            >>> hit = next(query)
+            >>> hit.rmsd
+            1.4386...
+        The hit can also be formatted in PDB format like in the
+        original JESS code::
+            >>> print(hit.dumps(format="pdb"), end="")
+            REMARK 1AMY 1.439 2om2 Det= 1.0 log(E)~ 1.11
+            ATOM    729  CA  THR A  94      34.202 -24.426   8.851  1.00  2.00
+            ATOM    732  CB  THR A  94      35.157 -23.467   8.101  1.00  4.66
+            ATOM    733  OG1 THR A  94      36.338 -23.247   8.871  1.00  9.85
+            ATOM    746  CD  GLU A  96      41.454 -29.509   8.013  1.00 24.05
+            ATOM    748  OE2 GLU A  96      42.536 -29.680   7.441  1.00 34.44
+            ATOM    747  OE1 GLU A  96      41.212 -28.521   8.708  1.00 18.56
+            ATOM    437  CZ  ARG A  55      44.471 -26.619  10.181  1.00  8.51
+            ATOM    436  NE  ARG A  55      44.334 -27.346  11.290  1.00  9.05
+            ATOM    438  NH1 ARG A  55      43.590 -26.751   9.179  1.00 13.17
+            ENDMDL
     .. versionadded:: 0.4.0
        Equality, hashing and pickle protocol support.
@@ -1579,9 +2264,10 @@ cdef class Jess:
         double distance_cutoff,
         double max_dynamic_distance,
         *,
-        int max_candidates = 1000,
-        bint ignore_chain = False,
+        object max_candidates = None,
+        object ignore_chain = None,
         bint best_match = False,
+        bint reorder = True,
     ):
         """Scan for templates matching the given molecule.
@@ -1596,18 +2282,81 @@ cdef class Jess:
                 dynamic distance after adding the global distance cutoff
                 and the individual atom distance cutoff defined for each
                 atom of the template.
-            ignore_chain (`bool`): Whether to check or ignore the chain of
-                the atoms to match.
+            max_candidates (`int` or `None`): The maximum number of candidate
+                hits to report by template. If a non-`None` value is given,
+                it may speed up querying for unspecific templates, but also
+                produce results potentially inconsistent with Jess.
+            ignore_chain (`str` or `None`): Whether to check or ignore the
+                chain of the atoms to match. The different supported modes
+                are:
+                - `None`: Force the atoms in the molecule to belong
+                  to different (resp. same) chains if that is the case
+                  in the template.
+                - ``residues``: Allow atoms to belong to different
+                  (resp. same) chains even if it is not the case in
+                  the template, but force all atoms of a residue to
+                  belong to the same chain.
+                - ``atoms``: Allow atoms to belong to any chain,
+                  independently of the template or the residue they
+                  belong to.
             best_match (`bool`): Pass `True` to return only the best match
-                to each template.
+                to each template, based on RMSD. In case of ties, the
+                first match is returned. Note that a match must still
+                be passing the RMSD threshold given in ``rmsd_threshold``
+                to be returned.
+            reorder (`bool`): Whether to enable template atom reordering
+                to accelerate matching in the scanner algorithm. Pass
+                `False` to revert to the original, slower algorithm
+                which matches atoms in the same order as they appear in
+                the template, at the cost of longer run times.
         Returns:
             `~pyjess.Query`: An iterator over the query hits.
+        Caution:
+            Since ``v0.6.0``, this function uses an optimized variant of
+            the Jess scanning algorithm which minimizes the number of steps
+            needed to generate matches, by changing the order in which the
+            template atoms are iterated upon. Because of this change,
+            the query may return *exactly* the same matches but in an order
+            that *differs* from the original Jess version. If you really
+            need results in the original order, set ``reorder`` to `False`.
+        .. versionadded:: 0.6.0
+            The ``reorder`` argument, defaulting to `True`.
+        .. versionchanged:: 0.7.0
+            Default value of ``max_candidates`` argument to `None`.
+        .. versionchanged:: 0.7.0
+            ``ignore_chain`` now expects string variants rather than `bool`.
         """
+        if ignore_chain is True:
+            PyErr_WarnEx(
+                DeprecationWarning,
+                "`ignore_chain` parameter expects string parameters "
+                "to specificy the mode since PyJess v0.7.0. "
+                "Use `ignore_chain='atoms'` instead of `ignore_chain=True`",
+                2,
+            )
+            ignore_chain="atoms"
+        elif ignore_chain is False:
+            PyErr_WarnEx(
+                DeprecationWarning,
+                "`ignore_chain` parameter expects string parameters "
+                "to specificy the mode since PyJess v0.7.0. "
+                "Use `ignore_chain=None` instead of `ignore_chain=False`",
+                2,
+            )
+            ignore_chain=None
         cdef Query query = Query.__new__(Query)
-        query.ignore_chain = ignore_chain
         query.max_candidates = max_candidates
+        query.ignore_chain = ignore_chain
         query.rmsd_threshold = rmsd_threshold
         query.best_match = best_match
         query.molecule = molecule
@@ -1617,5 +2366,6 @@ cdef class Jess:
             molecule._mol,
             distance_cutoff,
             max_dynamic_distance,
+            reorder,
         )
         return query