pyjess 0.6.0__pp38-pypy38_pp73-win_amd64.whl → 0.7.0__pp38-pypy38_pp73-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyjess might be problematic. Click here for more details.
- pyjess/__main__.py +4 -0
- pyjess/_jess.pyi +52 -9
- pyjess/_jess.pypy38-pp73-win_amd64.pyd +0 -0
- pyjess/_jess.pyx +830 -101
- pyjess/cli.py +281 -0
- pyjess/tests/__init__.py +2 -0
- pyjess/tests/data/1AMY.cif +6259 -0
- pyjess/tests/data/1sur.qry +26 -0
- pyjess/tests/data/4.1.2.tpl +23 -0
- pyjess/tests/data/5ayx.EF.pdb +63 -0
- pyjess/tests/test_doctest.py +78 -0
- pyjess/tests/test_hit.py +26 -2
- pyjess/tests/test_jess.py +62 -1
- pyjess/tests/test_molecule.py +146 -0
- pyjess/tests/test_template.py +10 -1
- {pyjess-0.6.0.dist-info → pyjess-0.7.0.dist-info}/METADATA +59 -16
- pyjess-0.7.0.dist-info/RECORD +34 -0
- pyjess-0.7.0.dist-info/entry_points.txt +3 -0
- pyjess-0.6.0.dist-info/RECORD +0 -26
- {pyjess-0.6.0.dist-info → pyjess-0.7.0.dist-info}/WHEEL +0 -0
- {pyjess-0.6.0.dist-info → pyjess-0.7.0.dist-info}/licenses/COPYING +0 -0
pyjess/_jess.pyx
CHANGED
|
@@ -2,6 +2,73 @@
|
|
|
2
2
|
# cython: language_level=3, linetrace=True, binding=True
|
|
3
3
|
"""Bindings to Jess, a 3D template matching software.
|
|
4
4
|
|
|
5
|
+
Jess is an algorithm for constraint-based structural template matching
|
|
6
|
+
proposed by Jonathan Barker *et al.*. It can be used to identify
|
|
7
|
+
catalytic residues from a known template inside a protein structure.
|
|
8
|
+
Jess is an evolution of TESS, a geometric hashing algorithm developed by
|
|
9
|
+
Andrew Wallace *et al.*, removing some pre-computation and
|
|
10
|
+
structural requirements from the original algorithm.
|
|
11
|
+
|
|
12
|
+
PyJess is a Python module that provides bindings to Jess using
|
|
13
|
+
`Cython <https://cython.org/>`_. It allows creating templates, querying
|
|
14
|
+
them with protein structures, and retrieving the hits using a Python API
|
|
15
|
+
without performing any external I/O. It's also more than 10x faster than
|
|
16
|
+
Jess thanks to algorithmic optimizations added to improve the original Jess
|
|
17
|
+
code while producing consistent results.
|
|
18
|
+
|
|
19
|
+
Example:
|
|
20
|
+
Load templates from a file, either as a file-like object or
|
|
21
|
+
given a filename::
|
|
22
|
+
|
|
23
|
+
>>> t1 = pyjess.Template.load("1.3.3.tpl") # load from filename
|
|
24
|
+
>>> with open("4.1.2.tpl") as f: # load from a file object
|
|
25
|
+
... t2 = pyjess.Template.load(f)
|
|
26
|
+
|
|
27
|
+
Load molecules from a file, either as a file-like object or given
|
|
28
|
+
a filename::
|
|
29
|
+
|
|
30
|
+
>>> mol = pyjess.Molecule.load("1AMY.pdb")
|
|
31
|
+
>>> mol[0]
|
|
32
|
+
Atom(serial=1, name='N', altloc=' ', residue_name='GLN', ...)
|
|
33
|
+
|
|
34
|
+
Create a `Jess` object storing the templates to support running
|
|
35
|
+
queries on them. The individual templates can still be accessed by
|
|
36
|
+
index::
|
|
37
|
+
|
|
38
|
+
>>> jess = pyjess.Jess([t1, t2])
|
|
39
|
+
>>> jess[0].id
|
|
40
|
+
'3r6v'
|
|
41
|
+
|
|
42
|
+
Run a query on the Jess object to retrieve all templates matching
|
|
43
|
+
a `Molecule`, *in no particular order*::
|
|
44
|
+
|
|
45
|
+
>>> hits = jess.query(mol, 2, 2, 2)
|
|
46
|
+
>>> for hit in hits:
|
|
47
|
+
... print(hit.template.id, hit.rmsd)
|
|
48
|
+
2om2 1.4386...
|
|
49
|
+
2om2 1.4877...
|
|
50
|
+
2om2 1.4376...
|
|
51
|
+
2om2 1.5284...
|
|
52
|
+
2om2 1.4863...
|
|
53
|
+
2om2 1.4369...
|
|
54
|
+
2om2 1.4790...
|
|
55
|
+
2om2 1.1414...
|
|
56
|
+
2om2 1.0755...
|
|
57
|
+
2om2 1.1973...
|
|
58
|
+
2om2 1.1353...
|
|
59
|
+
2om2 1.0711...
|
|
60
|
+
2om2 1.1494...
|
|
61
|
+
|
|
62
|
+
By default, a template can match a molecule in more than one way,
|
|
63
|
+
if several sets of atoms match the geometric constraints. Use the
|
|
64
|
+
``best_match`` argument of `~Jess.query` to only retrieve the
|
|
65
|
+
best match per template::
|
|
66
|
+
|
|
67
|
+
>>> hits = jess.query(mol, 2, 2, 2, best_match=True)
|
|
68
|
+
>>> for hit in hits:
|
|
69
|
+
... print(hit.template.id, hit.rmsd)
|
|
70
|
+
2om2 1.071...
|
|
71
|
+
|
|
5
72
|
References:
|
|
6
73
|
- Barker, J. A., & Thornton, J. M. (2003). *An algorithm for
|
|
7
74
|
constraint-based structural template matching: application to
|
|
@@ -18,10 +85,16 @@ References:
|
|
|
18
85
|
# --- C imports --------------------------------------------------------------
|
|
19
86
|
|
|
20
87
|
cimport cython
|
|
21
|
-
from cpython.
|
|
88
|
+
from cpython.exc cimport PyErr_WarnEx
|
|
89
|
+
from cpython.unicode cimport (
|
|
90
|
+
PyUnicode_FromStringAndSize,
|
|
91
|
+
PyUnicode_FromFormat,
|
|
92
|
+
PyUnicode_AsASCIIString,
|
|
93
|
+
)
|
|
22
94
|
|
|
23
95
|
from libc.math cimport isnan, exp, INFINITY, NAN
|
|
24
|
-
from libc.stdio cimport FILE, fclose, fdopen, printf
|
|
96
|
+
from libc.stdio cimport FILE, fclose, fdopen, printf, sprintf
|
|
97
|
+
from libc.stdint cimport uintptr_t
|
|
25
98
|
from libc.stdlib cimport calloc, realloc, free, malloc
|
|
26
99
|
from libc.string cimport memcpy, memset, strncpy, strdup
|
|
27
100
|
|
|
@@ -37,24 +110,20 @@ from jess.jess cimport Jess as _Jess
|
|
|
37
110
|
from jess.jess cimport JessQuery as _JessQuery
|
|
38
111
|
from jess.molecule cimport Molecule as _Molecule
|
|
39
112
|
from jess.super cimport Superposition as _Superposition
|
|
40
|
-
from jess.template cimport Template as _Template
|
|
113
|
+
from jess.template cimport Template as _Template, IgnoreType as _IgnoreType
|
|
41
114
|
from jess.tess_template cimport TessTemplate as _TessTemplate
|
|
42
115
|
from jess.tess_atom cimport TessAtom as _TessAtom
|
|
43
116
|
|
|
44
117
|
# --- Python imports ---------------------------------------------------------
|
|
45
118
|
|
|
46
|
-
import contextlib
|
|
47
119
|
import functools
|
|
48
120
|
import io
|
|
49
|
-
import itertools
|
|
50
|
-
import os
|
|
51
|
-
import warnings
|
|
52
121
|
|
|
53
122
|
__version__ = PROJECT_VERSION
|
|
54
123
|
|
|
55
124
|
# --- Utils ------------------------------------------------------------------
|
|
56
125
|
|
|
57
|
-
cdef inline void
|
|
126
|
+
cdef inline void encode_token(char* dst, const char* src, size_t n) noexcept nogil:
|
|
58
127
|
cdef size_t i
|
|
59
128
|
for i in range(n):
|
|
60
129
|
if src[i] == ord(' ') or src[i] == 0:
|
|
@@ -63,12 +132,155 @@ cdef inline void copy_token(char* dst, const char* src, size_t n) noexcept nogil
|
|
|
63
132
|
dst[i] = src[i]
|
|
64
133
|
dst[n] = 0
|
|
65
134
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
135
|
+
cdef inline void decode_token(char* dst, const char* src, size_t n) noexcept nogil:
    # Copy `n` bytes from `src` to `dst`, writing a space wherever the
    # source holds an underscore or a NUL byte, then store a terminating
    # NUL at `dst[n]` (the destination is written up to index `n` included).
    cdef size_t k
    cdef char c
    for k in range(n):
        c = src[k]
        if c != ord('_') and c != 0:
            dst[k] = c
        else:
            dst[k] = ord(' ')
    dst[n] = 0
|
|
143
|
+
|
|
144
|
+
class nullcontext:
    """Context manager that performs no work on entry or exit.

    Entering the context yields the value given to the constructor
    unchanged, and leaving it never suppresses an exception.
    """

    def __init__(self, return_value=None):
        self.retval = return_value

    def __enter__(self):
        # Hand back exactly what the constructor received.
        return self.retval

    def __exit__(self, exc_type, exc_value, traceback):
        # A false result lets any in-flight exception keep propagating.
        return False
|
|
69
151
|
|
|
70
152
|
# --- Classes ----------------------------------------------------------------
|
|
71
153
|
|
|
154
|
+
cdef class _MoleculeParser:
    """Base class of the molecule parsers, storing the requested identifier.
    """

    # Identifier passed to the constructor, or `None` when none was given.
    cdef str id

    def __init__(self, str id = None):
        self.id = id
|
|
159
|
+
|
|
160
|
+
cdef class _PDBMoleculeParser(_MoleculeParser):
    """Parser building a molecule from PDB-formatted text.
    """

    # When true, parsing continues past ``ENDMDL`` records.
    cdef bint ignore_endmdl
    # When true, ``HETATM`` records are not turned into atoms.
    cdef bint skip_hetatm

    def __init__(self, str id = None, bint ignore_endmdl = False, bint skip_hetatm = False):
        super().__init__(id=id)
        self.skip_hetatm = skip_hetatm
        self.ignore_endmdl = ignore_endmdl

    def loads(self, text, molecule_type):
        """Parse a PDB string by wrapping it in an in-memory text buffer.
        """
        buffer = io.StringIO(text)
        return self.load(buffer, molecule_type)

    def load(self, file, molecule_type):
        """Parse PDB records from a path or an iterable of text lines.

        Arguments:
            file: Either something accepted by `open`, or an object
                yielding text lines when iterated (used as-is when
                `open` rejects it with a `TypeError`).
            molecule_type: The callable used to build the result, called
                as ``molecule_type(atoms, id=id)``.

        Raises:
            ValueError: When a line starting with ``data_`` or ``loop_``
                (case-insensitive) is found, since the input is then
                not in PDB format.

        """
        cdef str record
        cdef str identifier = self.id
        cdef list parsed = []

        try:
            source = open(file)
        except TypeError:
            # not a path: assume an already opened file-like object
            source = nullcontext(file)

        with source as stream:
            for record in stream:
                if record.startswith("HEADER"):
                    # only read the identifier when none was forced
                    if identifier is None:
                        identifier = record[62:66].strip() or None
                elif record.startswith("ATOM") or (
                    not self.skip_hetatm and record.startswith("HETATM")
                ):
                    parsed.append(Atom.loads(record))
                elif record.startswith("ENDMDL"):
                    # stop after the first model unless told otherwise
                    if not self.ignore_endmdl:
                        break
                elif record.lower().startswith(("data_", "loop_")):
                    raise ValueError("mmCIF data tags found, file is not in PDB format")

        return molecule_type(parsed, id=identifier)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
cdef class _CIFMoleculeParser(_MoleculeParser):
    """Parser building a molecule from the ``_atom_site`` table of a CIF block.

    The ``gemmi`` module is imported when the parser is instantiated.
    """

    # Handle on the imported ``gemmi`` module.
    cdef object gemmi
    # When true, read the ``auth_*`` columns instead of the ``label_*`` ones.
    cdef bint use_author
    # When true, rows of the ``HETATM`` group are dropped.
    cdef bint skip_hetatm

    # Column tags requested from the `_atom_site` table; the row indices
    # used in `_load_block` follow this order. Tags with a leading '?'
    # are checked with `row.has` before use where they may be absent.
    _PRIMARY_COLUMNS = [
        'id', 'type_symbol', 'label_atom_id', 'label_alt_id', 'label_comp_id',
        'label_asym_id', 'label_seq_id', '?pdbx_PDB_ins_code', 'Cartn_x',
        'Cartn_y', 'Cartn_z', 'occupancy', 'B_iso_or_equiv',
        '?pdbx_formal_charge', '?group_PDB',
    ]

    _AUTH_COLUMNS = [
        'id', 'type_symbol', 'auth_atom_id', 'label_alt_id', 'auth_comp_id',
        'auth_asym_id', 'auth_seq_id', '?pdbx_PDB_ins_code', 'Cartn_x',
        'Cartn_y', 'Cartn_z', 'occupancy', 'B_iso_or_equiv',
        '?pdbx_formal_charge', '?group_PDB',
    ]

    def __init__(self, str id = None, bint use_author = False, bint skip_hetatm = False):
        super().__init__(id=id)
        self.gemmi = __import__('gemmi')
        self.use_author = use_author
        self.skip_hetatm = skip_hetatm

    def _load_block(self, document, molecule_type):
        """Build a molecule from the only block of a parsed CIF document.

        Raises:
            ValueError: When the ``_atom_site`` table cannot be found
                with the requested columns.

        """
        block = document.sole_block()
        if self.use_author:
            columns = self._AUTH_COLUMNS
        else:
            columns = self._PRIMARY_COLUMNS
        table = block.find('_atom_site.', columns)
        max_residue_number = 0

        if not table:
            raise ValueError("missing columns in CIF files")

        atoms = []
        for row in table:
            group = row[14]
            # keep ATOM rows, and HETATM rows unless they are disabled
            wanted = group == "ATOM" or (group == "HETATM" and not self.skip_hetatm)
            if not wanted:
                continue

            sequence_id = row[6]
            if sequence_id == "." and group == "HETATM":
                PyErr_WarnEx(
                    UserWarning,
                    "HETATM line found without residue number. Consider "
                    "parsing with use_author=True to use author-defined "
                    "residue numbers, or skip_hetatm=True to disable "
                    "parsing of HETATM altogether.",
                    3,
                )
                # NOTE(review): the generated number starts at the current
                # maximum, which may equal an existing residue number --
                # confirm this is intended.
                residue_number = max_residue_number
                max_residue_number += 1
            else:
                residue_number = int(sequence_id)
                max_residue_number = max(residue_number, max_residue_number)

            atoms.append(
                Atom(
                    serial=int(row[0]),
                    element=row[1],
                    name=row[2],
                    altloc=' ' if row[3] == "." else row[3], # FIXME: replace with None?
                    residue_name=row[4],
                    chain_id=row[5],
                    residue_number=residue_number,
                    insertion_code=' ' if not row.has(7) or row[7] == "?" else row[7],
                    x=float(row[8]),
                    y=float(row[9]),
                    z=float(row[10]),
                    occupancy=0.0 if row[11] == '.' else float(row[11]),
                    temperature_factor=float(row[12]),
                    charge=0 if not row.has(13) or row[13] == "?" else int(row[13]),
                )
            )

        # fall back to the block name when no identifier was forced
        identifier = block.name if self.id is None else self.id
        return molecule_type(atoms, id=identifier)

    def loads(self, text, molecule_type):
        """Parse a CIF document given as a string.
        """
        return self._load_block(self.gemmi.cif.read_string(text), molecule_type)

    def load(self, file, molecule_type):
        """Parse a CIF document from a readable object or from a path.
        """
        if not hasattr(file, "read"):
            document = self.gemmi.cif.read_file(file)
        else:
            document = self.gemmi.cif.read_string(file.read())
        return self._load_block(document, molecule_type)
|
|
282
|
+
|
|
283
|
+
|
|
72
284
|
cdef class Molecule:
|
|
73
285
|
"""A molecule structure, as a sequence of `Atom` objects.
|
|
74
286
|
|
|
@@ -83,20 +295,40 @@ cdef class Molecule:
|
|
|
83
295
|
cdef str _id
|
|
84
296
|
|
|
85
297
|
@classmethod
def loads(
    cls,
    text,
    str format = "pdb",
    *,
    str id = None,
    bint ignore_endmdl = False,
    bint use_author = False,
    bint skip_hetatm = False,
):
    """Load a molecule from a string.

    Arguments:
        text (`str`): The serialized molecule to parse into a new
            object.
        format (`str`): The format to parse the text with. Supported
            formats are: ``pdb`` for the Protein Data Bank format
            (the default), ``cif`` for Crystallographic Information
            File format (additionally requires the `gemmi` module),
            or ``detect`` to select ``cif`` when the text, ignoring
            leading whitespace, starts with ``data_`` or ``loop_``
            and ``pdb`` otherwise.

    Keyword Arguments:
        id (`str`, optional): The identifier of the molecule. If `None`
            given, the parser will attempt to extract it from the
            ``HEADER`` line (for PDB files) or the block name (for CIF
            files).
        ignore_endmdl (`bool`): Pass `True` to make the parser read all
            the atoms from the PDB file. By default, the parser only
            reads the atoms of the first model, and stops at the first
            ``ENDMDL`` line. *Ignored for CIF files*.
        use_author (`bool`): Pass `True` to use the author-defined
            labels while parsing CIF files, e.g. read the chain name
            from ``_atom_site.auth_asym_id`` rather than
            ``_atom_site.label_asym_id``. *Ignored for PDB files*.
        skip_hetatm (`bool`): Pass `True` to skip parsing of heteroatoms
            (``HETATM``) in the input text.

    Returns:
        `~pyjess.Molecule`: The molecule parsed from the text.

    See Also:
        `Molecule.load` to load a molecule from a file-like
        object or from a path.

    Caution:
        mmCIF files store heteroatoms but do not require them to
        have an associated residue number, which can throw off the way
        atoms are modeled in Jess. Consider passing ``use_author=True``
        or ``skip_hetatm=True`` when parsing such files.

    .. versionadded:: 0.7.0
        The ``format`` argument, and support for CIF parsing.

    """
    if format == "detect":
        format = "cif" if text.lstrip().startswith(("data_", "loop_")) else "pdb"
    # Forward every parser option: `use_author` used to be dropped here,
    # which made it a silent no-op when loading CIF data from a string.
    return cls.load(
        io.StringIO(text),
        format=format,
        id=id,
        ignore_endmdl=ignore_endmdl,
        use_author=use_author,
        skip_hetatm=skip_hetatm,
    )
|
|
110
360
|
|
|
111
361
|
@classmethod
def load(
    cls,
    file,
    str format = "detect",
    *,
    str id = None,
    bint ignore_endmdl = False,
    bint use_author = False,
    bint skip_hetatm = False,
):
    """Load a molecule from a file.

    Arguments:
        file (`str`, `os.PathLike`, or file-like object): Either the path
            to a file, or a file-like object opened in **text mode**
            containing a molecule.
        format (`str`): The format to parse the file. Supported formats
            are: ``pdb`` for the Protein Data Bank format, ``cif``
            for Crystallographic Information File format (additionally
            requires the `gemmi` module), or ``detect`` to attempt
            auto-detection (the default).

    Keyword Arguments:
        id (`str`, optional): The identifier of the molecule. If `None`
            given, the parser will attempt to extract it from the
            ``HEADER`` line (for PDB files) or the block name (for CIF
            files).
        ignore_endmdl (`bool`): Pass `True` to make the parser read all
            the atoms from the PDB file. By default, the parser only
            reads the atoms of the first model, and stops at the first
            ``ENDMDL`` line. *Ignored for CIF files*.
        use_author (`bool`): Pass `True` to use the author-defined
            labels while parsing CIF files, e.g. read the chain name
            from ``_atom_site.auth_asym_id`` rather than
            ``_atom_site.label_asym_id``. *Ignored for PDB files*.
        skip_hetatm (`bool`): Pass `True` to skip parsing of heteroatoms
            (``HETATM``) in the input file.

    Returns:
        `~pyjess.Molecule`: The molecule parsed from the file.

    Raises:
        ValueError: When ``format`` is not one of ``pdb``, ``cif``
            or ``detect``.

    See Also:
        `Molecule.loads` to load a molecule from a string.

    Caution:
        mmCIF files store heteroatoms but do not require them to
        have an associated residue number, which can throw off the way
        atoms are modeled in Jess. Consider passing ``use_author=True``
        or ``skip_hetatm=True`` when parsing such files.

    .. versionadded:: 0.7.0
        The ``format`` and ``skip_hetatm`` arguments, and mmCIF support.

    """
    cdef _MoleculeParser parser
    cdef str peek

    if format == "detect":
        try:
            handle = open(file)
        except TypeError:
            # not a path: assume an already opened file-like object
            handle = nullcontext(file)
        with handle as f:
            if f.seekable():
                # look at the first characters, then rewind for the parser
                peek = f.read(5)
                f.seek(0)
            else:
                # cannot rewind: read everything and parse from the string
                f = f.read()
                # the *first* five characters hold the CIF marker; the
                # previous `f[5:]` slice skipped them and broke detection
                peek = f[:5]
            if peek.startswith(("data_", "loop_")):
                parser = _CIFMoleculeParser(
                    id=id,
                    use_author=use_author,
                    skip_hetatm=skip_hetatm,
                )
            else:
                parser = _PDBMoleculeParser(
                    id=id,
                    ignore_endmdl=ignore_endmdl,
                    skip_hetatm=skip_hetatm,
                )
            if isinstance(f, str):
                return parser.loads(f, molecule_type=cls)
            return parser.load(f, molecule_type=cls)

    if format == "pdb":
        parser = _PDBMoleculeParser(
            id=id,
            ignore_endmdl=ignore_endmdl,
            skip_hetatm=skip_hetatm
        )
    elif format == "cif":
        parser = _CIFMoleculeParser(
            id=id,
            use_author=use_author,
            skip_hetatm=skip_hetatm,
        )
    else:
        raise ValueError(f"invalid value for `format` argument: {format!r}")
    return parser.load(file, molecule_type=cls)
|
|
462
|
+
|
|
463
|
+
@classmethod
def from_biopython(cls, object structure, str id = None):
    """Create a new `~pyjess.Molecule` from a `Bio.PDB.Structure`.

    Arguments:
        structure (`Bio.PDB.Structure` or `Bio.PDB.Model`): The
            Biopython object containing the structure data.
        id (`str` or `None`): The identifier to give to the newly
            created molecule. If `None` given, will use the value of
            ``structure.id``.

    Returns:
        `~pyjess.Molecule`: A molecule object suitable for using
        in `Jess.query`.

    .. versionadded:: 0.7.0

    """
    cdef list atoms = []
    for c in structure.get_chains():
        for r in c.get_residues():
            # the first field of the residue identifier is not used here
            _, residue_number, insertion_code = r.id
            for a in r.get_atoms():
                coord = a.get_coord()
                atom = Atom(
                    name=a.fullname,
                    x=coord[0],
                    y=coord[1],
                    z=coord[2],
                    altloc=a.altloc,
                    charge=a.pqr_charge or 0,
                    occupancy=a.occupancy,
                    serial=a.serial_number,
                    residue_name=r.resname,
                    residue_number=residue_number,
                    segment=r.segid,
                    insertion_code=insertion_code,
                    chain_id=c.id,
                    temperature_factor=a.bfactor,
                    element=a.element,
                )
                atoms.append(atom)
    # honor an explicit identifier, as documented; it used to be ignored
    # in favor of `structure.id` unconditionally
    return cls(atoms, id=structure.id if id is None else id)
|
|
506
|
+
|
|
507
|
+
@classmethod
def from_gemmi(cls, object model, str id=None):
    """Build a `~pyjess.Molecule` out of the atoms of a `gemmi.Model`.

    Arguments:
        model (`gemmi.Model`): The ``gemmi`` object containing the
            structure data.
        id (`str` or `None`): The identifier to give to the newly
            created molecule.

    Returns:
        `~pyjess.Molecule`: A molecule object suitable for using
        in `Jess.query`.

    .. versionadded:: 0.7.0

    """
    cdef list atoms = []
    for entry in model.all():
        # each entry gives access to an atom with its residue and chain
        gemmi_atom = entry.atom
        residue = entry.residue
        chain = entry.chain
        atoms.append(
            Atom(
                name=gemmi_atom.padded_name(),
                x=gemmi_atom.pos[0],
                y=gemmi_atom.pos[1],
                z=gemmi_atom.pos[2],
                # a NUL alternate location is stored as a blank
                altloc=' ' if gemmi_atom.altloc == '\0' else gemmi_atom.altloc,
                charge=gemmi_atom.charge,
                element=gemmi_atom.element.name.upper(),
                occupancy=gemmi_atom.occ,
                temperature_factor=gemmi_atom.b_iso,
                serial=gemmi_atom.serial,
                segment=residue.segment,
                residue_name=residue.name,
                residue_number=residue.seqid.num,
                chain_id=chain.name,
                insertion_code=residue.seqid.icode,
            )
        )
    return cls(atoms, id=id)
|
|
147
548
|
|
|
549
|
+
@classmethod
def from_biotite(cls, object atom_array, str id=None):
    """Create a new `~pyjess.Molecule` from a `biotite.structure.AtomArray`.

    Arguments:
        atom_array (`biotite.structure.AtomArray`): The ``biotite``
            object containing the structure data.
        id (`str` or `None`): The identifier to give to the newly
            created molecule.

    Returns:
        `~pyjess.Molecule`: A molecule object suitable for using
        in `Jess.query`.

    Caution:
        If loading data with the `biotite.structure.io.pdb.PDBFile` module,
        ensure that you are requesting all atoms and all extra fields
        in `~biotite.structure.io.pdb.PDBFile.get_structure`::

            pdb_file = PDBFile.read("data/1AMY.pdb")
            structure = pdb_file.get_structure(
                altloc="all",
                extra_fields=["atom_id", "b_factor", "occupancy", "charge"],
            )
            molecule = Molecule.from_biotite(structure[0])

    .. versionadded:: 0.7.0

    """
    cdef list atoms = []
    for a in atom_array:
        atom = Atom(
            name=str(a.atom_name),
            x=a.coord[0],
            y=a.coord[1],
            z=a.coord[2],
            # these annotations fall back to a default when missing
            altloc=str(getattr(a, 'altloc', ' ')),
            charge=getattr(a, 'charge', 0),
            element=str(a.element),
            occupancy=getattr(a, 'occupancy', 1.0),
            temperature_factor=a.b_factor,
            serial=a.atom_id,
            segment=str(getattr(a, 'segment', '')),
            residue_name=str(a.res_name),
            residue_number=a.res_id,
            chain_id=str(a.chain_id),
            # pad an empty insertion code to a single character
            insertion_code=str(a.ins_code).ljust(1),
        )
        atoms.append(atom)
    # forward the identifier: it used to be accepted and silently dropped
    return cls(atoms, id=id)
|
|
597
|
+
|
|
148
598
|
def __cinit__(self):
|
|
149
599
|
self._mol = NULL
|
|
150
600
|
|
|
@@ -249,17 +699,32 @@ cdef class Molecule:
|
|
|
249
699
|
return self._id
|
|
250
700
|
|
|
251
701
|
cpdef Molecule conserved(self, double cutoff = 0.0):
    """Build a molecule restricted to the atoms passing a conservation cutoff.

    Arguments:
        cutoff (`float`): The conservation cutoff for atoms. Only
            atoms with a `~Atom.temperature_factor` greater than or
            equal to this value are kept. With a cutoff of zero or
            less, a plain copy of the molecule is returned.

    Returns:
        `~pyjess.Molecule`: A new molecule with atoms below the
        conservation cutoff removed.

    """
    assert self._mol is not NULL

    cdef size_t k
    cdef list kept

    # nothing can be filtered out: hand back a copy right away
    if cutoff <= 0.0:
        return self.copy()

    kept = []
    for k in range(self._mol.count):
        if not (self._mol.atom[k].tempFactor >= cutoff):
            continue
        kept.append(self[k])

    return type(self)(id=self.id, atoms=kept)
|
|
263
728
|
|
|
264
729
|
cpdef Molecule copy(self):
|
|
265
730
|
"""Create a copy of this molecule and its atoms.
|
|
@@ -366,21 +831,21 @@ cdef class Atom:
|
|
|
366
831
|
*,
|
|
367
832
|
int serial,
|
|
368
833
|
str name,
|
|
369
|
-
str altloc,
|
|
370
834
|
str residue_name,
|
|
371
835
|
str chain_id,
|
|
372
836
|
int residue_number,
|
|
373
|
-
str insertion_code,
|
|
374
837
|
double x,
|
|
375
838
|
double y,
|
|
376
839
|
double z,
|
|
377
840
|
double occupancy = 0.0,
|
|
378
841
|
double temperature_factor = 0.0,
|
|
842
|
+
str altloc = ' ',
|
|
843
|
+
str insertion_code = ' ',
|
|
379
844
|
str segment = '',
|
|
380
845
|
str element = '',
|
|
381
846
|
int charge = 0,
|
|
382
847
|
):
|
|
383
|
-
"""__init__(self, *, serial, name,
|
|
848
|
+
"""__init__(self, *, serial, name, residue_name, chain_id, residue_number, x, y, z, occupancy=0.0, temperature_factor=0.0, altloc=' ', insertion_code=' ', segment='', element='', charge=0)\n--\n
|
|
384
849
|
|
|
385
850
|
Create a new atom.
|
|
386
851
|
|
|
@@ -392,11 +857,16 @@ cdef class Atom:
|
|
|
392
857
|
long.
|
|
393
858
|
|
|
394
859
|
"""
|
|
860
|
+
cdef bytearray _name
|
|
861
|
+
cdef bytes _residue_name
|
|
862
|
+
cdef bytes _segment
|
|
863
|
+
cdef bytes _element
|
|
864
|
+
|
|
395
865
|
if len(name) > 4:
|
|
396
866
|
raise ValueError(f"Invalid atom name: {name!r}")
|
|
397
867
|
if len(residue_name) > 3:
|
|
398
868
|
raise ValueError(f"Invalid residue name: {residue_name!r}")
|
|
399
|
-
if len(segment) >
|
|
869
|
+
if len(segment) > 4:
|
|
400
870
|
raise ValueError(f"Invalid segment: {segment!r}")
|
|
401
871
|
if len(element) > 2:
|
|
402
872
|
raise ValueError(f"Invalid element: {element!r}")
|
|
@@ -407,6 +877,10 @@ cdef class Atom:
|
|
|
407
877
|
if self._atom is NULL:
|
|
408
878
|
raise MemoryError("Failed to allocate atom")
|
|
409
879
|
|
|
880
|
+
_residue_name = PyUnicode_AsASCIIString(residue_name)
|
|
881
|
+
_segment = PyUnicode_AsASCIIString(segment)
|
|
882
|
+
_element = PyUnicode_AsASCIIString(element)
|
|
883
|
+
|
|
410
884
|
self._atom.serial = serial
|
|
411
885
|
self._atom.altLoc = ord(altloc)
|
|
412
886
|
self._atom.chainID1 = ord(chain_id[0]) if len(chain_id) > 0 else 0
|
|
@@ -419,14 +893,15 @@ cdef class Atom:
|
|
|
419
893
|
self._atom.occupancy = occupancy
|
|
420
894
|
self._atom.tempFactor = temperature_factor
|
|
421
895
|
self._atom.charge = charge
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
896
|
+
encode_token(self._atom.resName, _residue_name.ljust(3, b'\0'), 3)
|
|
897
|
+
encode_token(self._atom.segID, _segment.ljust(4, b'\0'), 4)
|
|
898
|
+
encode_token(self._atom.element, _element.ljust(2, b'\0'), 2)
|
|
425
899
|
|
|
900
|
+
# FIXME: is alignment proper?
|
|
426
901
|
_name = bytearray(name, 'ascii')
|
|
427
902
|
if len(_name) < 4:
|
|
428
903
|
_name.insert(0, ord('_'))
|
|
429
|
-
|
|
904
|
+
encode_token(self._atom.name, _name.ljust(4, b'\0'), 4)
|
|
430
905
|
|
|
431
906
|
def __copy__(self):
|
|
432
907
|
return self.copy()
|
|
@@ -519,7 +994,7 @@ cdef class Atom:
|
|
|
519
994
|
"""`str`: The segment identifier.
|
|
520
995
|
"""
|
|
521
996
|
assert self._atom is not NULL
|
|
522
|
-
return self._atom.segID[:
|
|
997
|
+
return self._atom.segID[:4].decode('ascii').strip('_')
|
|
523
998
|
|
|
524
999
|
@property
|
|
525
1000
|
def element(self):
|
|
@@ -540,7 +1015,7 @@ cdef class Atom:
|
|
|
540
1015
|
"""`str`: The identifier of the chain the atom belongs to.
|
|
541
1016
|
"""
|
|
542
1017
|
assert self._atom is not NULL
|
|
543
|
-
return "
|
|
1018
|
+
return PyUnicode_FromFormat("%c%c", self._atom.chainID1, self._atom.chainID2).strip()
|
|
544
1019
|
|
|
545
1020
|
@property
|
|
546
1021
|
def occupancy(self):
|
|
@@ -565,16 +1040,22 @@ cdef class Atom:
|
|
|
565
1040
|
|
|
566
1041
|
@property
def x(self):
    """`float`: The atom coordinate in the 1st dimension.
    """
    assert self._atom is not NULL
    # first component of the coordinate array of the underlying atom
    return self._atom.x[0]
|
|
570
1047
|
|
|
571
1048
|
@property
def y(self):
    """`float`: The atom coordinate in the 2nd dimension.
    """
    assert self._atom is not NULL
    # second component of the coordinate array of the underlying atom
    return self._atom.x[1]
|
|
575
1054
|
|
|
576
1055
|
@property
def z(self):
    """`float`: The atom coordinate in the 3rd dimension.
    """
    assert self._atom is not NULL
    # third component of the coordinate array of the underlying atom
    return self._atom.x[2]
|
|
580
1061
|
|
|
@@ -736,18 +1217,19 @@ cdef class TemplateAtom:
|
|
|
736
1217
|
_name = bytearray(name, 'ascii')
|
|
737
1218
|
else:
|
|
738
1219
|
_name = bytearray(name)
|
|
1220
|
+
# FIXME: is alignment proper?
|
|
739
1221
|
if len(_name) > 4:
|
|
740
1222
|
raise ValueError(f"Invalid atom name: {name!r}")
|
|
741
|
-
elif len(_name)
|
|
1223
|
+
elif len(_name) <= 3:
|
|
742
1224
|
_name.insert(0, ord('_'))
|
|
743
|
-
|
|
1225
|
+
encode_token(self._atom.name[m], _name.ljust(4, b'\0'), 4)
|
|
744
1226
|
|
|
745
1227
|
# copy residue names
|
|
746
1228
|
for m, name in enumerate(residue_names):
|
|
747
1229
|
_name = name.encode('ascii') if isinstance(name, str) else name
|
|
748
1230
|
if len(_name) > 3:
|
|
749
1231
|
raise ValueError(f"Invalid residue name: {name!r}")
|
|
750
|
-
|
|
1232
|
+
encode_token(self._atom.resName[m], _name.ljust(3, b'\0'), 3)
|
|
751
1233
|
|
|
752
1234
|
cdef dict _state(self):
|
|
753
1235
|
return {
|
|
@@ -821,7 +1303,7 @@ cdef class TemplateAtom:
|
|
|
821
1303
|
assert self._atom is not NULL
|
|
822
1304
|
cdef char c1 = jess.tess_atom.TessAtom_chainID1(self._atom)
|
|
823
1305
|
cdef char c2 = jess.tess_atom.TessAtom_chainID2(self._atom)
|
|
824
|
-
return "
|
|
1306
|
+
return PyUnicode_FromFormat("%c%c", c1, c2).strip()
|
|
825
1307
|
|
|
826
1308
|
@property
|
|
827
1309
|
def x(self):
|
|
@@ -895,7 +1377,10 @@ cdef class TemplateAtom:
|
|
|
895
1377
|
.. versionadded:: 0.4.0
|
|
896
1378
|
|
|
897
1379
|
"""
|
|
898
|
-
|
|
1380
|
+
cdef TemplateAtom atom = TemplateAtom.__new__(TemplateAtom)
|
|
1381
|
+
with nogil:
|
|
1382
|
+
atom._atom = jess.tess_atom.TessAtom_copy(self._atom)
|
|
1383
|
+
return atom
|
|
899
1384
|
|
|
900
1385
|
|
|
901
1386
|
cdef class Template:
|
|
@@ -948,12 +1433,13 @@ cdef class Template:
|
|
|
948
1433
|
`~pyjess.Template`: The template parsed from the given file.
|
|
949
1434
|
|
|
950
1435
|
"""
|
|
1436
|
+
cdef str line
|
|
1437
|
+
cdef list atoms = []
|
|
951
1438
|
try:
|
|
952
1439
|
handle = open(file)
|
|
953
1440
|
except TypeError:
|
|
954
1441
|
handle = nullcontext(file)
|
|
955
1442
|
with handle as f:
|
|
956
|
-
atoms = []
|
|
957
1443
|
for line in f:
|
|
958
1444
|
if line.startswith("ATOM"):
|
|
959
1445
|
atoms.append(TemplateAtom.loads(line))
|
|
@@ -1050,7 +1536,14 @@ cdef class Template:
|
|
|
1050
1536
|
self._tess.distance[j][i] = dist
|
|
1051
1537
|
|
|
1052
1538
|
# compute dimension
|
|
1053
|
-
residues = {
|
|
1539
|
+
residues = {
|
|
1540
|
+
(
|
|
1541
|
+
self._tess.atom[i].resSeq ,
|
|
1542
|
+
self._tess.atom[i].chainID1,
|
|
1543
|
+
self._tess.atom[i].chainID2,
|
|
1544
|
+
)
|
|
1545
|
+
for i in range(count)
|
|
1546
|
+
}
|
|
1054
1547
|
self._tess.dim = len(residues)
|
|
1055
1548
|
|
|
1056
1549
|
def __copy__(self):
|
|
@@ -1133,6 +1626,8 @@ cdef class Template:
|
|
|
1133
1626
|
|
|
1134
1627
|
@property
|
|
1135
1628
|
def id(self):
|
|
1629
|
+
"""`str` or `None`: An identifier for the template, if any.
|
|
1630
|
+
"""
|
|
1136
1631
|
assert self._tpl is not NULL
|
|
1137
1632
|
|
|
1138
1633
|
cdef const char* name = self._tpl.name(self._tpl)
|
|
@@ -1148,11 +1643,18 @@ cdef class Template:
|
|
|
1148
1643
|
return self._tess.dim
|
|
1149
1644
|
|
|
1150
1645
|
cpdef Template copy(self):
    """Create a copy of the template.

    Returns:
        `~pyjess.Template`: A new template object with identical
        attributes and a copy of the `TemplateAtom` it contains.

    Raises:
        `MemoryError`: When the native copy routine does not return
            a template.

    """
    # same precondition as the other accessors of this class
    assert self._tpl is not NULL

    cdef Template tpl = Template.__new__(Template)
    # the native copy does not touch Python objects, so the GIL can
    # be released while it runs
    with nogil:
        tpl._tpl = self._tpl.copy(self._tpl)
    # NOTE(review): assumes the `copy` callback reports failure by
    # returning NULL -- confirm against the C implementation.
    if tpl._tpl is NULL:
        raise MemoryError("failed to allocate template copy")
    # the Tess-specific view is taken at the address right after the
    # generic template header, so only derive it from a valid pointer
    tpl._tess = <_TessTemplate*> &tpl._tpl[1]
    return tpl
|
|
1156
1658
|
|
|
1157
1659
|
cdef class Query:
|
|
1158
1660
|
"""A query over templates with a given molecule.
|
|
@@ -1168,10 +1670,6 @@ cdef class Query:
|
|
|
1168
1670
|
the templates.
|
|
1169
1671
|
rmsd_threshold (`float`): The RMSD threshold for reporting
|
|
1170
1672
|
results.
|
|
1171
|
-
max_candidates (`int`): The maximum number of candidate hits
|
|
1172
|
-
to report.
|
|
1173
|
-
ignore_chain (`bool`): Whether to check or ignore the chain of
|
|
1174
|
-
the atoms to match.
|
|
1175
1673
|
best_match (`bool`): Whether the query will return only the
|
|
1176
1674
|
best match to each template.
|
|
1177
1675
|
|
|
@@ -1179,18 +1677,20 @@ cdef class Query:
|
|
|
1179
1677
|
cdef _JessQuery* _jq
|
|
1180
1678
|
cdef bint _partial
|
|
1181
1679
|
cdef int _candidates
|
|
1680
|
+
cdef uintptr_t _prev_tpl
|
|
1681
|
+
cdef int _max_candidates
|
|
1682
|
+
cdef _IgnoreType _ignore_chain
|
|
1182
1683
|
|
|
1183
1684
|
cdef readonly Jess jess
|
|
1184
1685
|
cdef readonly Molecule molecule
|
|
1185
|
-
cdef readonly bint ignore_chain
|
|
1186
1686
|
cdef readonly bint best_match
|
|
1187
1687
|
cdef readonly double rmsd_threshold
|
|
1188
|
-
cdef readonly int max_candidates
|
|
1189
1688
|
|
|
1190
1689
|
def __cinit__(self):
    # iteration bookkeeping: nothing to replay, no candidate counted
    # yet, and no template visited so far
    self._partial = False
    self._candidates = 0
    self._prev_tpl = 0
    # no native query handle is held at this point
    self._jq = NULL
|
|
1194
1694
|
|
|
1195
1695
|
def __dealloc__(self):
|
|
1196
1696
|
jess.jess.JessQuery_free(self._jq)
|
|
@@ -1198,11 +1698,48 @@ cdef class Query:
|
|
|
1198
1698
|
def __iter__(self):
    # a `Query` is its own iterator: iterating it returns the object itself
    return self
|
|
1200
1700
|
|
|
1701
|
+
@property
def ignore_chain(self):
    """`str` or `None`: How the chains of the atoms are taken into account.

    ``"residues"`` and ``"atoms"`` name the two relaxed modes, while
    `None` is reported when no chain information is ignored.
    """
    current = self._ignore_chain
    if current == _IgnoreType.ignoreResidues:
        return "residues"
    if current == _IgnoreType.ignoreAtoms:
        return "atoms"
    # `ignoreNone`, and any other value, is reported as `None`
    return None

@ignore_chain.setter
def ignore_chain(self, ignore_chain):
    # equality tests (rather than a lookup table) keep unhashable
    # values on the `ValueError` path
    if ignore_chain is None:
        self._ignore_chain = _IgnoreType.ignoreNone
        return
    if ignore_chain == "residues":
        self._ignore_chain = _IgnoreType.ignoreResidues
        return
    if ignore_chain == "atoms":
        self._ignore_chain = _IgnoreType.ignoreAtoms
        return
    raise ValueError(f"invalid value for `ignore_chain`: {ignore_chain!r}")
|
|
1722
|
+
|
|
1723
|
+
@property
def max_candidates(self):
    """`int` or `None`: The cap on candidate hits reported *by template*.

    `None` means that no cap is applied; it is stored internally
    as ``-1``.
    """
    if self._max_candidates == -1:
        return None
    return self._max_candidates

@max_candidates.setter
def max_candidates(self, max_candidates):
    # translate the public value into the internal representation
    # first, so that the attribute is only written on valid input
    if max_candidates is None:
        limit = -1
    elif max_candidates >= 0:
        limit = max_candidates
    else:
        raise ValueError(f"invalid value for `max_candidates` argument: {max_candidates!r}")
    self._max_candidates = limit
|
|
1737
|
+
|
|
1201
1738
|
cdef bint _advance(self) noexcept nogil:
    # Common case: nothing is pending, so ask the native query for
    # its next result, forwarding the chain handling mode.
    if not self._partial:
        return jess.jess.JessQuery_next(self._jq, self._ignore_chain)
    # `_rewind` raised the flag: clear it and report a result without
    # stepping the native query.
    self._partial = False
    return True
|
|
1206
1743
|
|
|
1207
1744
|
cdef bint _rewind(self) noexcept nogil:
|
|
1208
1745
|
self._partial = True
|
|
@@ -1246,10 +1783,11 @@ cdef class Query:
|
|
|
1246
1783
|
|
|
1247
1784
|
# search the next hit without the GIL to allow parallel queries.
|
|
1248
1785
|
with nogil:
|
|
1249
|
-
while self._advance()
|
|
1786
|
+
while self._advance():
|
|
1250
1787
|
# load current iteration template, and check that the hit
|
|
1251
1788
|
# was obtained with the current template and not with the
|
|
1252
1789
|
# previous one
|
|
1790
|
+
self._prev_tpl = <uintptr_t> tpl
|
|
1253
1791
|
tpl = jess.jess.JessQuery_template(self._jq)
|
|
1254
1792
|
if hit_found and hit_tpl != tpl:
|
|
1255
1793
|
self._rewind()
|
|
@@ -1276,10 +1814,10 @@ cdef class Query:
|
|
|
1276
1814
|
|
|
1277
1815
|
if nan:
|
|
1278
1816
|
with gil:
|
|
1279
|
-
|
|
1280
|
-
"Jess returned a superposition matrix with NaN values",
|
|
1817
|
+
PyErr_WarnEx(
|
|
1281
1818
|
UserWarning,
|
|
1282
|
-
|
|
1819
|
+
"Jess returned a superposition matrix with NaN values",
|
|
1820
|
+
2,
|
|
1283
1821
|
)
|
|
1284
1822
|
else:
|
|
1285
1823
|
self._copy_atoms(tpl, hit)
|
|
@@ -1288,9 +1826,21 @@ cdef class Query:
|
|
|
1288
1826
|
hit_tpl = tpl
|
|
1289
1827
|
hit_found = True
|
|
1290
1828
|
|
|
1291
|
-
#
|
|
1292
|
-
#
|
|
1293
|
-
|
|
1829
|
+
# check if we already made it to the next template,
|
|
1830
|
+
# or if we need to short-circuit the iteration and
|
|
1831
|
+
# force the query to move to the next template as
|
|
1832
|
+
# we found too many candidates already.
|
|
1833
|
+
if <uintptr_t> tpl != self._prev_tpl:
|
|
1834
|
+
self._candidates = 0
|
|
1835
|
+
else:
|
|
1836
|
+
self._candidates += 1
|
|
1837
|
+
if self._max_candidates != -1 and self._candidates > self._max_candidates:
|
|
1838
|
+
self._candidates = 0
|
|
1839
|
+
jess.jess.JessQuery_nextTemplate(self._jq)
|
|
1840
|
+
|
|
1841
|
+
# free superposition items (as relevant data was copied in
|
|
1842
|
+
# the Hit if needed) and return hits immediately if we are
|
|
1843
|
+
# not in best match mode
|
|
1294
1844
|
jess.super.Superposition_free(sup)
|
|
1295
1845
|
if hit_found and not self.best_match:
|
|
1296
1846
|
break
|
|
@@ -1357,6 +1907,18 @@ cdef class Hit:
|
|
|
1357
1907
|
for i, atom in enumerate(state["atoms"]):
|
|
1358
1908
|
memcpy(&self._atoms[i], atom._atom, sizeof(_Atom))
|
|
1359
1909
|
|
|
1910
|
+
cdef void _transform_atom(self, double* x, const double* src):
    # Store in `x` the image of `src` under the transformation of
    # this hit:
    #     x = centre[1] + rotation * (src - centre[0])
    # where `rotation` is read as nine values, three per row.
    cdef size_t row
    cdef size_t col
    cdef const double* rotation = self._rotation
    cdef const double* origin = self._centre[0]
    cdef const double* offset = self._centre[1]

    for row in range(3):
        # start from the translation, then accumulate the rotated
        # and re-centred input one term at a time
        x[row] = offset[row]
        for col in range(3):
            x[row] += rotation[3*row + col] * (src[col] - origin[col])
|
|
1921
|
+
|
|
1360
1922
|
@property
|
|
1361
1923
|
def determinant(self):
|
|
1362
1924
|
"""`float`: The determinant of the rotation matrix.
|
|
@@ -1425,15 +1987,11 @@ cdef class Hit:
|
|
|
1425
1987
|
if transform:
|
|
1426
1988
|
atom._atom = <_Atom*> malloc(sizeof(_Atom))
|
|
1427
1989
|
memcpy(atom._atom, &self._atoms[k], sizeof(_Atom))
|
|
1428
|
-
|
|
1429
|
-
atom._atom.x[i] = v[i]
|
|
1430
|
-
for j in range(3):
|
|
1431
|
-
atom._atom.x[i] += M[3*i + j] * (self._atoms[k].x[j] - c[j])
|
|
1990
|
+
self._transform_atom(atom._atom.x, self._atoms[k].x)
|
|
1432
1991
|
else:
|
|
1433
1992
|
atom.owned = True
|
|
1434
1993
|
atom.owner = self
|
|
1435
1994
|
atom._atom = &self._atoms[k]
|
|
1436
|
-
|
|
1437
1995
|
atoms.append(atom)
|
|
1438
1996
|
|
|
1439
1997
|
return atoms
|
|
@@ -1469,17 +2027,142 @@ cdef class Hit:
|
|
|
1469
2027
|
mol = self._molecule.copy()
|
|
1470
2028
|
for k in range(mol._mol.count):
|
|
1471
2029
|
atom = mol._mol.atom[k]
|
|
1472
|
-
|
|
1473
|
-
atom.x[i] = v[i]
|
|
1474
|
-
for j in range(3):
|
|
1475
|
-
atom.x[i] += M[3*i + j] * (self._molecule._mol.atom[k].x[j] - c[j])
|
|
2030
|
+
self._transform_atom(atom.x, self._molecule._mol.atom[k].x)
|
|
1476
2031
|
|
|
1477
2032
|
return mol
|
|
1478
2033
|
|
|
2034
|
+
cpdef str dumps(self, str format="pdb", bint transform=True):
    """Serialize the hit and return the result as a string.

    This is a convenience wrapper which runs `dump` on an in-memory
    text buffer.

    Arguments:
        format (`str`): The format in which to write the hit.
            Currently only supports ``pdb``, which writes the hits
            in the same format as Jess.
        transform (`bool`): Whether or not to transform coordinates
            of the molecule atoms into template frame.

    Returns:
        `str`: The text written by `dump`.

    Raises:
        `RuntimeError`: When attempting to dump a `Hit` which was
            obtained from a `Template` which has no `~Template.id`.

    .. versionadded:: 0.7.0

    """
    with io.StringIO() as buffer:
        self.dump(buffer, format=format, transform=transform)
        # read the contents back before the buffer gets closed
        return buffer.getvalue()
|
|
2054
|
+
|
|
2055
|
+
cpdef void dump(self, object file, str format="pdb", bint transform=True):
    """Write the hit to a file.

    Arguments:
        file (file-like object): A file opened in *text* mode where the
            hit will be written.
        format (`str`): The format in which to write the hit.
            Currently only supports ``pdb``, which writes the hits
            in the same format as Jess.
        transform (`bool`): Whether or not to transform coordinates
            of the molecule atoms into template frame.

    Raises:
        `ValueError`: When ``format`` is not a supported format.
        `RuntimeError`: When attempting to dump a `Hit` which was
            obtained from a `Template` which has no `~Template.id`.

    .. versionadded:: 0.7.0

    """
    assert self.template._tpl is not NULL
    assert self._molecule._mol is not NULL

    cdef _Atom* atom
    cdef size_t k
    cdef int n
    # The widths in the record format below are *minimum* widths: a
    # large coordinate, serial or residue number expands its field
    # (a finite double printed with `%f` can take over 300 characters),
    # so an 80-byte line buffer can be overrun. Size the buffer for the
    # worst-case expansion of every numeric field instead.
    # NOTE(review): a bounded `snprintf` would be preferable if it is
    # available from this module's cimports.
    cdef char[2048] buffer
    cdef char[5] name
    cdef char[5] resname
    cdef double[3] x
    cdef int count = self.template._tpl.count(self.template._tpl)

    # reject unsupported formats rather than silently writing PDB
    if format != "pdb":
        raise ValueError(f"invalid value for `format` argument: {format!r}")
    if self.template.id is None:
        raise RuntimeError("cannot dump `Hit` where `self.template.id` is `None`")

    # header line: molecule identifier, RMSD, template identifier,
    # determinant and log(E-value) of the hit
    file.write("REMARK ")
    file.write(self._molecule.id)
    file.write(f" {self.rmsd:5.3f} ")
    file.write(self.template.id)
    file.write(f" Det={self.determinant:4,.1f} log(E)~ {self.log_evalue:4.2f}\n")

    # one record per matched atom
    for k in range(count):
        atom = &self._atoms[k]
        decode_token(name, atom.name, 4)
        decode_token(resname, atom.resName, 3)
        if transform:
            self._transform_atom(x, atom.x)
        else:
            memcpy(x, atom.x, 3*sizeof(double))
        # only pass the arguments the format string consumes
        n = sprintf(
            buffer,
            "ATOM %5i%5s%c%-3s%c%c%4i%-4c%8.3f%8.3f%8.3f%6.2f%6.2f\n",
            atom.serial,
            name,
            atom.altLoc,
            resname,
            atom.chainID1,
            atom.chainID2,
            atom.resSeq,
            atom.iCode,
            x[0],
            x[1],
            x[2],
            atom.occupancy,
            atom.tempFactor
        )
        file.write(PyUnicode_FromStringAndSize(buffer, n))
    file.write("ENDMDL\n")
|
|
1479
2124
|
|
|
1480
2125
|
cdef class Jess:
|
|
1481
2126
|
"""A handle to run Jess over a list of templates.
|
|
1482
2127
|
|
|
2128
|
+
Example:
|
|
2129
|
+
Create a `Jess` object from a list of templates::
|
|
2130
|
+
|
|
2131
|
+
>>> t1 = Template.load("1.3.3.tpl")
|
|
2132
|
+
>>> t2 = Template.load("4.1.2.tpl")
|
|
2133
|
+
>>> jess = Jess([t1, t2])
|
|
2134
|
+
|
|
2135
|
+
Once initialized, the `Jess` object cannot be modified further.
|
|
2136
|
+
Use the `~Jess.query` method to query the templates with a
|
|
2137
|
+
molecule::
|
|
2138
|
+
|
|
2139
|
+
>>> molecule = Molecule.load("1AMY.pdb")
|
|
2140
|
+
>>> query = jess.query(molecule, 2, 2, 2)
|
|
2141
|
+
|
|
2142
|
+
The returned `Query` object is an iterator that can be
|
|
2143
|
+
advanced through a ``for`` loop, or with the `next` built-in
|
|
2144
|
+
function to get the first hit:
|
|
2145
|
+
|
|
2146
|
+
>>> hit = next(query)
|
|
2147
|
+
>>> hit.rmsd
|
|
2148
|
+
1.4386...
|
|
2149
|
+
|
|
2150
|
+
The hit can also be formatted in PDB format like in the
|
|
2151
|
+
original JESS code::
|
|
2152
|
+
|
|
2153
|
+
>>> print(hit.dumps(format="pdb"), end="")
|
|
2154
|
+
REMARK 1AMY 1.439 2om2 Det= 1.0 log(E)~ 1.11
|
|
2155
|
+
ATOM 729 CA THR A 94 34.202 -24.426 8.851 1.00 2.00
|
|
2156
|
+
ATOM 732 CB THR A 94 35.157 -23.467 8.101 1.00 4.66
|
|
2157
|
+
ATOM 733 OG1 THR A 94 36.338 -23.247 8.871 1.00 9.85
|
|
2158
|
+
ATOM 746 CD GLU A 96 41.454 -29.509 8.013 1.00 24.05
|
|
2159
|
+
ATOM 748 OE2 GLU A 96 42.536 -29.680 7.441 1.00 34.44
|
|
2160
|
+
ATOM 747 OE1 GLU A 96 41.212 -28.521 8.708 1.00 18.56
|
|
2161
|
+
ATOM 437 CZ ARG A 55 44.471 -26.619 10.181 1.00 8.51
|
|
2162
|
+
ATOM 436 NE ARG A 55 44.334 -27.346 11.290 1.00 9.05
|
|
2163
|
+
ATOM 438 NH1 ARG A 55 43.590 -26.751 9.179 1.00 13.17
|
|
2164
|
+
ENDMDL
|
|
2165
|
+
|
|
1483
2166
|
.. versionadded:: 0.4.0
|
|
1484
2167
|
Equality, hashing and pickle protocol support.
|
|
1485
2168
|
|
|
@@ -1581,8 +2264,8 @@ cdef class Jess:
|
|
|
1581
2264
|
double distance_cutoff,
|
|
1582
2265
|
double max_dynamic_distance,
|
|
1583
2266
|
*,
|
|
1584
|
-
|
|
1585
|
-
|
|
2267
|
+
object max_candidates = None,
|
|
2268
|
+
object ignore_chain = None,
|
|
1586
2269
|
bint best_match = False,
|
|
1587
2270
|
bint reorder = True,
|
|
1588
2271
|
):
|
|
@@ -1599,15 +2282,35 @@ cdef class Jess:
|
|
|
1599
2282
|
dynamic distance after adding the global distance cutoff
|
|
1600
2283
|
and the individual atom distance cutoff defined for each
|
|
1601
2284
|
atom of the template.
|
|
1602
|
-
|
|
1603
|
-
|
|
2285
|
+
max_candidates (`int` or `None`): The maximum number of candidate
|
|
2286
|
+
hits to report by template. If a non-`None` value is given,
|
|
2287
|
+
it may speed up querying for unspecific templates, but also
|
|
2288
|
+
produce results potentially inconsistent with Jess.
|
|
2289
|
+
ignore_chain (`str` or `None`): Whether to check or ignore the
|
|
2290
|
+
chain of the atoms to match. The different supported modes
|
|
2291
|
+
are:
|
|
2292
|
+
|
|
2293
|
+
- `None`: Force the atoms in the molecule to belong
|
|
2294
|
+
to different (resp. same) chains if so is the case
|
|
2295
|
+
in the template.
|
|
2296
|
+
- ``residues``: Allow atoms to belong to different
|
|
2297
|
+
(resp. same) chains even if it is not the case in
|
|
2298
|
+
the template, but force all atoms of a residue to
|
|
2299
|
+
belong to the same chain.
|
|
2300
|
+
- ``atoms``: Allow atoms to belong to any chain,
|
|
2301
|
+
independently to the template or the residue they
|
|
2302
|
+
belong to.
|
|
2303
|
+
|
|
1604
2304
|
best_match (`bool`): Pass `True` to return only the best match
|
|
1605
|
-
to each template.
|
|
2305
|
+
to each template, based on RMSD. In case of ties, the
|
|
2306
|
+
first match is returned. Note that a match must still
|
|
2307
|
+
be passing the RMSD threshold given in ``rmsd_threshold``
|
|
2308
|
+
to be returned.
|
|
1606
2309
|
reorder (`bool`): Whether to enable template atom reordering
|
|
1607
|
-
to accelerate matching in the scanner algorithm. Pass
|
|
2310
|
+
to accelerate matching in the scanner algorithm. Pass
|
|
1608
2311
|
`False` to revert to the original, slower algorithm
|
|
1609
2312
|
which matches atoms in the same order as they appear in
|
|
1610
|
-
the template, at the cost
|
|
2313
|
+
the template, at the cost of longer run times.
|
|
1611
2314
|
|
|
1612
2315
|
Returns:
|
|
1613
2316
|
`~pyjess.Query`: An iterator over the query hits.
|
|
@@ -1615,19 +2318,45 @@ cdef class Jess:
|
|
|
1615
2318
|
Caution:
|
|
1616
2319
|
Since ``v0.6.0``, this function uses an optimized variant of
|
|
1617
2320
|
the Jess scanning algorithm which minimizes the number of steps
|
|
1618
|
-
needed to generate matches, by re-ordering the order the
|
|
2321
|
+
needed to generate matches, by re-ordering the order the
|
|
1619
2322
|
template atoms are iterated upon. Because of this change,
|
|
1620
|
-
the query may return *exactly* the same matches but in an order
|
|
2323
|
+
the query may return *exactly* the same matches but in an order
|
|
1621
2324
|
that *differs* from the original Jess version. If you really
|
|
1622
2325
|
need results in the original order, set ``reorder`` to `False`.
|
|
1623
2326
|
|
|
1624
2327
|
.. versionadded:: 0.6.0
|
|
1625
2328
|
The ``reorder`` argument, defaulting to `True`.
|
|
1626
2329
|
|
|
2330
|
+
.. versionchanged:: 0.7.0
|
|
2331
|
+
Default value of ``max_candidates`` argument to `None`.
|
|
2332
|
+
|
|
2333
|
+
.. versionchanged:: 0.7.0
|
|
2334
|
+
``ignore_chain`` now expects string variants rather than `bool`.
|
|
2335
|
+
|
|
1627
2336
|
"""
|
|
2337
|
+
|
|
2338
|
+
if ignore_chain is True:
|
|
2339
|
+
PyErr_WarnEx(
|
|
2340
|
+
DeprecationWarning,
|
|
2341
|
+
"`ignore_chain` parameter expects string parameters "
|
|
2342
|
+
"to specificy the mode since PyJess v0.7.0. "
|
|
2343
|
+
"Use `ignore_chain='atoms'` instead of `ignore_chain=True`",
|
|
2344
|
+
2,
|
|
2345
|
+
)
|
|
2346
|
+
ignore_chain="atoms"
|
|
2347
|
+
elif ignore_chain is False:
|
|
2348
|
+
PyErr_WarnEx(
|
|
2349
|
+
DeprecationWarning,
|
|
2350
|
+
"`ignore_chain` parameter expects string parameters "
|
|
2351
|
+
"to specificy the mode since PyJess v0.7.0. "
|
|
2352
|
+
"Use `ignore_chain=None` instead of `ignore_chain=False`",
|
|
2353
|
+
2,
|
|
2354
|
+
)
|
|
2355
|
+
ignore_chain=None
|
|
2356
|
+
|
|
1628
2357
|
cdef Query query = Query.__new__(Query)
|
|
1629
|
-
query.ignore_chain = ignore_chain
|
|
1630
2358
|
query.max_candidates = max_candidates
|
|
2359
|
+
query.ignore_chain = ignore_chain
|
|
1631
2360
|
query.rmsd_threshold = rmsd_threshold
|
|
1632
2361
|
query.best_match = best_match
|
|
1633
2362
|
query.molecule = molecule
|