pyjess 0.5.2__pp310-pypy310_pp73-win_amd64.whl → 0.7.0__pp310-pypy310_pp73-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pyjess might be problematic. Click here for more details.
- pyjess/__main__.py +4 -0
- pyjess/_jess.pyi +53 -9
- pyjess/_jess.pypy310-pp73-win_amd64.pyd +0 -0
- pyjess/_jess.pyx +855 -105
- pyjess/cli.py +281 -0
- pyjess/tests/__init__.py +2 -0
- pyjess/tests/data/1AMY.cif +6259 -0
- pyjess/tests/data/1sur.qry +26 -0
- pyjess/tests/data/4.1.2.tpl +23 -0
- pyjess/tests/data/5ayx.EF.pdb +63 -0
- pyjess/tests/test_doctest.py +78 -0
- pyjess/tests/test_hit.py +26 -2
- pyjess/tests/test_jess.py +124 -3
- pyjess/tests/test_molecule.py +146 -0
- pyjess/tests/test_template.py +10 -1
- {pyjess-0.5.2.dist-info → pyjess-0.7.0.dist-info}/METADATA +76 -15
- pyjess-0.7.0.dist-info/RECORD +34 -0
- pyjess-0.7.0.dist-info/entry_points.txt +3 -0
- pyjess-0.5.2.dist-info/RECORD +0 -26
- {pyjess-0.5.2.dist-info → pyjess-0.7.0.dist-info}/WHEEL +0 -0
- {pyjess-0.5.2.dist-info → pyjess-0.7.0.dist-info}/licenses/COPYING +0 -0
pyjess/_jess.pyx
CHANGED
|
@@ -2,6 +2,73 @@
|
|
|
2
2
|
# cython: language_level=3, linetrace=True, binding=True
|
|
3
3
|
"""Bindings to Jess, a 3D template matching software.
|
|
4
4
|
|
|
5
|
+
Jess is an algorithm for constraint-based structural template matching
|
|
6
|
+
proposed by Jonathan Barker *et al.*. It can be used to identify
|
|
7
|
+
catalytic residues from a known template inside a protein structure.
|
|
8
|
+
Jess is an evolution of TESS, a geometric hashing algorithm developed by
|
|
9
|
+
Andrew Wallace *et al.*, removing some pre-computation and
|
|
10
|
+
structural requirements from the original algorithm.
|
|
11
|
+
|
|
12
|
+
PyJess is a Python module that provides bindings to Jess using
|
|
13
|
+
`Cython <https://cython.org/>`_. It allows creating templates, querying
|
|
14
|
+
them with protein structures, and retrieving the hits using a Python API
|
|
15
|
+
without performing any external I/O. It's also more than 10x faster than
|
|
16
|
+
Jess thanks to algorithmic optimizations added to improve the original Jess
|
|
17
|
+
code while producing consistent results.
|
|
18
|
+
|
|
19
|
+
Example:
|
|
20
|
+
Load templates from a file, either as a file-like object or
|
|
21
|
+
given a filename::
|
|
22
|
+
|
|
23
|
+
>>> t1 = pyjess.Template.load("1.3.3.tpl") # load from filename
|
|
24
|
+
>>> with open("4.1.2.tpl") as f: # load from a file object
|
|
25
|
+
... t2 = pyjess.Template.load(f)
|
|
26
|
+
|
|
27
|
+
Load molecules from a file, either as a file-like object or given
|
|
28
|
+
a filename::
|
|
29
|
+
|
|
30
|
+
>>> mol = pyjess.Molecule.load("1AMY.pdb")
|
|
31
|
+
>>> mol[0]
|
|
32
|
+
Atom(serial=1, name='N', altloc=' ', residue_name='GLN', ...)
|
|
33
|
+
|
|
34
|
+
Create a `Jess` object storing the templates to support running
|
|
35
|
+
queries on them. The individual templates can still be accessed by
|
|
36
|
+
index::
|
|
37
|
+
|
|
38
|
+
>>> jess = pyjess.Jess([t1, t2])
|
|
39
|
+
>>> jess[0].id
|
|
40
|
+
'3r6v'
|
|
41
|
+
|
|
42
|
+
Run a query on the Jess object to retrieve all templates matching
|
|
43
|
+
a `Molecule`, *in no particular order*::
|
|
44
|
+
|
|
45
|
+
>>> hits = jess.query(mol, 2, 2, 2)
|
|
46
|
+
>>> for hit in hits:
|
|
47
|
+
... print(hit.template.id, hit.rmsd)
|
|
48
|
+
2om2 1.4386...
|
|
49
|
+
2om2 1.4877...
|
|
50
|
+
2om2 1.4376...
|
|
51
|
+
2om2 1.5284...
|
|
52
|
+
2om2 1.4863...
|
|
53
|
+
2om2 1.4369...
|
|
54
|
+
2om2 1.4790...
|
|
55
|
+
2om2 1.1414...
|
|
56
|
+
2om2 1.0755...
|
|
57
|
+
2om2 1.1973...
|
|
58
|
+
2om2 1.1353...
|
|
59
|
+
2om2 1.0711...
|
|
60
|
+
2om2 1.1494...
|
|
61
|
+
|
|
62
|
+
By default, a template can match a molecule in more than one way,
|
|
63
|
+
if several sets of atoms match the geometric constraints. Use the
|
|
64
|
+
``best_match`` argument of `~Jess.query` to only retrieve the
|
|
65
|
+
best match per template::
|
|
66
|
+
|
|
67
|
+
>>> hits = jess.query(mol, 2, 2, 2, best_match=True)
|
|
68
|
+
>>> for hit in hits:
|
|
69
|
+
... print(hit.template.id, hit.rmsd)
|
|
70
|
+
2om2 1.071...
|
|
71
|
+
|
|
5
72
|
References:
|
|
6
73
|
- Barker, J. A., & Thornton, J. M. (2003). *An algorithm for
|
|
7
74
|
constraint-based structural template matching: application to
|
|
@@ -18,10 +85,16 @@ References:
|
|
|
18
85
|
# --- C imports --------------------------------------------------------------
|
|
19
86
|
|
|
20
87
|
cimport cython
|
|
21
|
-
from cpython.
|
|
88
|
+
from cpython.exc cimport PyErr_WarnEx
|
|
89
|
+
from cpython.unicode cimport (
|
|
90
|
+
PyUnicode_FromStringAndSize,
|
|
91
|
+
PyUnicode_FromFormat,
|
|
92
|
+
PyUnicode_AsASCIIString,
|
|
93
|
+
)
|
|
22
94
|
|
|
23
95
|
from libc.math cimport isnan, exp, INFINITY, NAN
|
|
24
|
-
from libc.stdio cimport FILE, fclose, fdopen, printf
|
|
96
|
+
from libc.stdio cimport FILE, fclose, fdopen, printf, sprintf
|
|
97
|
+
from libc.stdint cimport uintptr_t
|
|
25
98
|
from libc.stdlib cimport calloc, realloc, free, malloc
|
|
26
99
|
from libc.string cimport memcpy, memset, strncpy, strdup
|
|
27
100
|
|
|
@@ -37,24 +110,20 @@ from jess.jess cimport Jess as _Jess
|
|
|
37
110
|
from jess.jess cimport JessQuery as _JessQuery
|
|
38
111
|
from jess.molecule cimport Molecule as _Molecule
|
|
39
112
|
from jess.super cimport Superposition as _Superposition
|
|
40
|
-
from jess.template cimport Template as _Template
|
|
113
|
+
from jess.template cimport Template as _Template, IgnoreType as _IgnoreType
|
|
41
114
|
from jess.tess_template cimport TessTemplate as _TessTemplate
|
|
42
115
|
from jess.tess_atom cimport TessAtom as _TessAtom
|
|
43
116
|
|
|
44
117
|
# --- Python imports ---------------------------------------------------------
|
|
45
118
|
|
|
46
|
-
import contextlib
|
|
47
119
|
import functools
|
|
48
120
|
import io
|
|
49
|
-
import itertools
|
|
50
|
-
import os
|
|
51
|
-
import warnings
|
|
52
121
|
|
|
53
122
|
__version__ = PROJECT_VERSION
|
|
54
123
|
|
|
55
124
|
# --- Utils ------------------------------------------------------------------
|
|
56
125
|
|
|
57
|
-
cdef inline void
|
|
126
|
+
cdef inline void encode_token(char* dst, const char* src, size_t n) noexcept nogil:
|
|
58
127
|
cdef size_t i
|
|
59
128
|
for i in range(n):
|
|
60
129
|
if src[i] == ord(' ') or src[i] == 0:
|
|
@@ -63,12 +132,155 @@ cdef inline void copy_token(char* dst, const char* src, size_t n) noexcept nogil
|
|
|
63
132
|
dst[i] = src[i]
|
|
64
133
|
dst[n] = 0
|
|
65
134
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
135
|
+
cdef inline void decode_token(char* dst, const char* src, size_t n) noexcept nogil:
    # Copy `n` bytes from `src` into `dst`, writing a space wherever the
    # source holds an underscore or a NUL byte, then NUL-terminate the
    # result. `dst` must therefore have room for `n + 1` bytes.
    cdef size_t pos
    cdef char c
    for pos in range(n):
        c = src[pos]
        if c != ord('_') and c != 0:
            dst[pos] = c
        else:
            dst[pos] = ord(' ')
    dst[n] = 0
|
|
143
|
+
|
|
144
|
+
class nullcontext:
    """A do-nothing context manager handing back a fixed value.

    Used to wrap an already-open file-like object so it can be used in a
    ``with`` statement the same way as a freshly opened file, without
    closing it on exit.
    """

    def __init__(self, return_value=None):
        # value returned by `__enter__`, i.e. bound by ``with ... as``
        self.retval = return_value

    def __enter__(self):
        return self.retval

    def __exit__(self, exc_type, exc_value, traceback):
        # returning `False` lets any exception raised in the body propagate
        return False
|
|
69
151
|
|
|
70
152
|
# --- Classes ----------------------------------------------------------------
|
|
71
153
|
|
|
154
|
+
cdef class _MoleculeParser:
    """Base class of the molecule parsers.

    It only stores the identifier that subclasses give to the molecules
    they build (`None` meaning that the subclass picks one itself).
    """

    # identifier forced onto parsed molecules, or `None`
    cdef str id

    def __init__(self, str id = None):
        self.id = id
|
|
159
|
+
|
|
160
|
+
cdef class _PDBMoleculeParser(_MoleculeParser):
    """A line-based parser building molecules from PDB records."""

    # when set, keep reading atoms past the first ``ENDMDL`` record
    cdef bint ignore_endmdl
    # when set, ``HETATM`` records are not turned into atoms
    cdef bint skip_hetatm

    def __init__(self, str id = None, bint ignore_endmdl = False, bint skip_hetatm = False):
        super().__init__(id=id)
        self.ignore_endmdl = ignore_endmdl
        self.skip_hetatm = skip_hetatm

    def loads(self, text, molecule_type):
        """Parse a molecule from a string by wrapping it in a text buffer."""
        return self.load(io.StringIO(text), molecule_type)

    def load(self, file, molecule_type):
        """Parse a molecule from a path or from a text-mode file object.

        Arguments:
            file: Either something `open` accepts, or an iterable of
                lines (used as-is, and not closed, when `open` rejects
                it with a `TypeError`).
            molecule_type: The callable invoked as
                ``molecule_type(atoms, id=id)`` to build the result.

        Raises:
            ValueError: When a line starting with an mmCIF tag
                (``data_`` or ``loop_``) is encountered.

        """
        cdef str record
        cdef str molecule_id = self.id
        cdef bint keep_hetatm = not self.skip_hetatm
        cdef list parsed = []

        try:
            source = open(file)
        except TypeError:
            source = nullcontext(file)

        with source as lines:
            for record in lines:
                if record.startswith("HEADER"):
                    # only fall back to the header when no identifier was given
                    if molecule_id is None:
                        molecule_id = record[62:66].strip() or None
                elif record.startswith("ATOM") or (keep_hetatm and record.startswith("HETATM")):
                    parsed.append(Atom.loads(record))
                elif record.startswith("ENDMDL"):
                    if not self.ignore_endmdl:
                        break
                elif record.lower().startswith(("data_", "loop_")):
                    raise ValueError("mmCIF data tags found, file is not in PDB format")

        return molecule_type(parsed, id=molecule_id)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
cdef class _CIFMoleculeParser(_MoleculeParser):
    """A parser building molecules from the ``_atom_site`` table of a CIF block.

    The `gemmi` module is imported when the parser is instantiated.
    """

    cdef object gemmi
    # when set, read the ``auth_*`` columns instead of the ``label_*`` ones
    cdef bint use_author
    # when set, ``HETATM`` rows are not turned into atoms
    cdef bint skip_hetatm

    # Column order matters: rows are indexed positionally in `_load_block`
    # (6 is the residue number, 7 the insertion code, 13 the formal charge
    # and 14 the record group). The ``?`` prefix presumably marks columns
    # that gemmi treats as optional; the code checks `row.has` for two of them.
    _PRIMARY_COLUMNS = [
        'id', 'type_symbol', 'label_atom_id', 'label_alt_id', 'label_comp_id',
        'label_asym_id', 'label_seq_id', '?pdbx_PDB_ins_code', 'Cartn_x',
        'Cartn_y', 'Cartn_z', 'occupancy', 'B_iso_or_equiv',
        '?pdbx_formal_charge', '?group_PDB',
    ]

    _AUTH_COLUMNS = [
        'id', 'type_symbol', 'auth_atom_id', 'label_alt_id', 'auth_comp_id',
        'auth_asym_id', 'auth_seq_id', '?pdbx_PDB_ins_code', 'Cartn_x',
        'Cartn_y', 'Cartn_z', 'occupancy', 'B_iso_or_equiv',
        '?pdbx_formal_charge', '?group_PDB',
    ]

    def __init__(self, str id = None, bint use_author = False, bint skip_hetatm = False):
        super().__init__(id=id)
        self.gemmi = __import__('gemmi')
        self.use_author = use_author
        self.skip_hetatm = skip_hetatm

    def _load_block(self, document, molecule_type):
        """Build a molecule from the sole block of a parsed CIF document.

        Raises:
            ValueError: When the ``_atom_site`` table lookup yields nothing.

        """
        block = document.sole_block()
        if self.use_author:
            columns = self._AUTH_COLUMNS
        else:
            columns = self._PRIMARY_COLUMNS
        table = block.find('_atom_site.', columns)
        if not table:
            raise ValueError("missing columns in CIF files")

        # highest residue number seen so far, used to number the
        # heteroatoms that come without a residue number
        highest_number = 0
        atoms = []
        for row in table:
            group = row[14]
            if group == "HETATM":
                if self.skip_hetatm:
                    continue
            elif group != "ATOM":
                continue

            if group == "HETATM" and row[6] == ".":
                PyErr_WarnEx(
                    UserWarning,
                    "HETATM line found without residue number. Consider "
                    "parsing with use_author=True to use author-defined "
                    "residue numbers, or skip_hetatm=True to disable "
                    "parsing of HETATM altogether.",
                    3,
                )
                residue_number = highest_number
                highest_number += 1
            else:
                residue_number = int(row[6])
                highest_number = max(residue_number, highest_number)

            atoms.append(
                Atom(
                    serial=int(row[0]),
                    element=row[1],
                    name=row[2],
                    altloc=' ' if row[3] == "." else row[3], # FIXME: replace with None?
                    residue_name=row[4],
                    chain_id=row[5],
                    residue_number=residue_number,
                    insertion_code=' ' if not row.has(7) or row[7] == "?" else row[7],
                    x=float(row[8]),
                    y=float(row[9]),
                    z=float(row[10]),
                    occupancy=0.0 if row[11] == '.' else float(row[11]),
                    temperature_factor=float(row[12]),
                    charge=0 if not row.has(13) or row[13] == "?" else int(row[13]),
                )
            )

        # an explicit identifier takes precedence over the block name
        id = block.name if self.id is None else self.id
        return molecule_type(atoms, id=id)

    def loads(self, text, molecule_type):
        """Parse a molecule from a string containing a CIF document."""
        document = self.gemmi.cif.read_string(text)
        return self._load_block(document, molecule_type)

    def load(self, file, molecule_type):
        """Parse a molecule from a path, or from an object with a ``read`` method."""
        if not hasattr(file, "read"):
            document = self.gemmi.cif.read_file(file)
        else:
            document = self.gemmi.cif.read_string(file.read())
        return self._load_block(document, molecule_type)
|
|
282
|
+
|
|
283
|
+
|
|
72
284
|
cdef class Molecule:
|
|
73
285
|
"""A molecule structure, as a sequence of `Atom` objects.
|
|
74
286
|
|
|
@@ -83,20 +295,40 @@ cdef class Molecule:
|
|
|
83
295
|
cdef str _id
|
|
84
296
|
|
|
85
297
|
@classmethod
def loads(
    cls,
    text,
    str format = "pdb",
    *,
    str id = None,
    bint ignore_endmdl = False,
    bint use_author = False,
    bint skip_hetatm = False,
):
    """Load a molecule from a string.

    Arguments:
        text (`str`): The serialized molecule to parse into a new
            object.
        format (`str`): The format to parse the file. Supported formats
            are: ``pdb`` for the Protein Data Bank format (the
            default), ``cif`` for Crystallographic Information File
            format (additionally requires the `gemmi` module), or
            ``detect`` to attempt auto-detection.

    Keyword Arguments:
        id (`str`, optional): The identifier of the molecule. If `None`
            given, the parser will attempt to extract it from the
            ``HEADER`` line (for PDB files) or the block name (for CIF
            files).
        ignore_endmdl (`bool`): Pass `True` to make the parser read all
            the atoms from the PDB file. By default, the parser only
            reads the atoms of the first model, and stops at the first
            ``ENDMDL`` line. *Ignored for CIF files*.
        use_author (`bool`): Pass `True` to use the author-defined
            labels while parsing CIF files, e.g. read the chain name
            from ``_atom_site.auth_asym_id`` rather than
            ``_atom_site.label_asym_id``. *Ignored for PDB files*.
        skip_hetatm (`bool`): Pass `True` to skip parsing of heteroatoms
            (``HETATM``) in the input.

    Returns:
        `~pyjess.Molecule`: The molecule parsed from the given string.

    See Also:
        `Molecule.load` to load a molecule from a file-like
        object or from a path.

    Caution:
        HETATM records of mmCIF input may come without a residue
        number; the parser then emits a warning and assigns a number
        itself, which can throw off the way atoms are modeled in
        Jess. Consider ``use_author=True`` or ``skip_hetatm=True``
        in that case.

    .. versionadded:: 0.7.0
        The ``format`` argument, and support for CIF parsing.

    """
    if format == "detect":
        # leading whitespace is ignored when sniffing for mmCIF tags
        format = "cif" if text.lstrip().startswith(("data_", "loop_")) else "pdb"
    return cls.load(
        io.StringIO(text),
        format=format,
        id=id,
        ignore_endmdl=ignore_endmdl,
        # forwarded so that CIF strings honour author-defined labels
        use_author=use_author,
        skip_hetatm=skip_hetatm,
    )
|
|
110
360
|
|
|
111
361
|
@classmethod
def load(
    cls,
    file,
    str format = "detect",
    *,
    str id = None,
    bint ignore_endmdl = False,
    bint use_author = False,
    bint skip_hetatm = False,
):
    """Load a molecule from a file.

    Arguments:
        file (`str`, `os.PathLike`, or file-like object): Either the path
            to a file, or a file-like object opened in **text mode**
            containing a molecule.
        format (`str`): The format to parse the file. Supported formats
            are: ``pdb`` for the Protein Data Bank format, ``cif``
            for Crystallographic Information File format (additionally
            requires the `gemmi` module), or ``detect`` to attempt
            auto-detection (the default).

    Keyword Arguments:
        id (`str`, optional): The identifier of the molecule. If `None`
            given, the parser will attempt to extract it from the
            ``HEADER`` line (for PDB files) or the block name (for CIF
            files).
        ignore_endmdl (`bool`): Pass `True` to make the parser read all
            the atoms from the PDB file. By default, the parser only
            reads the atoms of the first model, and stops at the first
            ``ENDMDL`` line. *Ignored for CIF files*.
        use_author (`bool`): Pass `True` to use the author-defined
            labels while parsing CIF files, e.g. read the chain name
            from ``_atom_site.auth_asym_id`` rather than
            ``_atom_site.label_asym_id``. *Ignored for PDB files*.
        skip_hetatm (`bool`): Pass `True` to skip parsing of heteroatoms
            (``HETATM``) in the input file.

    Returns:
        `~pyjess.Molecule`: The molecule parsed from the given file.

    Raises:
        ValueError: When ``format`` is not one of ``pdb``, ``cif``
            or ``detect``.

    See Also:
        `Molecule.loads` to load a molecule from a string.

    Caution:
        HETATM records of mmCIF input may come without a residue
        number; the parser then emits a warning and assigns a number
        itself, which can throw off the way atoms are modeled in
        Jess. Consider ``use_author=True`` or ``skip_hetatm=True``
        in that case.

    .. versionadded:: 0.7.0
        The ``format`` and ``skip_hetatm`` arguments, and mmCIF support.

    """
    cdef _MoleculeParser parser
    cdef str peek

    if format == "detect":
        # `open` raises `TypeError` for file-like objects, which are then
        # used directly (and left open) through a no-op context manager
        try:
            handle = open(file)
        except TypeError:
            handle = nullcontext(file)
        with handle as f:
            if f.seekable():
                peek = f.read(5)
                f.seek(0)
            else:
                # cannot rewind: slurp the whole stream and sniff the
                # *first* five characters of the text
                f = f.read()
                peek = f[:5]
            if peek.startswith(("data_", "loop_")):
                parser = _CIFMoleculeParser(
                    id=id,
                    use_author=use_author,
                    skip_hetatm=skip_hetatm,
                )
            else:
                parser = _PDBMoleculeParser(
                    id=id,
                    ignore_endmdl=ignore_endmdl,
                    skip_hetatm=skip_hetatm,
                )
            # `f` is a string only when the stream had to be read eagerly
            if isinstance(f, str):
                return parser.loads(f, molecule_type=cls)
            return parser.load(f, molecule_type=cls)

    if format == "pdb":
        parser = _PDBMoleculeParser(
            id=id,
            ignore_endmdl=ignore_endmdl,
            skip_hetatm=skip_hetatm
        )
    elif format == "cif":
        parser = _CIFMoleculeParser(
            id=id,
            use_author=use_author,
            skip_hetatm=skip_hetatm,
        )
    else:
        raise ValueError(f"invalid value for `format` argument: {format!r}")
    return parser.load(file, molecule_type=cls)
|
|
462
|
+
|
|
463
|
+
@classmethod
def from_biopython(cls, object structure, str id = None):
    """Create a new `~pyjess.Molecule` from a `Bio.PDB.Structure`.

    Arguments:
        structure (`Bio.PDB.Structure` or `Bio.PDB.Model`): The
            Biopython object containing the structure data.
        id (`str` or `None`): The identifier to give to the newly
            created molecule. If `None` given, will use the value of
            ``structure.id``.

    Returns:
        `~pyjess.Molecule`: A molecule object suitable for using
        in `Jess.query`.

    .. versionadded:: 0.7.0

    """
    cdef list atoms = []
    for c in structure.get_chains():
        for r in c.get_residues():
            # the first field of the residue identifier is not used here
            _, residue_number, insertion_code = r.id
            for a in r.get_atoms():
                coord = a.get_coord()
                atom = Atom(
                    name=a.fullname,
                    x=coord[0],
                    y=coord[1],
                    z=coord[2],
                    altloc=a.altloc,
                    charge=a.pqr_charge or 0,
                    occupancy=a.occupancy,
                    serial=a.serial_number,
                    residue_name=r.resname,
                    residue_number=residue_number,
                    segment=r.segid,
                    insertion_code=insertion_code,
                    chain_id=c.id,
                    temperature_factor=a.bfactor,
                    element=a.element,
                )
                atoms.append(atom)
    # an explicit identifier takes precedence over the structure's own
    return cls(atoms, id=structure.id if id is None else id)
|
|
506
|
+
|
|
507
|
+
@classmethod
def from_gemmi(cls, object model, str id=None):
    """Create a new `~pyjess.Molecule` from a `gemmi.Model`.

    Arguments:
        model (`gemmi.Model`): The ``gemmi`` object containing
            the structure data.
        id (`str` or `None`): The identifier to give to the newly
            created molecule.

    Returns:
        `~pyjess.Molecule`: A molecule object suitable for using
        in `Jess.query`.

    .. versionadded:: 0.7.0

    """
    cdef list atoms = []
    for cra in model.all():
        source = cra.atom
        residue = cra.residue
        chain = cra.chain
        position = source.pos
        seqid = residue.seqid
        # a NUL alternate location is mapped to a blank
        altloc = source.altloc
        if altloc == '\0':
            altloc = ' '
        atoms.append(
            Atom(
                name=source.padded_name(),
                x=position[0],
                y=position[1],
                z=position[2],
                altloc=altloc,
                charge=source.charge,
                element=source.element.name.upper(),
                occupancy=source.occ,
                temperature_factor=source.b_iso,
                serial=source.serial,
                segment=residue.segment,
                residue_name=residue.name,
                residue_number=seqid.num,
                chain_id=chain.name,
                insertion_code=seqid.icode,
            )
        )
    return cls(atoms, id=id)
|
|
147
548
|
|
|
549
|
+
@classmethod
def from_biotite(cls, object atom_array, str id=None):
    """Create a new `~pyjess.Molecule` from a `biotite.structure.AtomArray`.

    Arguments:
        atom_array (`biotite.structure.AtomArray`): The ``biotite``
            object containing the structure data.
        id (`str` or `None`): The identifier to give to the newly
            created molecule.

    Returns:
        `~pyjess.Molecule`: A molecule object suitable for using
        in `Jess.query`.

    Caution:
        If loading data with the `biotite.structure.io.pdb.PDBFile` module,
        ensure that you are requesting all atoms and all extra fields
        in `~biotite.structure.io.pdb.PDBFile.get_structure`::

            pdb_file = PDBFile.read("data/1AMY.pdb")
            structure = pdb_file.get_structure(
                altloc="all",
                extra_fields=["atom_id", "b_factor", "occupancy", "charge"],
            )
            molecule = Molecule.from_biotite(structure[0])

    .. versionadded:: 0.7.0

    """
    cdef list atoms = []
    for a in atom_array:
        atom = Atom(
            name=str(a.atom_name),
            x=a.coord[0],
            y=a.coord[1],
            z=a.coord[2],
            # annotations read with `getattr` fall back to a default
            # when the array does not carry them
            altloc=str(getattr(a, 'altloc', ' ')),
            charge=getattr(a, 'charge', 0),
            element=str(a.element),
            occupancy=getattr(a, 'occupancy', 1.0),
            temperature_factor=a.b_factor,
            serial=a.atom_id,
            segment=str(getattr(a, 'segment', '')),
            residue_name=str(a.res_name),
            residue_number=a.res_id,
            chain_id=str(a.chain_id),
            # pad an empty insertion code to a single blank character
            insertion_code=str(a.ins_code).ljust(1),
        )
        atoms.append(atom)
    # forward the requested identifier instead of silently dropping it
    return cls(atoms, id=id)
|
|
597
|
+
|
|
148
598
|
def __cinit__(self):
|
|
149
599
|
self._mol = NULL
|
|
150
600
|
|
|
@@ -249,17 +699,32 @@ cdef class Molecule:
|
|
|
249
699
|
return self._id
|
|
250
700
|
|
|
251
701
|
cpdef Molecule conserved(self, double cutoff = 0.0):
    """Build a molecule restricted to the atoms passing a conservation cutoff.

    Arguments:
        cutoff (`float`): The conservation cutoff for atoms. Only
            atoms with a `~Atom.temperature_factor` greater than or
            equal to this value are retained.

    Returns:
        `~pyjess.Molecule`: A new molecule without the atoms below
        the conservation cutoff. With a cutoff of zero or less,
        this is a plain copy of the molecule.

    """
    assert self._mol is not NULL

    cdef size_t index
    cdef list kept

    # a non-positive cutoff short-circuits to a full copy
    if cutoff <= 0.0:
        return self.copy()

    kept = []
    for index in range(self._mol.count):
        if self._mol.atom[index].tempFactor >= cutoff:
            kept.append(self[index])

    return type(self)(id=self.id, atoms=kept)
|
|
263
728
|
|
|
264
729
|
cpdef Molecule copy(self):
|
|
265
730
|
"""Create a copy of this molecule and its atoms.
|
|
@@ -329,8 +794,9 @@ cdef class Atom:
|
|
|
329
794
|
atom metadata from.
|
|
330
795
|
|
|
331
796
|
"""
|
|
332
|
-
cdef
|
|
333
|
-
cdef
|
|
797
|
+
cdef const unsigned char* s
|
|
798
|
+
cdef bytearray b
|
|
799
|
+
cdef Atom atom
|
|
334
800
|
|
|
335
801
|
if isinstance(text, str):
|
|
336
802
|
b = bytearray(text, 'utf-8')
|
|
@@ -339,14 +805,15 @@ cdef class Atom:
|
|
|
339
805
|
if not b.endswith(b'\n'):
|
|
340
806
|
b.append(b'\n')
|
|
341
807
|
b.append(b'\0')
|
|
808
|
+
s = b
|
|
342
809
|
|
|
343
810
|
atom = cls.__new__(cls)
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
811
|
+
with nogil:
|
|
812
|
+
atom._atom = <_Atom*> malloc(sizeof(_Atom))
|
|
813
|
+
if atom._atom == NULL:
|
|
814
|
+
raise MemoryError("Failed to allocate atom")
|
|
815
|
+
if not jess.atom.Atom_parse(atom._atom, <const char*> s):
|
|
816
|
+
raise ValueError(f"Failed to parse atom: {text!r}")
|
|
350
817
|
|
|
351
818
|
return atom
|
|
352
819
|
|
|
@@ -364,21 +831,21 @@ cdef class Atom:
|
|
|
364
831
|
*,
|
|
365
832
|
int serial,
|
|
366
833
|
str name,
|
|
367
|
-
str altloc,
|
|
368
834
|
str residue_name,
|
|
369
835
|
str chain_id,
|
|
370
836
|
int residue_number,
|
|
371
|
-
str insertion_code,
|
|
372
837
|
double x,
|
|
373
838
|
double y,
|
|
374
839
|
double z,
|
|
375
840
|
double occupancy = 0.0,
|
|
376
841
|
double temperature_factor = 0.0,
|
|
842
|
+
str altloc = ' ',
|
|
843
|
+
str insertion_code = ' ',
|
|
377
844
|
str segment = '',
|
|
378
845
|
str element = '',
|
|
379
846
|
int charge = 0,
|
|
380
847
|
):
|
|
381
|
-
"""__init__(self, *, serial, name,
|
|
848
|
+
"""__init__(self, *, serial, name, residue_name, chain_id, residue_number, x, y, z, occupancy=0.0, temperature_factor=0.0, altloc=' ', insertion_code=' ', segment='', element='', charge=0)\n--\n
|
|
382
849
|
|
|
383
850
|
Create a new atom.
|
|
384
851
|
|
|
@@ -390,11 +857,16 @@ cdef class Atom:
|
|
|
390
857
|
long.
|
|
391
858
|
|
|
392
859
|
"""
|
|
860
|
+
cdef bytearray _name
|
|
861
|
+
cdef bytes _residue_name
|
|
862
|
+
cdef bytes _segment
|
|
863
|
+
cdef bytes _element
|
|
864
|
+
|
|
393
865
|
if len(name) > 4:
|
|
394
866
|
raise ValueError(f"Invalid atom name: {name!r}")
|
|
395
867
|
if len(residue_name) > 3:
|
|
396
868
|
raise ValueError(f"Invalid residue name: {residue_name!r}")
|
|
397
|
-
if len(segment) >
|
|
869
|
+
if len(segment) > 4:
|
|
398
870
|
raise ValueError(f"Invalid segment: {segment!r}")
|
|
399
871
|
if len(element) > 2:
|
|
400
872
|
raise ValueError(f"Invalid element: {element!r}")
|
|
@@ -405,6 +877,10 @@ cdef class Atom:
|
|
|
405
877
|
if self._atom is NULL:
|
|
406
878
|
raise MemoryError("Failed to allocate atom")
|
|
407
879
|
|
|
880
|
+
_residue_name = PyUnicode_AsASCIIString(residue_name)
|
|
881
|
+
_segment = PyUnicode_AsASCIIString(segment)
|
|
882
|
+
_element = PyUnicode_AsASCIIString(element)
|
|
883
|
+
|
|
408
884
|
self._atom.serial = serial
|
|
409
885
|
self._atom.altLoc = ord(altloc)
|
|
410
886
|
self._atom.chainID1 = ord(chain_id[0]) if len(chain_id) > 0 else 0
|
|
@@ -417,14 +893,15 @@ cdef class Atom:
|
|
|
417
893
|
self._atom.occupancy = occupancy
|
|
418
894
|
self._atom.tempFactor = temperature_factor
|
|
419
895
|
self._atom.charge = charge
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
896
|
+
encode_token(self._atom.resName, _residue_name.ljust(3, b'\0'), 3)
|
|
897
|
+
encode_token(self._atom.segID, _segment.ljust(4, b'\0'), 4)
|
|
898
|
+
encode_token(self._atom.element, _element.ljust(2, b'\0'), 2)
|
|
423
899
|
|
|
900
|
+
# FIXME: is alignment proper?
|
|
424
901
|
_name = bytearray(name, 'ascii')
|
|
425
902
|
if len(_name) < 4:
|
|
426
903
|
_name.insert(0, ord('_'))
|
|
427
|
-
|
|
904
|
+
encode_token(self._atom.name, _name.ljust(4, b'\0'), 4)
|
|
428
905
|
|
|
429
906
|
def __copy__(self):
|
|
430
907
|
return self.copy()
|
|
@@ -517,7 +994,7 @@ cdef class Atom:
|
|
|
517
994
|
"""`str`: The segment identifier.
|
|
518
995
|
"""
|
|
519
996
|
assert self._atom is not NULL
|
|
520
|
-
return self._atom.segID[:
|
|
997
|
+
return self._atom.segID[:4].decode('ascii').strip('_')
|
|
521
998
|
|
|
522
999
|
@property
|
|
523
1000
|
def element(self):
|
|
@@ -538,7 +1015,7 @@ cdef class Atom:
|
|
|
538
1015
|
"""`str`: The identifier of the chain the atom belongs to.
|
|
539
1016
|
"""
|
|
540
1017
|
assert self._atom is not NULL
|
|
541
|
-
return "
|
|
1018
|
+
return PyUnicode_FromFormat("%c%c", self._atom.chainID1, self._atom.chainID2).strip()
|
|
542
1019
|
|
|
543
1020
|
@property
|
|
544
1021
|
def occupancy(self):
|
|
@@ -563,16 +1040,22 @@ cdef class Atom:
|
|
|
563
1040
|
|
|
564
1041
|
@property
def x(self):
    """`float`: The first component of the atom coordinates.
    """
    assert self._atom is not NULL
    return self._atom.x[0]
|
|
568
1047
|
|
|
569
1048
|
@property
def y(self):
    """`float`: The second component of the atom coordinates.
    """
    assert self._atom is not NULL
    return self._atom.x[1]
|
|
573
1054
|
|
|
574
1055
|
@property
def z(self):
    """`float`: The third component of the atom coordinates.
    """
    assert self._atom is not NULL
    return self._atom.x[2]
|
|
578
1061
|
|
|
@@ -734,18 +1217,19 @@ cdef class TemplateAtom:
|
|
|
734
1217
|
_name = bytearray(name, 'ascii')
|
|
735
1218
|
else:
|
|
736
1219
|
_name = bytearray(name)
|
|
1220
|
+
# FIXME: is alignment proper?
|
|
737
1221
|
if len(_name) > 4:
|
|
738
1222
|
raise ValueError(f"Invalid atom name: {name!r}")
|
|
739
|
-
elif len(_name)
|
|
1223
|
+
elif len(_name) <= 3:
|
|
740
1224
|
_name.insert(0, ord('_'))
|
|
741
|
-
|
|
1225
|
+
encode_token(self._atom.name[m], _name.ljust(4, b'\0'), 4)
|
|
742
1226
|
|
|
743
1227
|
# copy residue names
|
|
744
1228
|
for m, name in enumerate(residue_names):
|
|
745
1229
|
_name = name.encode('ascii') if isinstance(name, str) else name
|
|
746
1230
|
if len(_name) > 3:
|
|
747
1231
|
raise ValueError(f"Invalid residue name: {name!r}")
|
|
748
|
-
|
|
1232
|
+
encode_token(self._atom.resName[m], _name.ljust(3, b'\0'), 3)
|
|
749
1233
|
|
|
750
1234
|
cdef dict _state(self):
|
|
751
1235
|
return {
|
|
@@ -819,7 +1303,7 @@ cdef class TemplateAtom:
|
|
|
819
1303
|
assert self._atom is not NULL
|
|
820
1304
|
cdef char c1 = jess.tess_atom.TessAtom_chainID1(self._atom)
|
|
821
1305
|
cdef char c2 = jess.tess_atom.TessAtom_chainID2(self._atom)
|
|
822
|
-
return "
|
|
1306
|
+
return PyUnicode_FromFormat("%c%c", c1, c2).strip()
|
|
823
1307
|
|
|
824
1308
|
@property
|
|
825
1309
|
def x(self):
|
|
@@ -893,7 +1377,10 @@ cdef class TemplateAtom:
|
|
|
893
1377
|
.. versionadded:: 0.4.0
|
|
894
1378
|
|
|
895
1379
|
"""
|
|
896
|
-
|
|
1380
|
+
cdef TemplateAtom atom = TemplateAtom.__new__(TemplateAtom)
|
|
1381
|
+
with nogil:
|
|
1382
|
+
atom._atom = jess.tess_atom.TessAtom_copy(self._atom)
|
|
1383
|
+
return atom
|
|
897
1384
|
|
|
898
1385
|
|
|
899
1386
|
cdef class Template:
|
|
@@ -946,12 +1433,13 @@ cdef class Template:
|
|
|
946
1433
|
`~pyjess.Template`: The template parsed from the given file.
|
|
947
1434
|
|
|
948
1435
|
"""
|
|
1436
|
+
cdef str line
|
|
1437
|
+
cdef list atoms = []
|
|
949
1438
|
try:
|
|
950
1439
|
handle = open(file)
|
|
951
1440
|
except TypeError:
|
|
952
1441
|
handle = nullcontext(file)
|
|
953
1442
|
with handle as f:
|
|
954
|
-
atoms = []
|
|
955
1443
|
for line in f:
|
|
956
1444
|
if line.startswith("ATOM"):
|
|
957
1445
|
atoms.append(TemplateAtom.loads(line))
|
|
@@ -1048,7 +1536,14 @@ cdef class Template:
|
|
|
1048
1536
|
self._tess.distance[j][i] = dist
|
|
1049
1537
|
|
|
1050
1538
|
# compute dimension
|
|
1051
|
-
residues = {
|
|
1539
|
+
residues = {
|
|
1540
|
+
(
|
|
1541
|
+
self._tess.atom[i].resSeq ,
|
|
1542
|
+
self._tess.atom[i].chainID1,
|
|
1543
|
+
self._tess.atom[i].chainID2,
|
|
1544
|
+
)
|
|
1545
|
+
for i in range(count)
|
|
1546
|
+
}
|
|
1052
1547
|
self._tess.dim = len(residues)
|
|
1053
1548
|
|
|
1054
1549
|
def __copy__(self):
|
|
@@ -1131,6 +1626,8 @@ cdef class Template:
|
|
|
1131
1626
|
|
|
1132
1627
|
@property
|
|
1133
1628
|
def id(self):
|
|
1629
|
+
"""`str` or `None`: An identifier for the template, if any.
|
|
1630
|
+
"""
|
|
1134
1631
|
assert self._tpl is not NULL
|
|
1135
1632
|
|
|
1136
1633
|
cdef const char* name = self._tpl.name(self._tpl)
|
|
@@ -1146,11 +1643,18 @@ cdef class Template:
|
|
|
1146
1643
|
return self._tess.dim
|
|
1147
1644
|
|
|
1148
1645
|
cpdef Template copy(self):
|
|
1149
|
-
|
|
1150
|
-
self,
|
|
1151
|
-
self.id
|
|
1152
|
-
)
|
|
1646
|
+
"""Create a copy of the template.
|
|
1153
1647
|
|
|
1648
|
+
Returns:
|
|
1649
|
+
`~pyjess.Template`: A new template object with identical
|
|
1650
|
+
attributes and a copy of the `TemplateAtom` it contains.
|
|
1651
|
+
|
|
1652
|
+
"""
|
|
1653
|
+
cdef Template tpl = Template.__new__(Template)
|
|
1654
|
+
with nogil:
|
|
1655
|
+
tpl._tpl = self._tpl.copy(self._tpl)
|
|
1656
|
+
tpl._tess = <_TessTemplate*> &tpl._tpl[1]
|
|
1657
|
+
return tpl
|
|
1154
1658
|
|
|
1155
1659
|
cdef class Query:
|
|
1156
1660
|
"""A query over templates with a given molecule.
|
|
@@ -1166,10 +1670,6 @@ cdef class Query:
|
|
|
1166
1670
|
the templates.
|
|
1167
1671
|
rmsd_threshold (`float`): The RMSD threshold for reporting
|
|
1168
1672
|
results.
|
|
1169
|
-
max_candidates (`int`): The maximum number of candidate hits
|
|
1170
|
-
to report.
|
|
1171
|
-
ignore_chain (`bool`): Whether to check or ignore the chain of
|
|
1172
|
-
the atoms to match.
|
|
1173
1673
|
best_match (`bool`): Whether the query will return only the
|
|
1174
1674
|
best match to each template.
|
|
1175
1675
|
|
|
@@ -1177,18 +1677,20 @@ cdef class Query:
|
|
|
1177
1677
|
cdef _JessQuery* _jq
|
|
1178
1678
|
cdef bint _partial
|
|
1179
1679
|
cdef int _candidates
|
|
1680
|
+
cdef uintptr_t _prev_tpl
|
|
1681
|
+
cdef int _max_candidates
|
|
1682
|
+
cdef _IgnoreType _ignore_chain
|
|
1180
1683
|
|
|
1181
1684
|
cdef readonly Jess jess
|
|
1182
1685
|
cdef readonly Molecule molecule
|
|
1183
|
-
cdef readonly bint ignore_chain
|
|
1184
1686
|
cdef readonly bint best_match
|
|
1185
1687
|
cdef readonly double rmsd_threshold
|
|
1186
|
-
cdef readonly int max_candidates
|
|
1187
1688
|
|
|
1188
1689
|
def __cinit__(self):
|
|
1189
1690
|
self._jq = NULL
|
|
1190
1691
|
self._candidates = 0
|
|
1191
1692
|
self._partial = False
|
|
1693
|
+
self._prev_tpl = 0
|
|
1192
1694
|
|
|
1193
1695
|
def __dealloc__(self):
|
|
1194
1696
|
jess.jess.JessQuery_free(self._jq)
|
|
@@ -1196,11 +1698,48 @@ cdef class Query:
|
|
|
1196
1698
|
def __iter__(self):
|
|
1197
1699
|
return self
|
|
1198
1700
|
|
|
1701
|
+
@property
|
|
1702
|
+
def ignore_chain(self):
|
|
1703
|
+
"""`str` or `None`: The way atom chains are considered or discarded.
|
|
1704
|
+
"""
|
|
1705
|
+
if self._ignore_chain == _IgnoreType.ignoreNone:
|
|
1706
|
+
return None
|
|
1707
|
+
elif self._ignore_chain == _IgnoreType.ignoreResidues:
|
|
1708
|
+
return "residues"
|
|
1709
|
+
elif self._ignore_chain == _IgnoreType.ignoreAtoms:
|
|
1710
|
+
return "atoms"
|
|
1711
|
+
|
|
1712
|
+
@ignore_chain.setter
|
|
1713
|
+
def ignore_chain(self, ignore_chain):
|
|
1714
|
+
if ignore_chain is None:
|
|
1715
|
+
self._ignore_chain = _IgnoreType.ignoreNone
|
|
1716
|
+
elif ignore_chain == "residues":
|
|
1717
|
+
self._ignore_chain = _IgnoreType.ignoreResidues
|
|
1718
|
+
elif ignore_chain == "atoms":
|
|
1719
|
+
self._ignore_chain = _IgnoreType.ignoreAtoms
|
|
1720
|
+
else:
|
|
1721
|
+
raise ValueError(f"invalid value for `ignore_chain`: {ignore_chain!r}")
|
|
1722
|
+
|
|
1723
|
+
@property
|
|
1724
|
+
def max_candidates(self):
|
|
1725
|
+
"""`int`: The maximum number of candidate hits to report *by template*.
|
|
1726
|
+
"""
|
|
1727
|
+
return None if self._max_candidates == -1 else self._max_candidates
|
|
1728
|
+
|
|
1729
|
+
@max_candidates.setter
|
|
1730
|
+
def max_candidates(self, max_candidates):
|
|
1731
|
+
if max_candidates is None:
|
|
1732
|
+
self._max_candidates = -1
|
|
1733
|
+
elif max_candidates >= 0:
|
|
1734
|
+
self._max_candidates = max_candidates
|
|
1735
|
+
else:
|
|
1736
|
+
raise ValueError(f"invalid value for `max_candidates` argument: {max_candidates!r}")
|
|
1737
|
+
|
|
1199
1738
|
cdef bint _advance(self) noexcept nogil:
|
|
1200
1739
|
if self._partial:
|
|
1201
1740
|
self._partial = False
|
|
1202
1741
|
return True
|
|
1203
|
-
return jess.jess.JessQuery_next(self._jq, self.
|
|
1742
|
+
return jess.jess.JessQuery_next(self._jq, self._ignore_chain)
|
|
1204
1743
|
|
|
1205
1744
|
cdef bint _rewind(self) noexcept nogil:
|
|
1206
1745
|
self._partial = True
|
|
@@ -1244,10 +1783,11 @@ cdef class Query:
|
|
|
1244
1783
|
|
|
1245
1784
|
# search the next hit without the GIL to allow parallel queries.
|
|
1246
1785
|
with nogil:
|
|
1247
|
-
while self._advance()
|
|
1786
|
+
while self._advance():
|
|
1248
1787
|
# load current iteration template, and check that the hit
|
|
1249
1788
|
# was obtained with the current template and not with the
|
|
1250
1789
|
# previous one
|
|
1790
|
+
self._prev_tpl = <uintptr_t> tpl
|
|
1251
1791
|
tpl = jess.jess.JessQuery_template(self._jq)
|
|
1252
1792
|
if hit_found and hit_tpl != tpl:
|
|
1253
1793
|
self._rewind()
|
|
@@ -1274,10 +1814,10 @@ cdef class Query:
|
|
|
1274
1814
|
|
|
1275
1815
|
if nan:
|
|
1276
1816
|
with gil:
|
|
1277
|
-
|
|
1278
|
-
"Jess returned a superposition matrix with NaN values",
|
|
1817
|
+
PyErr_WarnEx(
|
|
1279
1818
|
UserWarning,
|
|
1280
|
-
|
|
1819
|
+
"Jess returned a superposition matrix with NaN values",
|
|
1820
|
+
2,
|
|
1281
1821
|
)
|
|
1282
1822
|
else:
|
|
1283
1823
|
self._copy_atoms(tpl, hit)
|
|
@@ -1286,9 +1826,21 @@ cdef class Query:
|
|
|
1286
1826
|
hit_tpl = tpl
|
|
1287
1827
|
hit_found = True
|
|
1288
1828
|
|
|
1289
|
-
#
|
|
1290
|
-
#
|
|
1291
|
-
|
|
1829
|
+
# check if we already made it to the next template,
|
|
1830
|
+
# or if we need to short-circuit the iteration and
|
|
1831
|
+
# force the query to move to the next template as
|
|
1832
|
+
# we found too many candidates already.
|
|
1833
|
+
if <uintptr_t> tpl != self._prev_tpl:
|
|
1834
|
+
self._candidates = 0
|
|
1835
|
+
else:
|
|
1836
|
+
self._candidates += 1
|
|
1837
|
+
if self._max_candidates != -1 and self._candidates > self._max_candidates:
|
|
1838
|
+
self._candidates = 0
|
|
1839
|
+
jess.jess.JessQuery_nextTemplate(self._jq)
|
|
1840
|
+
|
|
1841
|
+
# free superposition items (as relevant data was copied in
|
|
1842
|
+
# the Hit if needed) and return hits immediately if we are
|
|
1843
|
+
# not in best match mode
|
|
1292
1844
|
jess.super.Superposition_free(sup)
|
|
1293
1845
|
if hit_found and not self.best_match:
|
|
1294
1846
|
break
|
|
@@ -1355,6 +1907,18 @@ cdef class Hit:
|
|
|
1355
1907
|
for i, atom in enumerate(state["atoms"]):
|
|
1356
1908
|
memcpy(&self._atoms[i], atom._atom, sizeof(_Atom))
|
|
1357
1909
|
|
|
1910
|
+
cdef void _transform_atom(self, double* x, const double* src):
|
|
1911
|
+
cdef size_t i
|
|
1912
|
+
cdef size_t j
|
|
1913
|
+
cdef const double* M = self._rotation
|
|
1914
|
+
cdef const double* c = self._centre[0]
|
|
1915
|
+
cdef const double* v = self._centre[1]
|
|
1916
|
+
|
|
1917
|
+
for i in range(3):
|
|
1918
|
+
x[i] = v[i]
|
|
1919
|
+
for j in range(3):
|
|
1920
|
+
x[i] += M[3*i + j] * (src[j] - c[j])
|
|
1921
|
+
|
|
1358
1922
|
@property
|
|
1359
1923
|
def determinant(self):
|
|
1360
1924
|
"""`float`: The determinant of the rotation matrix.
|
|
@@ -1423,15 +1987,11 @@ cdef class Hit:
|
|
|
1423
1987
|
if transform:
|
|
1424
1988
|
atom._atom = <_Atom*> malloc(sizeof(_Atom))
|
|
1425
1989
|
memcpy(atom._atom, &self._atoms[k], sizeof(_Atom))
|
|
1426
|
-
|
|
1427
|
-
atom._atom.x[i] = v[i]
|
|
1428
|
-
for j in range(3):
|
|
1429
|
-
atom._atom.x[i] += M[3*i + j] * (self._atoms[k].x[j] - c[j])
|
|
1990
|
+
self._transform_atom(atom._atom.x, self._atoms[k].x)
|
|
1430
1991
|
else:
|
|
1431
1992
|
atom.owned = True
|
|
1432
1993
|
atom.owner = self
|
|
1433
1994
|
atom._atom = &self._atoms[k]
|
|
1434
|
-
|
|
1435
1995
|
atoms.append(atom)
|
|
1436
1996
|
|
|
1437
1997
|
return atoms
|
|
@@ -1467,17 +2027,142 @@ cdef class Hit:
|
|
|
1467
2027
|
mol = self._molecule.copy()
|
|
1468
2028
|
for k in range(mol._mol.count):
|
|
1469
2029
|
atom = mol._mol.atom[k]
|
|
1470
|
-
|
|
1471
|
-
atom.x[i] = v[i]
|
|
1472
|
-
for j in range(3):
|
|
1473
|
-
atom.x[i] += M[3*i + j] * (self._molecule._mol.atom[k].x[j] - c[j])
|
|
2030
|
+
self._transform_atom(atom.x, self._molecule._mol.atom[k].x)
|
|
1474
2031
|
|
|
1475
2032
|
return mol
|
|
1476
2033
|
|
|
2034
|
+
cpdef str dumps(self, str format="pdb", bint transform=True):
|
|
2035
|
+
"""Write the hit to a string.
|
|
2036
|
+
|
|
2037
|
+
Arguments:
|
|
2038
|
+
format (`str`): The format in which to write the hit.
|
|
2039
|
+
Currently only supports ``pdb``, which writes the hits
|
|
2040
|
+
in the same format as Jess.
|
|
2041
|
+
transform (`bool`): Whether or not to transform coordinates
|
|
2042
|
+
of the molecule atoms into the template frame.
|
|
2043
|
+
|
|
2044
|
+
Raises:
|
|
2045
|
+
`RuntimeError`: When attempting to dump a `Hit` which was
|
|
2046
|
+
obtained from a `Template` which has no `~Template.id`.
|
|
2047
|
+
|
|
2048
|
+
.. versionadded:: 0.7.0
|
|
2049
|
+
|
|
2050
|
+
"""
|
|
2051
|
+
file = io.StringIO()
|
|
2052
|
+
self.dump(file, format=format, transform=transform)
|
|
2053
|
+
return file.getvalue()
|
|
2054
|
+
|
|
2055
|
+
cpdef void dump(self, object file, str format="pdb", bint transform=True):
|
|
2056
|
+
"""Write the hit to a file.
|
|
2057
|
+
|
|
2058
|
+
Arguments:
|
|
2059
|
+
file (file-like object): A file opened in *text* mode where the
|
|
2060
|
+
hit will be written.
|
|
2061
|
+
format (`str`): The format in which to write the hit.
|
|
2062
|
+
Currently only supports ``pdb``, which writes the hits
|
|
2063
|
+
in the same format as Jess.
|
|
2064
|
+
transform (`bool`): Whether or not to transform coordinates
|
|
2065
|
+
of the molecule atoms into the template frame.
|
|
2066
|
+
|
|
2067
|
+
Raises:
|
|
2068
|
+
`RuntimeError`: When attempting to dump a `Hit` which was
|
|
2069
|
+
obtained from a `Template` which has no `~Template.id`.
|
|
2070
|
+
|
|
2071
|
+
.. versionadded:: 0.7.0
|
|
2072
|
+
|
|
2073
|
+
"""
|
|
2074
|
+
assert self.template._tpl is not NULL
|
|
2075
|
+
assert self._molecule._mol is not NULL
|
|
2076
|
+
|
|
2077
|
+
cdef _Atom* atom
|
|
2078
|
+
cdef size_t k
|
|
2079
|
+
cdef char[80] buffer
|
|
2080
|
+
cdef char[5] name
|
|
2081
|
+
cdef char[5] resname
|
|
2082
|
+
cdef double[3] x
|
|
2083
|
+
cdef int count = self.template._tpl.count(self.template._tpl)
|
|
2084
|
+
|
|
2085
|
+
if self.template.id is None:
|
|
2086
|
+
raise RuntimeError("cannot dump `Hit` where `self.template.id` is `None`")
|
|
2087
|
+
|
|
2088
|
+
file.write("REMARK ")
|
|
2089
|
+
file.write(self._molecule.id)
|
|
2090
|
+
file.write(f" {self.rmsd:5.3f} ")
|
|
2091
|
+
file.write(self.template.id)
|
|
2092
|
+
file.write(f" Det={self.determinant:4,.1f} log(E)~ {self.log_evalue:4.2f}\n")
|
|
2093
|
+
|
|
2094
|
+
for k in range(count):
|
|
2095
|
+
atom = &self._atoms[k]
|
|
2096
|
+
decode_token(name, atom.name, 4)
|
|
2097
|
+
decode_token(resname, atom.resName, 3)
|
|
2098
|
+
if transform:
|
|
2099
|
+
self._transform_atom(x, atom.x)
|
|
2100
|
+
else:
|
|
2101
|
+
memcpy(x, atom.x, 3*sizeof(double))
|
|
2102
|
+
n = sprintf(
|
|
2103
|
+
buffer,
|
|
2104
|
+
"ATOM %5i%5s%c%-3s%c%c%4i%-4c%8.3f%8.3f%8.3f%6.2f%6.2f\n",
|
|
2105
|
+
atom.serial,
|
|
2106
|
+
name,
|
|
2107
|
+
atom.altLoc,
|
|
2108
|
+
resname,
|
|
2109
|
+
atom.chainID1,
|
|
2110
|
+
atom.chainID2,
|
|
2111
|
+
atom.resSeq,
|
|
2112
|
+
atom.iCode,
|
|
2113
|
+
x[0],
|
|
2114
|
+
x[1],
|
|
2115
|
+
x[2],
|
|
2116
|
+
atom.occupancy,
|
|
2117
|
+
atom.tempFactor,
|
|
2118
|
+
atom.segID,
|
|
2119
|
+
atom.element,
|
|
2120
|
+
atom.charge
|
|
2121
|
+
)
|
|
2122
|
+
file.write(PyUnicode_FromStringAndSize(buffer, n))
|
|
2123
|
+
file.write("ENDMDL\n")
|
|
1477
2124
|
|
|
1478
2125
|
cdef class Jess:
|
|
1479
2126
|
"""A handle to run Jess over a list of templates.
|
|
1480
2127
|
|
|
2128
|
+
Example:
|
|
2129
|
+
Create a `Jess` object from a list of templates::
|
|
2130
|
+
|
|
2131
|
+
>>> t1 = Template.load("1.3.3.tpl")
|
|
2132
|
+
>>> t2 = Template.load("4.1.2.tpl")
|
|
2133
|
+
>>> jess = Jess([t1, t2])
|
|
2134
|
+
|
|
2135
|
+
Once initialized, the `Jess` object cannot be modified further.
|
|
2136
|
+
Use the `~Jess.query` method to query the templates with a
|
|
2137
|
+
molecule::
|
|
2138
|
+
|
|
2139
|
+
>>> molecule = Molecule.load("1AMY.pdb")
|
|
2140
|
+
>>> query = jess.query(molecule, 2, 2, 2)
|
|
2141
|
+
|
|
2142
|
+
The returned `Query` object is an iterator that can be
|
|
2143
|
+
advanced through a ``for`` loop, or with the `next` built-in
|
|
2144
|
+
function to get the first hit:
|
|
2145
|
+
|
|
2146
|
+
>>> hit = next(query)
|
|
2147
|
+
>>> hit.rmsd
|
|
2148
|
+
1.4386...
|
|
2149
|
+
|
|
2150
|
+
The hit can also be formatted in PDB format like in the
|
|
2151
|
+
original JESS code::
|
|
2152
|
+
|
|
2153
|
+
>>> print(hit.dumps(format="pdb"), end="")
|
|
2154
|
+
REMARK 1AMY 1.439 2om2 Det= 1.0 log(E)~ 1.11
|
|
2155
|
+
ATOM 729 CA THR A 94 34.202 -24.426 8.851 1.00 2.00
|
|
2156
|
+
ATOM 732 CB THR A 94 35.157 -23.467 8.101 1.00 4.66
|
|
2157
|
+
ATOM 733 OG1 THR A 94 36.338 -23.247 8.871 1.00 9.85
|
|
2158
|
+
ATOM 746 CD GLU A 96 41.454 -29.509 8.013 1.00 24.05
|
|
2159
|
+
ATOM 748 OE2 GLU A 96 42.536 -29.680 7.441 1.00 34.44
|
|
2160
|
+
ATOM 747 OE1 GLU A 96 41.212 -28.521 8.708 1.00 18.56
|
|
2161
|
+
ATOM 437 CZ ARG A 55 44.471 -26.619 10.181 1.00 8.51
|
|
2162
|
+
ATOM 436 NE ARG A 55 44.334 -27.346 11.290 1.00 9.05
|
|
2163
|
+
ATOM 438 NH1 ARG A 55 43.590 -26.751 9.179 1.00 13.17
|
|
2164
|
+
ENDMDL
|
|
2165
|
+
|
|
1481
2166
|
.. versionadded:: 0.4.0
|
|
1482
2167
|
Equality, hashing and pickle protocol support.
|
|
1483
2168
|
|
|
@@ -1579,9 +2264,10 @@ cdef class Jess:
|
|
|
1579
2264
|
double distance_cutoff,
|
|
1580
2265
|
double max_dynamic_distance,
|
|
1581
2266
|
*,
|
|
1582
|
-
|
|
1583
|
-
|
|
2267
|
+
object max_candidates = None,
|
|
2268
|
+
object ignore_chain = None,
|
|
1584
2269
|
bint best_match = False,
|
|
2270
|
+
bint reorder = True,
|
|
1585
2271
|
):
|
|
1586
2272
|
"""Scan for templates matching the given molecule.
|
|
1587
2273
|
|
|
@@ -1596,18 +2282,81 @@ cdef class Jess:
|
|
|
1596
2282
|
dynamic distance after adding the global distance cutoff
|
|
1597
2283
|
and the individual atom distance cutoff defined for each
|
|
1598
2284
|
atom of the template.
|
|
1599
|
-
|
|
1600
|
-
|
|
2285
|
+
max_candidates (`int` or `None`): The maximum number of candidate
|
|
2286
|
+
hits to report by template. If a non-`None` value is given,
|
|
2287
|
+
it may speed up querying for unspecific templates, but also
|
|
2288
|
+
produce results potentially inconsistent with Jess.
|
|
2289
|
+
ignore_chain (`str` or `None`): Whether to check or ignore the
|
|
2290
|
+
chain of the atoms to match. The different supported modes
|
|
2291
|
+
are:
|
|
2292
|
+
|
|
2293
|
+
- `None`: Force the atoms in the molecule to belong
|
|
2294
|
+
to different (resp. same) chains if that is the case
|
|
2295
|
+
in the template.
|
|
2296
|
+
- ``residues``: Allow atoms to belong to different
|
|
2297
|
+
(resp. same) chains even if it is not the case in
|
|
2298
|
+
the template, but force all atoms of a residue to
|
|
2299
|
+
belong to the same chain.
|
|
2300
|
+
- ``atoms``: Allow atoms to belong to any chain,
|
|
2301
|
+
independently of the template or the residue they
|
|
2302
|
+
belong to.
|
|
2303
|
+
|
|
1601
2304
|
best_match (`bool`): Pass `True` to return only the best match
|
|
1602
|
-
to each template.
|
|
2305
|
+
to each template, based on RMSD. In case of ties, the
|
|
2306
|
+
first match is returned. Note that a match must still
|
|
2307
|
+
pass the RMSD threshold given in ``rmsd_threshold``
|
|
2308
|
+
to be returned.
|
|
2309
|
+
reorder (`bool`): Whether to enable template atom reordering
|
|
2310
|
+
to accelerate matching in the scanner algorithm. Pass
|
|
2311
|
+
`False` to revert to the original, slower algorithm
|
|
2312
|
+
which matches atoms in the same order as they appear in
|
|
2313
|
+
the template, at the cost of longer run times.
|
|
1603
2314
|
|
|
1604
2315
|
Returns:
|
|
1605
2316
|
`~pyjess.Query`: An iterator over the query hits.
|
|
1606
2317
|
|
|
2318
|
+
Caution:
|
|
2319
|
+
Since ``v0.6.0``, this function uses an optimized variant of
|
|
2320
|
+
the Jess scanning algorithm which minimizes the number of steps
|
|
2321
|
+
needed to generate matches, by changing the order in which the
|
|
2322
|
+
template atoms are iterated upon. Because of this change,
|
|
2323
|
+
the query may return *exactly* the same matches but in an order
|
|
2324
|
+
that *differs* from the original Jess version. If you really
|
|
2325
|
+
need results in the original order, set ``reorder`` to `False`.
|
|
2326
|
+
|
|
2327
|
+
.. versionadded:: 0.6.0
|
|
2328
|
+
The ``reorder`` argument, defaulting to `True`.
|
|
2329
|
+
|
|
2330
|
+
.. versionchanged:: 0.7.0
|
|
2331
|
+
Default value of ``max_candidates`` argument to `None`.
|
|
2332
|
+
|
|
2333
|
+
.. versionchanged:: 0.7.0
|
|
2334
|
+
``ignore_chain`` now expects string variants rather than `bool`.
|
|
2335
|
+
|
|
1607
2336
|
"""
|
|
2337
|
+
|
|
2338
|
+
if ignore_chain is True:
|
|
2339
|
+
PyErr_WarnEx(
|
|
2340
|
+
DeprecationWarning,
|
|
2341
|
+
"`ignore_chain` parameter expects string parameters "
|
|
2342
|
+
"to specificy the mode since PyJess v0.7.0. "
|
|
2343
|
+
"Use `ignore_chain='atoms'` instead of `ignore_chain=True`",
|
|
2344
|
+
2,
|
|
2345
|
+
)
|
|
2346
|
+
ignore_chain="atoms"
|
|
2347
|
+
elif ignore_chain is False:
|
|
2348
|
+
PyErr_WarnEx(
|
|
2349
|
+
DeprecationWarning,
|
|
2350
|
+
"`ignore_chain` parameter expects string parameters "
|
|
2351
|
+
"to specificy the mode since PyJess v0.7.0. "
|
|
2352
|
+
"Use `ignore_chain=None` instead of `ignore_chain=False`",
|
|
2353
|
+
2,
|
|
2354
|
+
)
|
|
2355
|
+
ignore_chain=None
|
|
2356
|
+
|
|
1608
2357
|
cdef Query query = Query.__new__(Query)
|
|
1609
|
-
query.ignore_chain = ignore_chain
|
|
1610
2358
|
query.max_candidates = max_candidates
|
|
2359
|
+
query.ignore_chain = ignore_chain
|
|
1611
2360
|
query.rmsd_threshold = rmsd_threshold
|
|
1612
2361
|
query.best_match = best_match
|
|
1613
2362
|
query.molecule = molecule
|
|
@@ -1617,5 +2366,6 @@ cdef class Jess:
|
|
|
1617
2366
|
molecule._mol,
|
|
1618
2367
|
distance_cutoff,
|
|
1619
2368
|
max_dynamic_distance,
|
|
2369
|
+
reorder,
|
|
1620
2370
|
)
|
|
1621
2371
|
return query
|