cnotebook 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnotebook/__init__.py +400 -0
- cnotebook/align.py +454 -0
- cnotebook/context.py +523 -0
- cnotebook/grid/__init__.py +55 -0
- cnotebook/grid/grid.py +1649 -0
- cnotebook/helpers.py +201 -0
- cnotebook/ipython_ext.py +56 -0
- cnotebook/marimo_ext.py +272 -0
- cnotebook/pandas_ext.py +1156 -0
- cnotebook/polars_ext.py +1235 -0
- cnotebook/render.py +200 -0
- cnotebook-2.1.0.dist-info/METADATA +336 -0
- cnotebook-2.1.0.dist-info/RECORD +16 -0
- cnotebook-2.1.0.dist-info/WHEEL +5 -0
- cnotebook-2.1.0.dist-info/licenses/LICENSE +21 -0
- cnotebook-2.1.0.dist-info/top_level.txt +1 -0
cnotebook/pandas_ext.py
ADDED
|
@@ -0,0 +1,1156 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import logging
|
|
3
|
+
import typing
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import oepandas as oepd
|
|
6
|
+
from typing import Iterable, Any, Literal, Hashable
|
|
7
|
+
from openeye import oechem, oedepict, oegraphsim, oegrapheme
|
|
8
|
+
from copy import copy as shallow_copy
|
|
9
|
+
from .context import pass_cnotebook_context, get_series_context
|
|
10
|
+
from .helpers import escape_brackets, create_structure_highlighter
|
|
11
|
+
from .align import create_aligner, fingerprint_maker
|
|
12
|
+
from .render import (
|
|
13
|
+
CNotebookContext, # noqa
|
|
14
|
+
oemol_to_disp,
|
|
15
|
+
oedisp_to_html,
|
|
16
|
+
render_invalid_molecule,
|
|
17
|
+
render_empty_molecule
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Only register iPython formatters if that is present
|
|
21
|
+
try:
|
|
22
|
+
# noinspection PyProtectedMember,PyPackageRequirements
|
|
23
|
+
from IPython import get_ipython
|
|
24
|
+
ipython_present = True
|
|
25
|
+
except ModuleNotFoundError:
|
|
26
|
+
ipython_present = False
|
|
27
|
+
|
|
28
|
+
if typing.TYPE_CHECKING:
|
|
29
|
+
from .context import CNotebookContext
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
SMARTS_DELIMITER_RE = re.compile(r'\s*[|\r\n\t]+\s*')
|
|
33
|
+
|
|
34
|
+
log = logging.getLogger("cnotebook")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def create_mol_formatter(*, ctx: CNotebookContext) -> typing.Callable[[oechem.OEMolBase], str]:
    """
    Closure that creates a function that renders an OEMol to HTML

    The returned formatter handles four kinds of cell value:

    - valid molecule: depicted with ``oemol_to_disp``, passed through every callback in ``ctx.callbacks``
      and converted to HTML with ``oedisp_to_html``
    - invalid molecule with zero atoms: rendered with ``render_empty_molecule``
    - any other invalid molecule: rendered with ``render_invalid_molecule``
    - anything that is not an ``oechem.OEMolBase``: returned as ``str(value)``

    :param ctx: CNotebook rendering context
    :return: Function that renders molecules to HTML
    """
    def _oemol_to_html(mol: oechem.OEMolBase) -> str:
        # Anything that is not a molecule falls back to its plain string form
        if not isinstance(mol, oechem.OEMolBase):
            return str(mol)

        # Render valid molecules
        if mol.IsValid():
            # Create the display object
            disp = oemol_to_disp(mol, ctx=ctx)

            # Apply display callbacks
            if ctx.callbacks is not None:
                for callback in ctx.callbacks:
                    callback(disp)

            # Hand the same context to the HTML step, exactly as the display formatter does, so the image is
            # not produced with a different context than the one that created the display
            return oedisp_to_html(disp, ctx=ctx)

        # Empty molecule
        if mol.NumAtoms() == 0:
            return render_empty_molecule(ctx=ctx)

        # Invalid molecule
        return render_invalid_molecule(ctx=ctx)

    return _oemol_to_html
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@pass_cnotebook_context
def create_disp_formatter(
        *,
        callbacks: list[typing.Callable[[oedepict.OE2DMolDisplay], None]] | None = None,
        ctx: CNotebookContext
) -> typing.Callable[[oedepict.OE2DMolDisplay], str]:
    """
    Closure that creates a function that renders an OE2DMolDisplay to HTML

    The returned formatter copies each valid display, runs the optional callbacks on the copy and converts
    the copy to HTML. Values that are not a valid ``oedepict.OE2DMolDisplay`` are returned as ``str(value)``.

    :param callbacks: List of callbacks to modify the rendering of the molecule
    :param ctx: Render context
    :return: Function that renders displays to HTML
    """

    def _oedisp_to_html(disp: oedepict.OE2DMolDisplay) -> str:
        renderable = isinstance(disp, oedepict.OE2DMolDisplay) and disp.IsValid()
        if not renderable:
            return str(disp)

        # Work on a copy so that the callbacks never touch the caller's display
        # TODO: Update with ctx
        working_copy = oedepict.OE2DMolDisplay(disp)

        if callbacks is not None:
            for modify in callbacks:
                modify(working_copy)

        return oedisp_to_html(working_copy, ctx=ctx)

    return _oedisp_to_html
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def escape_formatter(obj: Any) -> str:
    """
    Convert an arbitrary object to text and pass it through ``escape_brackets``
    :param obj: Value to format
    :return: Result of ``escape_brackets`` applied to ``str(obj)``
    """
    text = str(obj)
    return escape_brackets(text)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def render_dataframe(
        df: pd.DataFrame,
        formatters: dict | None = None,
        col_space: dict[str, float | int] | None = None,
        ctx: CNotebookContext | None = None,
        **kwargs
) -> str:
    """
    Render a DataFrame with molecules

    Molecule columns (``oepandas.MoleculeDtype``) and display columns (``oepandas.DisplayDtype``) get image
    formatters; every other column is formatted with ``escape_formatter``. The DataFrame and its molecule
    arrays are copied before rendering, and the ``formatters`` / ``col_space`` dictionaries passed by the
    caller are copied as well, so none of the caller's objects are modified.

    :param df: DataFrame to render
    :param formatters: Custom formatters for displaying columns
    :param col_space: Custom column spacing
    :param ctx: Local rendering context (optional); when omitted, each column's context is obtained from
        its array metadata with ``get_series_context``
    :param kwargs: Additional keyword arguments for DataFrame.to_html
    :return: HTML of rendered DataFrame
    """
    # Entries are added to both dictionaries below, so work on private copies instead of the caller's objects
    formatters = dict(formatters) if formatters else {}
    col_space = dict(col_space) if col_space else {}

    # Render columns with MoleculeDtype
    molecule_columns = set()

    # Capture metadata from ORIGINAL DataFrame BEFORE copying
    # (df.copy() may not preserve array metadata)
    original_metadata_by_col = {}

    for col in df.columns:
        if isinstance(df.dtypes[col], oepd.MoleculeDtype):
            molecule_columns.add(col)
            # Get metadata from the original array before any copying
            arr = df[col].array
            if hasattr(arr, 'metadata') and arr.metadata:
                original_metadata_by_col[col] = arr.metadata.copy()

    # We need to copy both the DataFrame and the molecules, because we modify them in-place to render them
    df = df.copy()

    for col in molecule_columns:
        # Direct assignment to help IDE understand this is a MoleculeArray
        arr = df[col].array
        assert isinstance(arr, oepd.MoleculeArray)
        # Use preserved metadata from original DataFrame (not the copy which may have lost it)
        original_metadata = original_metadata_by_col.get(col, {})
        new_arr = arr.deepcopy()
        new_arr.metadata.update(original_metadata)
        df[col] = pd.Series(new_arr, index=df[col].index, dtype=oepd.MoleculeDtype())

    # ---------------------------------------------------
    # Molecule columns
    # ---------------------------------------------------

    if len(molecule_columns) > 0:
        # Column labels are not necessarily strings, so convert them before joining
        log.debug(f'Detected molecule columns: {", ".join(map(str, molecule_columns))}')

    # Create formatters for each column
    for col in molecule_columns:

        # Create the formatter for this column
        if col in formatters:
            log.warning(f'Overwriting existing formatter for {col} with a molecule formatter')

        # Direct assignment to help IDE understand this is a MoleculeArray
        arr = df[col].array
        assert isinstance(arr, oepd.MoleculeArray)

        # Get the cnotebook options for this column
        series_ctx = ctx if ctx is not None else get_series_context(arr.metadata)

        formatters[col] = create_mol_formatter(ctx=series_ctx)

        # Record the column width
        if col in col_space:
            log.warning(f'Column spacing for {col} already defined by overwriting with molecule image width')

        col_space[col] = float(series_ctx.width)

    # ---------------------------------------------------
    # Display columns
    # ---------------------------------------------------

    # Render columns with DisplayDtype
    display_columns = set()

    for col in df.columns:
        if isinstance(df.dtypes[col], oepd.DisplayDtype):
            display_columns.add(col)

    if len(display_columns) > 0:
        log.debug(f'Detected display columns: {", ".join(map(str, display_columns))}')

    for col in display_columns:

        # Get the underlying display array
        # Direct assignment to help IDE understand this is a DisplayArray
        arr = df[col].array
        assert isinstance(arr, oepd.DisplayArray)

        # Get column metadata
        series_ctx = ctx if ctx is not None else get_series_context(arr.metadata)

        formatters[col] = create_disp_formatter(ctx=series_ctx)

        # Widest display in the column; default=0 covers an empty column as well as a column that holds
        # no OE2DMolDisplay objects at all (max() on an empty sequence would otherwise raise ValueError)
        widths = [disp.GetWidth() for disp in arr if isinstance(disp, oedepict.OE2DMolDisplay)]
        col_space[col] = max(0, max(widths, default=0))

    # ---------------------------------------------------
    # All other columns
    # ---------------------------------------------------

    for col in df.columns:
        if col not in display_columns and col not in molecule_columns:
            formatters[col] = escape_formatter

    # escape=False because the molecule/display formatters emit HTML; other columns are escaped by their formatter
    return df.to_html(escape=False, formatters=formatters, col_space=col_space, **kwargs)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
########################################################################################################################
|
|
228
|
+
# Register Pandas formatters
|
|
229
|
+
########################################################################################################################
|
|
230
|
+
|
|
231
|
+
if ipython_present:

    def register_pandas_formatters():
        """
        Modify how the notebook is told how to display Pandas Dataframes - this actually is more flexible because it
        will still work with other custom changes to to_html().

        Registers ``render_dataframe`` as the ``text/html`` formatter for ``pd.DataFrame`` on the active
        IPython instance. When no IPython instance is active, only a debug message is logged.

        Note: Calls to this function are idempotent.
        """
        ipython_instance = get_ipython()

        if ipython_instance is None:
            log.debug("[cnotebook] iPython installed but not in use - cannot register pandas extension")
            return

        html_formatter = ipython_instance.display_formatter.formatters['text/html']

        # lookup() resolves the formatter for an *instance*; DataFrame is a class here, so the type-based
        # lookup is the one that can actually find a previously registered render_dataframe
        try:
            already_registered = html_formatter.lookup_by_type(pd.DataFrame) is render_dataframe
        except KeyError:
            # Nothing registered for DataFrame (or any of its base classes) yet
            already_registered = False

        if not already_registered:
            html_formatter.for_type(pd.DataFrame, render_dataframe)

else:

    # iPython is not present, so we do not register a Pandas formatter
    def register_pandas_formatters():
        """
        No-op used when IPython is not installed
        """
        pass
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
########################################################################################################################
|
|
261
|
+
# CNotebook Series accessor extensions for OEPandas .chem accessor
|
|
262
|
+
########################################################################################################################
|
|
263
|
+
|
|
264
|
+
def _series_highlight(
        self,
        pattern: Iterable[str] | str | oechem.OESubSearch | Iterable[oechem.OESubSearch],
        *,
        color: oechem.OEColor | oechem.OEColorIter | None = None,
        style: int | Literal["overlay_default", "overlay_ball_and_stick"] = "overlay_default",
        ref: oechem.OESubSearch | oechem.OEMCSSearch | oechem.OEQMol | Literal["first"] | oechem.OEMolBase | None = None,
        method: Literal["ss", "substructure", "mcss", "fp", "fingerprint"] | None = None
) -> None:
    """
    Highlight chemical features in a structure.

    The pattern argument can be:
        - SMARTS pattern
        - oechem.OESubSearch, oechem.OEMCSSearch or oechem.OEQMol object
        - Iterable of SMARTS patterns, oechem.OESubSearch, oechem.OEMCSSearch and/or oechem.OEQMol objects

    All patterns are validated and turned into highlighters before anything is registered, so a bad
    pattern leaves the series' rendering context untouched.

    :param pattern: Pattern(s) to highlight in the molecule.
    :param color: Highlight color(s). Can be a single oechem.OEColor or an oechem.OEColorIter
        (e.g., oechem.OEGetLightColors()). Defaults to oechem.OEGetLightColors().
    :param style: Highlight style. Can be an int (OEHighlightStyle constant) or a string
        ("overlay_default", "overlay_ball_and_stick"). Defaults to "overlay_default".
    :param ref: Optional reference for alignment.
    :param method: Optional alignment method.
    :raises TypeError: If the series is not a molecule column, or a pattern has an unsupported type.
    """
    if not isinstance(self._obj.dtype, oepd.MoleculeDtype):
        raise TypeError(
            "highlight only works on molecule columns (oepandas.MoleculeDtype). If this column has "
            "molecules, use series.chem.as_molecule() to convert to a molecule column first."
        )

    # Get the molecule array
    arr = self._obj.array
    assert isinstance(arr, oepd.MoleculeArray)

    # ********************************************************************************
    # Highlighting
    # ********************************************************************************

    query_types = (str, oechem.OESubSearch, oechem.OEMCSSearch, oechem.OEQMol)

    # Case: Pattern is a single SMARTS string or query object
    if isinstance(pattern, query_types):
        queries = [pattern]

    # Case: Pattern is an iterable (materialized once so one-shot iterators can be validated and then used)
    elif isinstance(pattern, Iterable):
        queries = list(pattern)
        for element in queries:
            if not isinstance(element, query_types):
                raise TypeError(f'Do not know how to add molecule highlight for type {type(element).__name__}')

    # Case: Pattern is an unknown type
    else:
        raise TypeError(f'Do not know how to add molecule highlight for type {type(pattern).__name__}')

    # Build every highlighter before registering any of them, so a failure cannot leave the series
    # with only part of the requested highlighting
    highlighters = [
        create_structure_highlighter(query=query, color=color, style=style)
        for query in queries
    ]

    # Get / create a series context and save it (because we are modifying it locally)
    ctx = get_series_context(arr.metadata, save=True)
    for highlighter in highlighters:
        ctx.add_callback(highlighter)

    # ********************************************************************************
    # Alignment
    # ********************************************************************************

    if ref is not None:
        self._obj.chem.align_depictions(ref=ref, method=method)
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _series_recalculate_depiction_coordinates(
        self,
        *,
        clear_coords: bool = True,
        add_depiction_hydrogens: bool = True,
        perceive_bond_stereo: bool = True,
        suppress_explicit_hydrogens: bool = True,
        orientation: int = oedepict.OEDepictOrientation_Default
) -> None:
    """
    Run ``oedepict.OEPrepareDepiction`` on every molecule of a molecule series (in place).

    Option reference:
    https://docs.eyesopen.com/toolkits/python/depicttk/OEDepictClasses/OEPrepareDepictionOptions.html

    :param clear_coords: Clear existing 2D coordinates
    :param add_depiction_hydrogens: Add explicit depiction hydrogens for faithful stereo depiction, etc.
    :param perceive_bond_stereo: Perceive wedge/hash bond stereo
    :param suppress_explicit_hydrogens: Suppress explicit hydrogens
    :param orientation: Preferred 2D orientation
    :raises TypeError: If the series does not have an oepandas.MoleculeDtype
    """
    series = self._obj

    if not isinstance(series.dtype, oepd.MoleculeDtype):
        raise TypeError(
            "recalculate_depiction_coordinates only works on molecule columns (oepandas.MoleculeDtype). If this "
            "column has molecules, use series.chem.as_molecule() to convert to a molecule column first."
        )

    # Translate the keyword arguments into OpenEye depiction options
    prep_opts = oedepict.OEPrepareDepictionOptions()
    prep_opts.SetClearCoords(clear_coords)
    prep_opts.SetAddDepictionHydrogens(add_depiction_hydrogens)
    prep_opts.SetPerceiveBondStereo(perceive_bond_stereo)
    prep_opts.SetSuppressHydrogens(suppress_explicit_hydrogens)
    prep_opts.SetDepictOrientation(orientation)

    for entry in series.array:
        # Skip anything in the array that is not a molecule
        if not isinstance(entry, oechem.OEMolBase):
            continue
        oedepict.OEPrepareDepiction(entry, prep_opts)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def _series_reset_depictions(self) -> None:
    """
    Remove the "cnotebook" entry from the metadata of a molecule series' array.

    Does nothing when the underlying array has no ``metadata`` attribute.
    """
    values = self._obj.array

    # Arrays without metadata (should not happen for oepandas arrays) have nothing to reset
    if not hasattr(values, "metadata"):
        return

    assert isinstance(values, oepd.MoleculeArray)
    values.metadata.pop("cnotebook", None)
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def _series_clear_formatting_rules(self) -> None:
    """
    Clear all formatting rule callbacks from a molecule series.

    Looks up the "cnotebook" entry in the array metadata and, when it is a CNotebookContext, calls its
    ``reset_callbacks()``. In contrast to reset_depictions, the context itself stays in the metadata, so
    only the callbacks (e.g. highlighting) are dropped.

    Does nothing when the array has no ``metadata`` attribute or no stored context.
    """
    values = self._obj.array

    if not hasattr(values, "metadata"):
        return

    assert isinstance(values, oepd.MoleculeArray)

    stored = values.metadata.get("cnotebook", None)
    if isinstance(stored, CNotebookContext):
        stored.reset_callbacks()
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _series_align_depictions(
        self,
        ref: oechem.OESubSearch | oechem.OEMCSSearch | oechem.OEMolBase | oechem.OEQMol | Literal["first"],
        method: Literal["substructure", "ss", "mcss", "fp", "fingerprint"] | None = None,
        **kwargs
) -> None:
    """
    Align the 2D coordinates of molecules (in place)

    Alignment is best effort: an exception raised while creating or applying the aligner is logged at
    debug level and not propagated. Entries of the array that are not molecules are skipped.

    :param ref: Alignment reference. The string "first" uses a copy of the first valid molecule in the series.
    :param method: Alignment method
    :param kwargs: Accepted but currently unused (not forwarded to the aligner)
    :raises TypeError: If the series does not have an oepandas.MoleculeDtype
    """
    if not isinstance(self._obj.dtype, oepd.MoleculeDtype):
        raise TypeError(
            "align_depictions only works on molecule columns (oepandas.MoleculeDtype). If this "
            "column has molecules, use series.chem.as_molecule() to convert to a molecule column first."
        )

    # Get the molecule array
    arr = self._obj.array
    assert isinstance(arr, oepd.MoleculeArray)

    if isinstance(ref, str) and ref == "first":
        # Only real molecules can serve as the reference; other entries have no IsValid()
        ref = next(
            (mol.CreateCopy() for mol in arr if isinstance(mol, oechem.OEMolBase) and mol.IsValid()),
            None
        )
        if ref is None:
            log.warning("No valid molecule found in series for depiction alignment")
            return

    # Suppress alignment warnings (there are lots of needless warnings)
    level = oechem.OEThrow.GetLevel()
    oechem.OEThrow.SetLevel(oechem.OEErrorLevel_Error)

    # noinspection PyBroadException
    try:
        # Create the aligner
        aligner = create_aligner(ref=ref, method=method)

        for mol in arr:
            # Skip non-molecule entries so that one of them cannot abort the alignment of the
            # remaining molecules through the broad exception handler below
            if isinstance(mol, oechem.OEMolBase):
                _ = aligner(mol)

    except Exception as ex:
        log.debug("Error aligning molecules: %s", ex)

    # Restore OEThrow
    finally:
        oechem.OEThrow.SetLevel(level)
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
########################################################################################################################
|
|
467
|
+
# CNotebook DataFrame accessor extensions for OEPandas .chem accessor
|
|
468
|
+
########################################################################################################################
|
|
469
|
+
|
|
470
|
+
def _dataframe_recalculate_depiction_coordinates(
        self,
        *,
        molecule_columns: str | Iterable[str] | None = None,
        clear_coords: bool = True,
        add_depction_hydrogens: bool = True,
        perceive_bond_stereo: bool = True,
        suppress_explicit_hydrogens: bool = True,
        orientation: int = oedepict.OEDepictOrientation_Default
) -> None:
    """
    Recalculate the depictions for a one or more molecule series in a DataFrame. If molecule_columns is None,
    which is the default, then all molecule columns will have their depictions recalculated

    Requested columns that are missing from the DataFrame, or that are not molecule columns, are skipped
    with a warning.

    See the following link for more information:
    https://docs.eyesopen.com/toolkits/python/depicttk/OEDepictClasses/OEPrepareDepictionOptions.html

    :param molecule_columns: Optional molecule column(s) to have depictions recalculated
    :param clear_coords: Clear existing 2D coordinates
    :param add_depction_hydrogens: Add explicit depiction hydrogens for faithful stereo depiction, etc.
        (the misspelled parameter name is kept for backward compatibility)
    :param perceive_bond_stereo: Perceive wedge/hash bond stereo
    :param suppress_explicit_hydrogens: Suppress explicit hydrogens
    :param orientation: Preferred 2D orientation
    """
    frame = self._obj

    if molecule_columns is None:
        molecule_columns = {
            col for col in frame.columns
            if isinstance(frame.dtypes[col], oepd.MoleculeDtype)
        }

    elif isinstance(molecule_columns, str):
        molecule_columns = {molecule_columns}

    else:
        molecule_columns = set(molecule_columns)

    # Recalculate the column depictions
    for col in molecule_columns:

        # Unknown column: warn and move on. The set is deliberately not modified here, because removing
        # from a set while iterating over it raises RuntimeError.
        if col not in frame.columns:
            # Column labels are not necessarily strings, so convert them before joining
            log.warning(f'{col} not found in DataFrame columns: ({", ".join(map(str, frame.columns))})')
            continue

        if not isinstance(frame.dtypes[col], oepd.MoleculeDtype):
            log.warning(f'Column {col} does not have a MoleculeDtype')
            continue

        # The series-level function spells this keyword "add_depiction_hydrogens"
        frame[col].chem.recalculate_depiction_coordinates(
            clear_coords=clear_coords,
            add_depiction_hydrogens=add_depction_hydrogens,
            perceive_bond_stereo=perceive_bond_stereo,
            suppress_explicit_hydrogens=suppress_explicit_hydrogens,
            orientation=orientation
        )
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def _dataframe_reset_depictions(self, *, molecule_columns: str | Iterable[str] | None = None) -> None:
|
|
529
|
+
"""
|
|
530
|
+
Reset depiction callbacks for one or more columns
|
|
531
|
+
"""
|
|
532
|
+
columns = set()
|
|
533
|
+
if molecule_columns is None:
|
|
534
|
+
columns.update(self._obj.columns)
|
|
535
|
+
|
|
536
|
+
elif isinstance(molecule_columns, str):
|
|
537
|
+
columns.add(molecule_columns)
|
|
538
|
+
|
|
539
|
+
else:
|
|
540
|
+
columns.update(molecule_columns)
|
|
541
|
+
|
|
542
|
+
# Filter invalid and non-molecule columns
|
|
543
|
+
for col in filter(
|
|
544
|
+
lambda c: c in self._obj.columns and isinstance(self._obj[c].dtype, oepd.MoleculeDtype),
|
|
545
|
+
columns
|
|
546
|
+
):
|
|
547
|
+
self._obj[col].chem.reset_depictions()
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
def _dataframe_clear_formatting_rules(self, molecule_columns: str | Iterable[str] | None = None) -> None:
|
|
551
|
+
"""
|
|
552
|
+
Clear all formatting rule callbacks from one or more molecule columns.
|
|
553
|
+
|
|
554
|
+
This removes any callbacks applied to molecules prior to rendering,
|
|
555
|
+
such as highlighting. Unlike reset_depictions which removes the entire
|
|
556
|
+
rendering context, this method only clears the callbacks while preserving
|
|
557
|
+
other context settings like image dimensions and styling.
|
|
558
|
+
|
|
559
|
+
:param molecule_columns: Optional molecule column(s) to clear formatting rules from.
|
|
560
|
+
If None, clears formatting rules from all molecule columns.
|
|
561
|
+
|
|
562
|
+
Example::
|
|
563
|
+
|
|
564
|
+
# Clear formatting rules from all molecule columns
|
|
565
|
+
df.chem.clear_formatting_rules()
|
|
566
|
+
|
|
567
|
+
# Clear formatting rules from a specific column
|
|
568
|
+
df.chem.clear_formatting_rules("smiles")
|
|
569
|
+
|
|
570
|
+
# Clear formatting rules from multiple columns
|
|
571
|
+
df.chem.clear_formatting_rules(["mol1", "mol2"])
|
|
572
|
+
"""
|
|
573
|
+
columns = set()
|
|
574
|
+
if molecule_columns is None:
|
|
575
|
+
columns.update(self._obj.columns)
|
|
576
|
+
|
|
577
|
+
elif isinstance(molecule_columns, str):
|
|
578
|
+
columns.add(molecule_columns)
|
|
579
|
+
|
|
580
|
+
else:
|
|
581
|
+
columns.update(molecule_columns)
|
|
582
|
+
|
|
583
|
+
# Filter invalid and non-molecule columns
|
|
584
|
+
for col in filter(
|
|
585
|
+
lambda c: c in self._obj.columns and isinstance(self._obj[c].dtype, oepd.MoleculeDtype),
|
|
586
|
+
columns
|
|
587
|
+
):
|
|
588
|
+
self._obj[col].chem.clear_formatting_rules()
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
def _dataframe_highlight(
        self,
        molecule_column: str,
        pattern: Iterable[str] | str | oechem.OESubSearch | Iterable[oechem.OESubSearch],
        *,
        color: oechem.OEColor | oechem.OEColorIter | None = None,
        style: int | Literal["overlay_default", "overlay_ball_and_stick"] = "overlay_default",
) -> None:
    """
    Highlight chemical features in molecules within a specified column.

    The pattern argument can be:
        - SMARTS pattern
        - oechem.OESubSearch or oechem.OEMCSSearch object
        - Iterable of SMARTS patterns, oechem.OESubSearch, and/or oechem.OEMCSSearch objects

    :param molecule_column: Name of the molecule column to highlight.
    :param pattern: Pattern(s) to highlight in the molecules.
    :param color: Highlight color(s). Can be a single oechem.OEColor or an oechem.OEColorIter
        (e.g., oechem.OEGetLightColors()). Defaults to oechem.OEGetLightColors().
    :param style: Highlight style. Can be an int (OEHighlightStyle constant) or a string
        ("overlay_default", "overlay_ball_and_stick"). Defaults to "overlay_default".
    :raises ValueError: If molecule_column is not a column of the DataFrame.
    :raises TypeError: If molecule_column is not a molecule column.

    Example::

        # Highlight benzene rings in the 'smiles' column
        df.chem.highlight("smiles", "c1ccccc1")

        # Highlight multiple patterns
        df.chem.highlight("smiles", ["c1ccccc1", "[OH]"])
    """
    if molecule_column not in self._obj.columns:
        # Column labels are not necessarily strings; convert them so that building the message cannot
        # itself fail and hide the ValueError
        available = ", ".join(map(str, self._obj.columns))
        raise ValueError(f'Column {molecule_column} not found in DataFrame columns: ({available})')

    if not isinstance(self._obj[molecule_column].dtype, oepd.MoleculeDtype):
        raise TypeError(
            f"highlight only works on molecule columns (oepandas.MoleculeDtype). Column '{molecule_column}' "
            f"has type {self._obj[molecule_column].dtype}."
        )

    # Delegate to the series-level highlight (which works in Pandas)
    self._obj[molecule_column].chem.highlight(pattern, color=color, style=style)
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
def _dataframe_copy_molecules(
        self,
        source_column: str,
        dest_column: str,
) -> pd.DataFrame:
    """
    Create a deep copy of molecules from one column to a new column.

    This creates independent copies of all molecules, allowing modifications
    (such as highlighting or alignment) to the new column without affecting
    the original. The new column is assigned on the DataFrame this accessor wraps, and that same
    DataFrame is returned.

    :param source_column: Name of the source molecule column.
    :param dest_column: Name of the new column to create with copied molecules.
    :returns: DataFrame with the new molecule column added.
    :raises ValueError: If source_column is not a column of the DataFrame.
    :raises TypeError: If source_column is not a molecule column.

    Example::

        # Create a copy of molecules for alignment
        df = df.chem.copy_molecules("Original", "Aligned")
        df.chem.highlight("Aligned", "c1ccccc1")
    """
    if source_column not in self._obj.columns:
        # Column labels are not necessarily strings; convert them so that building the message cannot
        # itself fail and hide the ValueError
        available = ", ".join(map(str, self._obj.columns))
        raise ValueError(f'Column {source_column} not found in DataFrame columns: ({available})')

    if not isinstance(self._obj[source_column].dtype, oepd.MoleculeDtype):
        raise TypeError(
            f"copy_molecules only works on molecule columns (oepandas.MoleculeDtype). Column '{source_column}' "
            f"has type {self._obj[source_column].dtype}."
        )

    # Use the series-level copy_molecules and assign to the new column
    self._obj[dest_column] = self._obj[source_column].chem.copy_molecules()
    return self._obj
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def _highlight_subsearches(patterns: Any) -> list:
    """Convert one pattern-column cell into a list of valid ``oechem.OESubSearch`` objects.

    Strings are split with ``SMARTS_DELIMITER_RE`` and each piece is compiled; pieces that do not
    produce a valid substructure search are dropped silently. Values that cannot be interpreted
    are reported with a logged warning.

    :param patterns: A SMARTS string, an ``oechem.OESubSearch``, or an iterable of those.
    :returns: List of valid ``oechem.OESubSearch`` objects (possibly empty).
    """
    substructures = []

    def add(item) -> bool:
        # Returns False when ``item`` is neither a string nor an OESubSearch
        if isinstance(item, str):
            for smarts in re.split(SMARTS_DELIMITER_RE, item):
                ss = oechem.OESubSearch(smarts)
                if ss.IsValid():
                    substructures.append(ss)
            return True

        if isinstance(item, oechem.OESubSearch):
            if item.IsValid():
                substructures.append(item)
            return True

        return False

    if add(patterns):
        return substructures

    if isinstance(patterns, Iterable):
        for p in patterns:
            if not add(p):
                log.warning(f'Do not know how to highlight using: {type(p).__name__}')
    else:
        log.warning(f'Do not know how to highlight using: {type(patterns).__name__}')

    return substructures


def _dataframe_highlight_using_column(
        self,
        molecule_column: str,
        pattern_column: str,
        *,
        highlighted_column: str = "highlighted_substructures",
        color: oechem.OEColor | oechem.OEColorIter | None = None,
        style: int | Literal["overlay_default", "overlay_ball_and_stick"] = "overlay_default",
        inplace: bool = False
) -> pd.DataFrame:
    """
    Highlight molecules based on the value of another column. The column produced is a DisplayArray column, so
    the results are not suitable for other molecular calculations.

    The other column can contain:
        - Comma or whitespace delimited string of SMARTS patterns
        - oechem.OESubSearch object
        - Iterable of SMARTS patterns and/or oechem.OESubSearch objects

    Values of any other kind are skipped with a logged warning (this implementation has no dedicated
    handling for oechem.OEMCSSearch objects). Rows whose molecule cell is not an oechem.OEMolBase get a
    missing value in the highlighted column.

    :param molecule_column: Name of the molecule column.
    :param pattern_column: Name of the pattern column.
    :param highlighted_column: Optional name of the column with highlighted structures.
    :param color: Highlight color(s). Can be a single oechem.OEColor or an oechem.OEColorIter
        (e.g., oechem.OEGetLightColors()). Defaults to oechem.OEGetLightColors().
    :param style: Highlight style. Can be an int (OEHighlightStyle constant) or a string
        ("overlay_default", "overlay_ball_and_stick"). Defaults to "overlay_default". Both string
        values select a ball-and-stick overlay; any other value is passed unchanged to
        oedepict.OEAddHighlighting.
    :param inplace: Modify the DataFrame in place.
    :returns: Modified DataFrame.
    :raises KeyError: If ``molecule_column`` or ``pattern_column`` is not a column of the DataFrame.
    :raises TypeError: If ``molecule_column`` does not have dtype ``oepandas.MoleculeDtype``.
    """
    # Object we are operating on
    df = self._obj if inplace else self._obj.copy()

    # Column labels may be non-strings, so stringify them before joining for error messages
    if molecule_column not in df.columns:
        available = ", ".join(map(str, df.columns))
        raise KeyError(f'{molecule_column} not found in DataFrame columns: ({available})')

    if not isinstance(df[molecule_column].dtype, oepd.MoleculeDtype):
        raise TypeError(
            f"highlight_using_column only works on molecule columns (oepandas.MoleculeDtype). If {molecule_column}"
            " has molecules, use df.chem.as_molecule() to convert to a molecule column first."
        )

    if pattern_column not in df.columns:
        available = ", ".join(map(str, df.columns))
        raise KeyError(f'{pattern_column} not found in DataFrame columns: ({available})')

    # Default color
    if color is None:
        color = oechem.OEGetLightColors()

    # Determine highlighting approach based on style
    use_overlay = isinstance(style, str) and style in ("overlay_default", "overlay_ball_and_stick")

    # Check if color is compatible with overlay
    if use_overlay and isinstance(color, oechem.OEColor):
        log.warning(
            "Overlay coloring is not compatible with a single oechem.OEColor. Falling back to standard highlighting")
        use_overlay = False
        style = oedepict.OEHighlightStyle_BallAndStick

    # Resolve the coloring once, before the row loop. ``color`` may be a one-shot color iterator;
    # re-reading it for every row would presumably hand later rows a different (or no) color.
    # NOTE(review): assumes the overlay object can be shared across displays - confirm against OEDepict docs.
    overlay = None
    highlight_color = None
    if use_overlay:
        overlay = oedepict.OEHighlightOverlayByBallAndStick(color)
    elif isinstance(color, oechem.OEColor):
        highlight_color = color
    else:
        # Traditional highlighting uses a single color: the first one offered, else light blue
        highlight_color = oechem.OELightBlue
        for c in color:
            highlight_color = oechem.OEColor(c)
            break

    # Create the display objects
    indexes = []
    displays = []

    # Get the rendering context for creating the displays
    arr = df[molecule_column].array
    assert isinstance(arr, oepd.MoleculeArray)
    ctx = get_series_context(arr.metadata)

    for idx, row in df.iterrows():
        indexes.append(idx)

        mol = row[molecule_column]
        if not isinstance(mol, oechem.OEMolBase):
            displays.append(None)
            continue

        # Create the display
        disp = oemol_to_disp(mol, ctx=ctx)

        # Parse the patterns for this row
        substructures = _highlight_subsearches(row[pattern_column])

        if use_overlay:
            # Overlay highlighting
            for ss in substructures:
                oedepict.OEAddHighlightOverlay(disp, overlay, ss.Match(mol, True))
        else:
            # Traditional highlighting
            for ss in substructures:
                for match in ss.Match(mol, True):
                    oedepict.OEAddHighlighting(disp, highlight_color, style, match)

        displays.append(disp)

    df[highlighted_column] = pd.Series(displays, index=indexes, dtype=oepd.DisplayDtype())
    return df
|
|
808
|
+
|
|
809
|
+
|
|
810
|
+
class ColorBondByOverlapScore(oegrapheme.OEBondGlyphBase):
    """Bond glyph that draws each bond in a color derived from its fingerprint overlap score.

    The score is read from generic data stored on the bond under ``tag`` and mapped to a color
    through a linear color gradient. Used by the fingerprint similarity visualization in this module.

    See: https://docs.eyesopen.com/toolkits/cookbook/python/depiction/simcalc.html
    """

    def __init__(self, cg: oechem.OELinearColorGradient, tag: int):
        """Create a bond coloring glyph.

        :param cg: Color gradient to map overlap scores to colors.
        :param tag: OEChem data tag containing overlap scores on bonds.
        """
        oegrapheme.OEBondGlyphBase.__init__(self)
        self.tag = tag
        self.colorg = cg

    # noinspection PyPep8Naming
    def RenderGlyph(self, disp, bond):
        """Draw a colored line along ``bond`` on the layer below the molecule.

        :param disp: Molecule display being rendered.
        :param bond: Bond to decorate.
        :returns: False if the bond has no visible display or no score stored under the tag,
            True once the line has been drawn.
        """
        bond_display = disp.GetBondDisplay(bond)

        # Nothing to draw for hidden bonds or bonds without a stored score
        drawable = bond_display is not None and bond_display.IsVisible() and bond.HasData(self.tag)
        if not drawable:
            return False

        # Pen: gradient color for this bond's score, width one third of the display scale
        score_color = self.colorg.GetColorAt(bond.GetData(self.tag))
        pen = oedepict.OEPen(score_color, score_color, oedepict.OEFill_Off, disp.GetScale() / 3.0)

        begin_display = disp.GetAtomDisplay(bond.GetBgn())
        end_display = disp.GetAtomDisplay(bond.GetEnd())

        below = disp.GetLayer(oedepict.OELayerPosition_Below)
        below.DrawLine(begin_display.GetCoords(), end_display.GetCoords(), pen)

        return True

    # NOTE(review): this method is named after the class, as in the OpenEye cookbook example linked
    # above; it looks like it plays the role of a copy hook - confirm whether the glyph base class
    # expects a differently named method (e.g. CreateCopy).
    # noinspection PyPep8Naming
    def ColorBondByOverlapScore(self):
        """Return a new glyph with the same gradient and tag, with Python ownership released."""
        duplicate = ColorBondByOverlapScore(self.colorg, self.tag)
        return duplicate.__disown__()
|
|
854
|
+
|
|
855
|
+
|
|
856
|
+
# Data tag used by _dataframe_fingerprint_similarity: per-bond overlap counts are stored on bonds
# under this tag and the same tag is handed to ColorBondByOverlapScore to read them back.
# Created once at import time so every call shares the same tag.
_fingerprint_overlap_tag = oechem.OEGetTag("fingerprint_overlap")
|
|
858
|
+
|
|
859
|
+
|
|
860
|
+
def _dataframe_fingerprint_similarity(
        self,
        molecule_column: str,
        ref: oechem.OEMolBase | None = None,
        *,
        tanimoto_column="fingerprint_tanimoto",
        reference_similarity_column="reference_similarity",
        target_similarity_column="target_similarity",
        fptype: str = "tree",
        num_bits: int = 4096,
        min_distance: int = 0,
        max_distance: int = 4,
        atom_type: str | int = oegraphsim.OEFPAtomType_DefaultTreeAtom,
        bond_type: str | int = oegraphsim.OEFPBondType_DefaultTreeBond,
        inplace: bool = False
) -> pd.DataFrame:
    """
    Color molecules by fingerprint similarity

    For every row with a valid molecule and a valid fingerprint, the Tanimoto similarity to the
    reference is computed and two depictions are created (one of the reference, one of the row's
    molecule) whose bonds are colored by how many fingerprint overlaps they take part in. Rows with
    a missing/invalid molecule or an invalid fingerprint get NaN in the Tanimoto column and a
    missing value in both depiction columns.

    If no usable reference molecule or reference fingerprint is available, a warning is logged and
    the DataFrame is returned without the three result columns.

    :param molecule_column: Name of the molecule column
    :param ref: Reference molecule. If None, the first valid molecule in the column is used
    :param tanimoto_column: Name of the tanimoto column
    :param reference_similarity_column: Name of the reference similarity column
    :param target_similarity_column: Name of the target similarity column
    :param fptype: Fingerprint type
    :param num_bits: Number of bits in the fingerprint
    :param min_distance: Minimum distance/radius for path/circular/tree
    :param max_distance: Maximum distance/radius for path/circular/tree
    :param atom_type: Atom type string delimited by "|" OR int bitmask from the oegraphsim.OEFPAtomType_ namespace
    :param bond_type: Bond type string delimited by "|" OR int bitmask from the oegraphsim.OEFPBondType_ namespace
    :param inplace: Modify the DataFrame in place
    :return: DataFrame with similarity columns
    :raises KeyError: If ``molecule_column`` is not a column of the DataFrame
    :raises TypeError: If ``molecule_column`` does not have dtype ``oepandas.MoleculeDtype``
    """
    tag = _fingerprint_overlap_tag

    # Preprocess
    df = self._obj if inplace else self._obj.copy()

    if molecule_column not in df.columns:
        raise KeyError(f'Molecule column not found in DataFrame: {molecule_column}')

    if not isinstance(df[molecule_column].dtype, oepd.MoleculeDtype):
        raise TypeError("Column {} does not have dtype oepd.MoleculeDtype ({})".format(
            molecule_column, str(df[molecule_column].dtype)))

    # Get the context
    arr = self._obj[molecule_column].array
    assert isinstance(arr, oepd.MoleculeArray)
    ctx = get_series_context(arr.metadata)

    # If we're using the first molecule as our reference
    if ref is None:
        for mol in arr:  # type: oechem.OEMol
            # Guard against missing entries the same way the main loop below does
            if mol is not None and mol.IsValid():
                ref = mol
                break
        else:
            log.warning(f'No valid reference molecules to use for alignment in column {molecule_column}')
            return df

    # Check reference molecule
    if not ref.IsValid():
        log.warning("Reference molecule is not valid")
        return df

    # Fingerprint maker
    make_fp = fingerprint_maker(
        fptype=fptype,
        num_bits=num_bits,
        min_distance=min_distance,
        max_distance=max_distance,
        atom_type=atom_type,
        bond_type=bond_type
    )

    # Make the reference fingerprint
    ref_fp = make_fp(ref)

    if not ref_fp.IsValid():
        log.warning("Fingerprint from reference molecule is invalid")
        return df

    # Create the display objects
    ref_displays = []
    targ_displays = []

    # FIXME: See note below regarding the fact we have to cache the reference and target molecule copies
    ref_molecules = []
    targ_molecules = []

    tanimotos = []
    index = []

    missing_tanimoto = float("nan")

    for idx, mol in df[molecule_column].items():  # type: Hashable, oechem.OEMol
        index.append(idx)

        # Molecule was invalid: keep all three result lists aligned with the index
        if mol is None or not mol.IsValid():
            tanimotos.append(missing_tanimoto)
            ref_displays.append(None)
            targ_displays.append(None)
            continue

        # Copy the molecules, because we're modifying them
        targ_mol = oechem.OEMol(mol)
        ref_mol = oechem.OEMol(ref)

        # FIXME: See note below regarding the fact we have to cache the reference and target molecule copies
        targ_molecules.append(targ_mol)
        ref_molecules.append(ref_mol)

        # Create the fingerprint
        targ_fp = make_fp(targ_mol)

        # Fingerprint was invalid: keep all three result lists aligned with the index
        if not targ_fp.IsValid():
            tanimotos.append(missing_tanimoto)
            ref_displays.append(None)
            targ_displays.append(None)
            continue

        # Add the tanimoto
        tanimotos.append(oegraphsim.OETanimoto(ref_fp, targ_fp))

        # Per-bond overlap counters, indexed by bond index
        targ_bonds = oechem.OEUIntArray(targ_mol.GetMaxBondIdx())
        ref_bonds = oechem.OEUIntArray(ref_mol.GetMaxBondIdx())

        # Overlaps
        overlaps = oegraphsim.OEGetFPOverlap(ref_mol, targ_mol, ref_fp.GetFPTypeBase())

        for match in overlaps:
            for bond in match.GetPatternBonds():
                ref_bonds[bond.GetIdx()] += 1
            for bond in match.GetTargetBonds():
                targ_bonds[bond.GetIdx()] += 1

        for bond in targ_mol.GetBonds():
            bond.SetData(tag, targ_bonds[bond.GetIdx()])

        for bond in ref_mol.GetBonds():
            bond.SetData(tag, ref_bonds[bond.GetIdx()])

        # ``default=0`` keeps this from raising when a counter array is empty
        # (presumably the case for a molecule without bonds)
        # noinspection PyTypeChecker
        maxvalue = max(0, max(targ_bonds, default=0), max(ref_bonds, default=0))

        # Create the color gradient
        colorg = oechem.OELinearColorGradient()
        colorg.AddStop(oechem.OEColorStop(0.0, oechem.OEPinkTint))
        colorg.AddStop(oechem.OEColorStop(1.0, oechem.OEYellow))
        colorg.AddStop(oechem.OEColorStop(maxvalue, oechem.OEDarkGreen))

        # Function that will color the bonds
        bondglyph = ColorBondByOverlapScore(colorg, tag)

        # Align the molecules
        oedepict.OEPrepareDepiction(ref_mol, False)
        oedepict.OEPrepareDepiction(targ_mol, False)

        # The overlaps are requested again here for the aligned depiction
        overlaps = oegraphsim.OEGetFPOverlap(ref_mol, targ_mol, ref_fp.GetFPTypeBase())
        oedepict.OEPrepareMultiAlignedDepiction(targ_mol, ref_mol, overlaps)

        # Create the displays
        ref_disp = oemol_to_disp(ref_mol, ctx=ctx)
        targ_disp = oemol_to_disp(targ_mol, ctx=ctx)

        # Color the displays
        oegrapheme.OEAddGlyph(ref_disp, bondglyph, oechem.IsTrueBond())
        oegrapheme.OEAddGlyph(targ_disp, bondglyph, oechem.IsTrueBond())

        ref_displays.append(ref_disp)
        targ_displays.append(targ_disp)

    # Add the columns
    df[tanimoto_column] = pd.Series(
        tanimotos,
        index=index,
        dtype=float
    )

    # FIXME: Submitted to OpenEye as Case #00037423
    # We need to keep the copies of the molecules that we made above, or they will be garbage collected
    # and the OE2DMolDisplay objects will segfault. We'll keep those in the metadata now for the arrays.
    ref_arr = oepd.DisplayArray(ref_displays, metadata={"molecules": ref_molecules})
    targ_arr = oepd.DisplayArray(targ_displays, metadata={"molecules": targ_molecules})

    df[reference_similarity_column] = pd.Series(
        ref_arr,
        index=shallow_copy(index),
        dtype=oepd.DisplayDtype()
    )

    df[target_similarity_column] = pd.Series(
        targ_arr,
        index=shallow_copy(index),
        dtype=oepd.DisplayDtype()
    )

    return df
|
|
1056
|
+
|
|
1057
|
+
|
|
1058
|
+
########################################################################################################################
# Monkey-patch CNotebook methods onto OEPandas accessors
########################################################################################################################

# Import the OEPandas accessor classes. This import sits here, next to the assignments that use it,
# rather than at the top of the file.
from oepandas.pandas_extensions import OESeriesAccessor, OEDataFrameAccessor

# Add cnotebook methods to Series accessor. Each function is assigned as a class attribute, so it
# becomes a method of the accessor and receives the accessor instance as ``self``.
OESeriesAccessor.highlight = _series_highlight
OESeriesAccessor.recalculate_depiction_coordinates = _series_recalculate_depiction_coordinates
OESeriesAccessor.reset_depictions = _series_reset_depictions
OESeriesAccessor.clear_formatting_rules = _series_clear_formatting_rules
OESeriesAccessor.align_depictions = _series_align_depictions

# Add cnotebook methods to DataFrame accessor (the docstring examples in this file call these
# as ``df.chem.<name>(...)``).
OEDataFrameAccessor.recalculate_depiction_coordinates = _dataframe_recalculate_depiction_coordinates
OEDataFrameAccessor.reset_depictions = _dataframe_reset_depictions
OEDataFrameAccessor.clear_formatting_rules = _dataframe_clear_formatting_rules
OEDataFrameAccessor.copy_molecules = _dataframe_copy_molecules
OEDataFrameAccessor.highlight = _dataframe_highlight
OEDataFrameAccessor.highlight_using_column = _dataframe_highlight_using_column
OEDataFrameAccessor.fingerprint_similarity = _dataframe_fingerprint_similarity
|
|
1080
|
+
|
|
1081
|
+
|
|
1082
|
+
########################################################################################################################
|
|
1083
|
+
# MolGrid accessor methods for Series and DataFrame
|
|
1084
|
+
########################################################################################################################
|
|
1085
|
+
|
|
1086
|
+
def _series_molgrid(
        self,
        title_field: str = "Title",
        tooltip_fields: list = None,
        **kwargs
):
    """Build an interactive molecule grid from the Series wrapped by this accessor.

    If the Series carries a ``_cacher`` entry, its second element is called and the result is
    handed to the grid as the accompanying DataFrame (presumably a weak reference to the parent
    DataFrame - verify against the pandas version in use); otherwise no DataFrame is passed.

    :param title_field: Field for title (molecule property or DataFrame column).
    :param tooltip_fields: Fields for tooltip.
    :param kwargs: Additional arguments passed to MolGrid.
    :returns: MolGrid instance.
    """
    from cnotebook import MolGrid

    series = self._obj
    molecules = list(series)

    # Look for a parent DataFrame through the Series' private cache reference
    # noinspection PyProtectedMember
    cacher = getattr(series, '_cacher', None)
    parent = None
    if cacher is not None:
        try:
            parent = cacher[1]()
        except (TypeError, KeyError):
            # Best effort only: fall back to rendering without a DataFrame
            parent = None

    return MolGrid(
        molecules,
        dataframe=parent,
        mol_col=series.name,
        title_field=title_field,
        tooltip_fields=tooltip_fields,
        **kwargs
    )
|
|
1122
|
+
|
|
1123
|
+
|
|
1124
|
+
def _dataframe_molgrid(
        self,
        mol_col: str,
        title_field: str = "Title",
        tooltip_fields: list = None,
        **kwargs
):
    """Build an interactive molecule grid from one column of the wrapped DataFrame.

    The values of ``mol_col`` are collected into a list and passed to MolGrid together with the
    DataFrame itself.

    :param mol_col: Column containing molecules.
    :param title_field: Column for title display.
    :param tooltip_fields: Columns for tooltip.
    :param kwargs: Additional arguments passed to MolGrid.
    :returns: MolGrid instance.
    """
    from cnotebook import MolGrid

    frame = self._obj
    grid_options = dict(
        dataframe=frame,
        mol_col=mol_col,
        title_field=title_field,
        tooltip_fields=tooltip_fields,
    )

    return MolGrid(list(frame[mol_col]), **grid_options, **kwargs)
|
|
1152
|
+
|
|
1153
|
+
|
|
1154
|
+
# Add molgrid methods to accessors: assigned as class attributes, so both functions become
# accessor methods named ``molgrid`` and receive the accessor instance as ``self``.
OESeriesAccessor.molgrid = _series_molgrid
OEDataFrameAccessor.molgrid = _dataframe_molgrid
|