cnotebook 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,900 @@
1
+ import re
2
+ import logging
3
+ import typing
4
+ import pandas as pd
5
+ import oepandas as oepd
6
+ from pandas.api.extensions import register_dataframe_accessor, register_series_accessor
7
+ from typing import Iterable, Any, Literal, Hashable
8
+ from openeye import oechem, oedepict, oegraphsim, oegrapheme
9
+ from copy import copy as shallow_copy
10
+ from .context import pass_cnotebook_context, get_series_context
11
+ from .helpers import escape_brackets, create_structure_highlighter
12
+ from .align import create_aligner, fingerprint_maker
13
+ from .render import (
14
+ CNotebookContext,
15
+ oemol_to_disp,
16
+ oedisp_to_html,
17
+ render_invalid_molecule,
18
+ render_empty_molecule
19
+ )
20
+
21
+ # Only register iPython formatters if that is present
22
+ try:
23
+ # noinspection PyProtectedMember,PyPackageRequirements
24
+ from IPython import get_ipython
25
+ ipython_present = True
26
+ except ModuleNotFoundError:
27
+ ipython_present = False
28
+
29
+ if typing.TYPE_CHECKING:
30
+ from .context import CNotebookContext
31
+
32
+
33
# Delimiter between SMARTS patterns packed into a single string: a run of one or more pipe, carriage-return,
# newline, or tab characters, together with any whitespace directly around that run. A space or a comma on its
# own is NOT treated as a delimiter by this expression.
SMARTS_DELIMITER_RE = re.compile(r'\s*[|\r\n\t]+\s*')

# Logger named "cnotebook" used by the functions and accessors in this module
log = logging.getLogger("cnotebook")
36
+
37
+
38
def create_mol_formatter(*, ctx: CNotebookContext) -> typing.Callable[[oechem.OEMolBase], str]:
    """
    Closure that creates a function that renders an OEMol to HTML

    The returned function handles four cases:
        - valid molecule: depicted with ``ctx``, passed through every callback in ``ctx.callbacks``, rendered to HTML
        - molecule that is not valid and has no atoms: rendered with ``render_empty_molecule``
        - any other molecule that is not valid: rendered with ``render_invalid_molecule``
        - any object that is not an ``oechem.OEMolBase``: returned as ``str(obj)``

    :param ctx: CNotebook rendering context
    :return: Function that renders molecules to HTML
    """
    def _oemol_to_html(mol: oechem.OEMolBase) -> str:
        # Non-molecule cell values fall back to their plain string form
        if not isinstance(mol, oechem.OEMolBase):
            return str(mol)

        # Render valid molecules
        if mol.IsValid():
            # Create the display object
            disp = oemol_to_disp(mol, ctx=ctx)

            # Apply display callbacks
            if ctx.callbacks is not None:
                for callback in ctx.callbacks:
                    callback(disp)

            # Render with the same context that was used to build the display, exactly like the display
            # formatter in this module does (previously the context was dropped at this point)
            return oedisp_to_html(disp, ctx=ctx)

        # Empty molecule
        if mol.NumAtoms() == 0:
            return render_empty_molecule(ctx=ctx)

        # Invalid molecule
        return render_invalid_molecule(ctx=ctx)

    return _oemol_to_html
71
+
72
+
73
@pass_cnotebook_context
def create_disp_formatter(
        *,
        callbacks: list[typing.Callable[[oedepict.OE2DMolDisplay], None]] | None = None,
        ctx: CNotebookContext
) -> typing.Callable[[oedepict.OE2DMolDisplay], str]:
    """
    Closure that creates a function that renders an OE2DMolDisplay to HTML

    Objects that are not a valid ``oedepict.OE2DMolDisplay`` are returned as ``str(obj)``.

    :param ctx: Render context
    :param callbacks: List of callbacks to modify the rendering of the molecule
    :return: Function that renders molecule displays to HTML
    """

    def _oedisp_to_html(disp: oedepict.OE2DMolDisplay) -> str:
        renderable = isinstance(disp, oedepict.OE2DMolDisplay) and disp.IsValid()
        if not renderable:
            return str(disp)

        # Work on a copy so the callbacks never touch the caller's display object
        # TODO: Update with ctx
        working_copy = oedepict.OE2DMolDisplay(disp)

        # Apply display callbacks
        for modify in (callbacks if callbacks is not None else ()):
            modify(working_copy)

        return oedisp_to_html(working_copy, ctx=ctx)

    return _oedisp_to_html
102
+
103
+
104
def escape_formatter(obj: Any) -> str:
    """
    Convert an arbitrary object to text and pass that text through ``escape_brackets``
    :param obj: Object to format
    :return: Result of ``escape_brackets`` on the string form of the object
    """
    text = str(obj)
    return escape_brackets(text)
106
+
107
+
108
def render_dataframe(
        df: pd.DataFrame,
        formatters: dict | None = None,
        col_space: dict[str, float | int] | None = None,
        **kwargs
) -> str:
    """
    Render a DataFrame with molecules

    Columns with ``oepandas.MoleculeDtype`` get a molecule formatter, columns with ``oepandas.DisplayDtype`` get a
    display formatter, and every other column gets ``escape_formatter``. Any formatter or column spacing supplied
    by the caller for a molecule / display column is overwritten. The caller's ``formatters`` and ``col_space``
    dictionaries are never modified.

    :param df: DataFrame to render
    :param formatters: Custom formatters for displaying columns
    :param col_space: Custom column spacing
    :param kwargs: Additional keyword arguments for DataFrame.to_html
    :return: HTML of rendered DataFrame
    """
    # Work on private copies so that the dictionaries passed in by the caller are left untouched
    formatters = dict(formatters) if formatters else {}
    col_space = dict(col_space) if col_space else {}

    # Columns with MoleculeDtype
    molecule_columns = {col for col in df.columns if isinstance(df.dtypes[col], oepd.MoleculeDtype)}

    # We need to copy both the DataFrame and the molecules, because we modify them in-place to render them
    df = df.copy()

    for col in molecule_columns:
        # Direct assignment to help IDE understand this is a MoleculeArray
        arr = df[col].array
        assert isinstance(arr, oepd.MoleculeArray)
        df[col] = pd.Series(arr.deepcopy(), index=df[col].index, dtype=oepd.MoleculeDtype())

    # ---------------------------------------------------
    # Molecule columns
    # ---------------------------------------------------

    if molecule_columns:
        # Column labels are not guaranteed to be strings, so convert them before joining
        log.debug(f'Detected molecule columns: {", ".join(map(str, molecule_columns))}')

    # Create formatters for each column
    for col in molecule_columns:

        if col in formatters:
            log.warning(f'Overwriting existing formatter for {col} with a molecule formatter')

        # Direct assignment to help IDE understand this is a MoleculeArray
        arr = df[col].array
        assert isinstance(arr, oepd.MoleculeArray)

        # Get the cnotebook options for this column
        ctx = get_series_context(arr.metadata)

        formatters[col] = create_mol_formatter(ctx=ctx)

        # Record the column width
        if col in col_space:
            log.warning(f'Column spacing for {col} already defined by overwriting with molecule image width')

        col_space[col] = float(ctx.width)

    # ---------------------------------------------------
    # Display columns
    # ---------------------------------------------------

    display_columns = {col for col in df.columns if isinstance(df.dtypes[col], oepd.DisplayDtype)}

    if display_columns:
        log.debug(f'Detected display columns: {", ".join(map(str, display_columns))}')

    for col in display_columns:

        # Direct assignment to help IDE understand this is a DisplayArray
        arr = df[col].array
        assert isinstance(arr, oepd.DisplayArray)

        # Get column metadata
        ctx = get_series_context(arr.metadata)

        formatters[col] = create_disp_formatter(ctx=ctx)

        # Widest display in the column; ``default=0`` covers both an empty column and a column that only
        # contains missing values (a bare max() would raise ValueError in the latter case)
        widths = (disp.GetWidth() for disp in arr if isinstance(disp, oedepict.OE2DMolDisplay))
        col_space[col] = max(0, max(widths, default=0))

    # ---------------------------------------------------
    # All other columns
    # ---------------------------------------------------

    for col in df.columns:
        if col not in display_columns and col not in molecule_columns:
            formatters[col] = escape_formatter

    # escape=False because molecule/display cells are raw HTML; all other cells go through escape_formatter
    return df.to_html(escape=False, formatters=formatters, col_space=col_space, **kwargs)
212
+
213
+
214
+ ########################################################################################################################
215
+ # Register Pandas formatters
216
+ ########################################################################################################################
217
+
218
if ipython_present:

    def register_pandas_formatters():
        """
        Modify how the notebook is told how to display Pandas Dataframes - this actually is more flexible because it
        will still work with other custom changes to to_html().

        If iPython is importable but there is no active iPython instance, nothing is registered and a debug
        message is logged.

        Note: Calls to this function are idempotent.
        """
        ipython_instance = get_ipython()

        if ipython_instance is None:
            log.debug("[cnotebook] iPython installed but not in use - cannot register pandas extension")
            return

        html_formatter = ipython_instance.display_formatter.formatters['text/html']

        # lookup_by_type() takes a type; lookup() takes an *instance* and would therefore search for a formatter
        # registered for type(pd.DataFrame) rather than for pd.DataFrame itself
        try:
            current = html_formatter.lookup_by_type(pd.DataFrame)
        except KeyError:
            current = None

        if current is not render_dataframe:
            html_formatter.for_type(pd.DataFrame, render_dataframe)

else:

    # iPython is not present, so we do not register a Pandas formatter
    def register_pandas_formatters():
        """
        No-op used when iPython is not installed
        """
        pass
245
+
246
+
247
+ ########################################################################################################################
248
+ # Series accessors
249
+ ########################################################################################################################
250
+
251
@register_series_accessor("highlight")
class SeriesHighlightAccessor:
    """
    Series accessor (``series.highlight(...)``) that registers substructure highlighting callbacks on the
    rendering context stored in the metadata of a molecule series.
    """
    def __init__(self, pandas_obj: pd.Series):
        if not isinstance(pandas_obj.dtype, oepd.MoleculeDtype):
            raise TypeError(
                "highlight only works on molecule columns (oepandas.MoleculeDtype). If this column has "
                "molecules, use pd.Series.as_molecule to convert to a molecule column first."
            )

        self._obj = pandas_obj

    def __call__(
            self,
            pattern: Iterable[str] | str | oechem.OESubSearch | Iterable[oechem.OESubSearch],
            *,
            color: oechem.OEColor = oechem.OEColor(oechem.OELightBlue),
            style: int = oedepict.OEHighlightStyle_Stick,
            ref: oechem.OESubSearch | oechem.OEMCSSearch | oechem.OEQMol | Literal["first"] | oechem.OEMolBase | None = None,  # noqa
            method: Literal["ss", "substructure", "mcss", "fp", "fingerprint"] | None = None
    ) -> None:
        """
        Highlight chemical features in a structure

        The pattern argument can be:
            - SMARTS pattern
            - oechem.OESubSearch, oechem.OEMCSSearch, or oechem.OEQMol object
            - Iterable of SMARTS patterns, oechem.OESubSearch, oechem.OEMCSSearch, and/or oechem.OEQMol objects

        One highlighting callback per pattern is added to the series rendering context. All patterns are
        checked before anything is registered, so a failing call leaves the context untouched.

        :param pattern: Pattern(s) to highlight in the molecule
        :param color: Highlight color
        :param style: Highlight style
        :param ref: Optional reference; when given, the series' align_depictions accessor is called with it
        :param method: Optional alignment method forwarded to align_depictions together with ref
        :raises TypeError: If the pattern (or an element of an iterable pattern) has an unsupported type
        """
        # Get the molecule array
        # Direct assignment to help IDE understand this is a MoleculeArray
        arr = self._obj.array
        assert isinstance(arr, oepd.MoleculeArray)

        # ********************************************************************************
        # Highlighting
        # ********************************************************************************

        query_types = (str, oechem.OESubSearch, oechem.OEMCSSearch, oechem.OEQMol)

        # Case: Pattern is a single query (checked first, because a str is itself an Iterable)
        if isinstance(pattern, query_types):
            queries = [pattern]

        # Case: Pattern is an iterable of queries (materialized so that generators can be validated up front)
        elif isinstance(pattern, Iterable):
            queries = list(pattern)
            for element in queries:
                if not isinstance(element, query_types):
                    raise TypeError(f'Do not know how to add molecule highlight for type {type(element).__name__}')

        # Case: Pattern is an unknown type
        else:
            raise TypeError(f'Do not know how to add molecule highlight for type {type(pattern).__name__}')

        # Build every highlighter before touching the context
        highlighters = [
            create_structure_highlighter(query=query, color=color, style=style)
            for query in queries
        ]

        # Get / create a series context and save it (because we are modifying it locally)
        ctx = get_series_context(arr.metadata, save=True)

        for highlighter in highlighters:
            ctx.add_callback(highlighter)

        # ********************************************************************************
        # Alignment
        # ********************************************************************************

        if ref is not None:
            self._obj.align_depictions(ref=ref, method=method)
334
+
335
+
336
@register_series_accessor("recalculate_depiction_coordinates")
class SeriesRecalculateDepictionCoordinatesAccessor:
    """
    Series accessor (``series.recalculate_depiction_coordinates(...)``) that runs ``oedepict.OEPrepareDepiction``
    on every molecule of a molecule series, modifying the molecules in place.
    """
    def __init__(self, pandas_obj: pd.Series):
        if not isinstance(pandas_obj.dtype, oepd.MoleculeDtype):
            raise TypeError(
                "recalculate_depiction_coordinates only works on molecule columns (oepandas.MoleculeDtype). If this "
                "column has molecules, use pd.Series.as_molecule to convert to a molecule column first."
            )

        self._obj = pandas_obj

    def __call__(
            self,
            *,
            clear_coords: bool = True,
            add_depction_hydrogens: bool = True,
            perceive_bond_stereo: bool = True,
            suppress_explicit_hydrogens: bool = True,
            orientation: int = oedepict.OEDepictOrientation_Default
    ) -> None:
        """
        Recalculate the depictions for a molecule series.

        See the following link for more information:
        https://docs.eyesopen.com/toolkits/python/depicttk/OEDepictClasses/OEPrepareDepictionOptions.html

        :param clear_coords: Clear existing 2D coordinates
        :param add_depction_hydrogens: Add explicit depiction hydrogens for faithful stereo depiction, etc.
        :param perceive_bond_stereo: Perceive wedge/hash bond stereo
        :param suppress_explicit_hydrogens: Suppress explicit hydrogens
        :param orientation: Preferred 2D orientation
        """
        # Create the depiction options - every keyword argument is applied (the stereo, hydrogen suppression
        # and orientation arguments were previously accepted but silently ignored)
        opts = oedepict.OEPrepareDepictionOptions()
        opts.SetClearCoords(clear_coords)
        opts.SetAddDepictionHydrogens(add_depction_hydrogens)
        opts.SetPerceiveBondStereo(perceive_bond_stereo)
        opts.SetSuppressHydrogens(suppress_explicit_hydrogens)
        opts.SetDepictOrientation(orientation)

        # Missing values in the array are not molecules and are skipped
        for mol in self._obj.array:
            if isinstance(mol, oechem.OEMolBase):
                oedepict.OEPrepareDepiction(mol, opts)
376
+
377
+
378
@register_series_accessor("reset_depictions")
class SeriesResetDepictionsAccessor:
    """
    Series accessor (``series.reset_depictions()``) that removes the "cnotebook" entry from the metadata of the
    array backing the series.
    """
    def __init__(self, pandas_obj: pd.Series):
        self._obj = pandas_obj

    def __call__(self) -> None:
        """
        Reset depiction callbacks for a molecule series

        Series whose backing array has no ``metadata`` attribute are left untouched.
        """
        # Duck-typed on purpose: this accessor is registered for every Series, so only the presence of a
        # metadata mapping matters (an assert on the concrete array type would contradict that check and
        # would also vanish under ``python -O``)
        metadata = getattr(self._obj.array, "metadata", None)

        if metadata is not None:
            metadata.pop("cnotebook", None)
393
+
394
+
395
@register_series_accessor("align_depictions")
class SeriesAlignDepictionsAccessor:
    """
    Series accessor (``series.align_depictions(...)``) that aligns the 2D depiction coordinates of the molecules
    in a molecule series to a reference, modifying the molecules in place.
    """
    def __init__(self, pandas_obj: pd.Series):
        if not isinstance(pandas_obj.dtype, oepd.MoleculeDtype):
            raise TypeError(
                "align_depictions only works on molecule columns (oepandas.MoleculeDtype). If this "
                "column has molecules, use pd.Series.as_molecule to convert to a molecule column first."
            )

        self._obj = pandas_obj

    def __call__(
            self,
            ref: oechem.OESubSearch | oechem.OEMCSSearch | oechem.OEMolBase | oechem.OEQMol | Literal["first"],
            method: Literal["substructure", "ss", "mcss", "fp", "fingerprint"] | None = None,
            **kwargs
    ) -> None:
        """
        Align the 2D coordinates of molecules

        Alignment is best-effort: a failure to build the aligner, or to align an individual molecule, is logged
        at debug level and leaves the affected structure(s) unaligned.

        :param ref: Alignment reference, or "first" to use a copy of the first valid molecule in the series
        :param method: Alignment method passed to create_aligner
        :param kwargs: Accepted for call compatibility; currently not forwarded to the aligner
        """
        # TODO: Maybe do this smarter so that you know if the context is column-level, which means you could copy that
        #       context into the new DisplayArray that you'll create below? Or even link the contexts?

        # Direct assignment to help IDE understand this is a MoleculeArray
        arr = self._obj.array
        assert isinstance(arr, oepd.MoleculeArray)

        if isinstance(ref, str) and ref == "first":
            first_valid = next((mol for mol in arr if mol is not None and mol.IsValid()), None)

            if first_valid is None:
                log.warning("No valid molecule found in series for depiction alignment")
                return

            # Copy, so that the reference is not moved while the series molecules are being aligned
            ref = first_valid.CreateCopy()

        # Suppress alignment warnings (there are lots of needless warnings)
        level = oechem.OEThrow.GetLevel()
        oechem.OEThrow.SetLevel(oechem.OEErrorLevel_Error)

        try:
            # Create the aligner
            # noinspection PyBroadException
            try:
                aligner = create_aligner(ref=ref, method=method)
            except Exception:
                log.debug("Could not create depiction aligner - structures are left unaligned", exc_info=True)
                return

            for mol in arr:
                # Missing values cannot be aligned
                if not isinstance(mol, oechem.OEMolBase):
                    continue

                # One molecule failing must not prevent the remaining molecules from being aligned
                # noinspection PyBroadException
                try:
                    _ = aligner(mol)
                except Exception:
                    log.debug("Depiction alignment failed for a molecule - leaving it unaligned", exc_info=True)

        # Restore OEThrow
        finally:
            oechem.OEThrow.SetLevel(level)
455
+
456
+
457
+ ########################################################################################################################
458
+ # DataFrame accessors
459
+ ########################################################################################################################
460
+
461
@register_dataframe_accessor("recalculate_depiction_coordinates")
class SeriesRecalculateDepictionCoordinatesAccessor:
    """
    DataFrame accessor (``df.recalculate_depiction_coordinates(...)``) that recalculates depiction coordinates for
    one or more molecule columns by delegating to the Series accessor of the same registered name.

    NOTE(review): despite the "Series" prefix this class is registered on DataFrames, and its name is identical
    to the Series accessor class registered under the same accessor name; the class name is kept for
    backward compatibility.
    """
    def __init__(self, pandas_obj: pd.DataFrame):
        self._obj = pandas_obj

    def __call__(
            self,
            *,
            molecule_columns: str | Iterable[str] | None = None,
            clear_coords: bool = True,
            add_depction_hydrogens: bool = True,
            perceive_bond_stereo: bool = True,
            suppress_explicit_hydrogens: bool = True,
            orientation: int = oedepict.OEDepictOrientation_Default
    ) -> None:
        """
        Recalculate the depictions for a one or more molecule series in a DataFrame. If molecule_columns is None,
        which is the default, then all molecule columns will have their depictions recalculated

        Requested columns that do not exist, or that do not have a MoleculeDtype, are skipped with a warning.

        See the following link for more information:
        https://docs.eyesopen.com/toolkits/python/depicttk/OEDepictClasses/OEPrepareDepictionOptions.html

        :param molecule_columns: Optional molecule column(s) to have depictions recalculated
        :param clear_coords: Clear existing 2D coordinates
        :param add_depction_hydrogens: Add explicit depiction hydrogens for faithful stereo depiction, etc.
        :param perceive_bond_stereo: Perceive wedge/hash bond stereo
        :param suppress_explicit_hydrogens: Suppress explicit hydrogens
        :param orientation: Preferred 2D orientation
        """
        if molecule_columns is None:
            selected = [
                col for col in self._obj.columns
                if isinstance(self._obj.dtypes[col], oepd.MoleculeDtype)
            ]

        elif isinstance(molecule_columns, str):
            selected = [molecule_columns]

        else:
            # De-duplicate while keeping the order given by the caller
            selected = list(dict.fromkeys(molecule_columns))

        # Recalculate the column depictions. The selection is never modified while it is being iterated
        # (removing unknown columns from a set inside the loop raised RuntimeError)
        for col in selected:

            if col not in self._obj.columns:
                # Column labels are not guaranteed to be strings, so convert them before joining
                log.warning(f'{col} not found in DataFrame columns: ({", ".join(map(str, self._obj.columns))})')
                continue

            if not isinstance(self._obj.dtypes[col], oepd.MoleculeDtype):
                log.warning(f'Column {col} does not have a MoleculeDtype')
                continue

            self._obj[col].recalculate_depiction_coordinates(
                clear_coords=clear_coords,
                add_depction_hydrogens=add_depction_hydrogens,
                perceive_bond_stereo=perceive_bond_stereo,
                suppress_explicit_hydrogens=suppress_explicit_hydrogens,
                orientation=orientation
            )
522
+
523
+
524
+ @register_dataframe_accessor("reset_depictions")
525
+ class SeriesResetDepictionsAccessor:
526
+ def __init__(self, pandas_obj: pd.DataFrame):
527
+ self._obj = pandas_obj
528
+
529
+ def __call__(self, *, molecule_columns: str | Iterable[str] | None = None) -> None:
530
+ """
531
+ Reset depiction callbacks for one or more columns
532
+ """
533
+ columns = set()
534
+ if molecule_columns is None:
535
+ columns.update(self._obj.columns)
536
+
537
+ elif isinstance(molecule_columns, str):
538
+ columns.add(molecule_columns)
539
+
540
+ else:
541
+ columns.update(molecule_columns)
542
+
543
+ # Filter invalid and non-molecule columns
544
+ for col in filter(
545
+ lambda c: c in self._obj.columns and isinstance(self._obj[c].dtype, oepd.MoleculeDtype),
546
+ columns
547
+ ):
548
+ self._obj[col].reset_depictions()
549
+
550
+
551
@register_dataframe_accessor("highlight_using_column")
class HighlightUsingColumnAccessor:
    """
    DataFrame accessor (``df.highlight_using_column(...)``) that creates a display column in which every molecule
    is highlighted with the substructure pattern(s) found in another column of the same row.
    """
    def __init__(self, pandas_obj: pd.DataFrame):
        self._obj = pandas_obj

    @staticmethod
    def _smarts_to_subsearches(text: str) -> list:
        """Split a delimited SMARTS string and return a substructure search for every valid pattern in it"""
        searches = []

        for smarts in SMARTS_DELIMITER_RE.split(text):
            # Leading / trailing delimiters produce empty fragments
            if not smarts:
                continue

            ss = oechem.OESubSearch(smarts)
            if ss.IsValid():
                searches.append(ss)

        return searches

    @classmethod
    def _parse_patterns(cls, patterns) -> list:
        """Convert the value of a pattern cell into a list of valid oechem.OESubSearch objects"""
        # Missing value (None / NaN): nothing to highlight
        if patterns is None or (isinstance(patterns, float) and patterns != patterns):
            return []

        if isinstance(patterns, str):
            return cls._smarts_to_subsearches(patterns)

        if isinstance(patterns, oechem.OESubSearch):
            return [patterns] if patterns.IsValid() else []

        if isinstance(patterns, Iterable):
            searches = []

            for p in patterns:
                if isinstance(p, str):
                    searches.extend(cls._smarts_to_subsearches(p))

                elif isinstance(p, oechem.OESubSearch):
                    if p.IsValid():
                        searches.append(p)

                else:
                    log.warning(f'Do not know how to highlight using: {type(p).__name__}')

            return searches

        log.warning(f'Do not know how to highlight using: {type(patterns).__name__}')
        return []

    def __call__(
            self,
            molecule_column: str,
            pattern_column: str,
            *,
            highlighted_column: str = "highlighted_substructures",
            ref: oechem.OESubSearch | oechem.OEMCSSearch | oechem.OEMolBase | None = None,
            alignment_opts: oedepict.OEAlignmentOptions | None = None,
            prepare_opts: oedepict.OEPrepareDepictionOptions | None = None,
            inplace: bool = False
    ) -> pd.DataFrame:
        """
        Highlight molecules based on the value of another column. The column produced is a DisplayArray column, so
        the results are not suitable for other molecular calculations.

        The other column can contain:
            - String with one or more SMARTS patterns delimited by pipe ("|"), tab, or newline characters
            - oechem.OESubSearch object
            - Iterable of such strings and/or oechem.OESubSearch objects

        Invalid patterns and missing pattern values are skipped; unsupported pattern types are skipped with a
        warning. Rows without a molecule get a missing value in the highlighted column.

        :param molecule_column: Name of the molecule column
        :param pattern_column: Name of the pattern column
        :param highlighted_column: Optional name of the column with highlighted structures
        :param ref: Optional reference for aligning depictions (NOTE: currently not used by this method)
        :param alignment_opts: Optional depiction alignment options (NOTE: currently not used by this method)
        :param prepare_opts: Optional depiction preparation options (NOTE: currently not used by this method)
        :param inplace: Modify the DataFrame in place
        :return: Modified DataFrame
        :raises KeyError: If the molecule or pattern column is not in the DataFrame
        :raises TypeError: If the molecule column does not have a MoleculeDtype
        """
        # Object we are operating on
        df = self._obj if inplace else self._obj.copy()

        # Column labels are not guaranteed to be strings, so convert them before joining
        if molecule_column not in df.columns:
            raise KeyError(f'{molecule_column} not found in DataFrame columns: ({", ".join(map(str, df.columns))})')

        if not isinstance(df[molecule_column].dtype, oepd.MoleculeDtype):
            raise TypeError(
                f"highlight_using_column only works on molecule columns (oepandas.MoleculeDtype). If {molecule_column}"
                " has molecules, use pd.Series.as_molecule to convert to a molecule column first."
            )

        if pattern_column not in df.columns:
            raise KeyError(f'{pattern_column} not found in DataFrame columns: ({", ".join(map(str, df.columns))})')

        # Get the rendering context for creating the displays
        # TODO: Maybe do this smarter so that you know if the context is column-level, which means you could copy that
        #       context into the new DisplayArray that you'll create below? Or even link the contexts?
        # Direct assignment to help IDE understand this is a MoleculeArray
        arr = df[molecule_column].array
        assert isinstance(arr, oepd.MoleculeArray)
        ctx = get_series_context(arr.metadata)

        # Create the display objects (one entry per row, in row order)
        displays = []

        for mol, patterns in zip(df[molecule_column], df[pattern_column]):

            if not isinstance(mol, oechem.OEMolBase):
                displays.append(None)
                continue

            # Create the display
            disp = oemol_to_disp(mol, ctx=ctx)

            # Apply substructure highlights
            highlight = oedepict.OEHighlightOverlayByBallAndStick(oechem.OEGetLightColors())

            for ss in self._parse_patterns(patterns):
                oedepict.OEAddHighlightOverlay(disp, highlight, ss.Match(mol, True))

            displays.append(disp)

        # Reuse the DataFrame's own index so the new column lines up row-by-row
        df[highlighted_column] = pd.Series(displays, index=df.index, dtype=oepd.DisplayDtype())
        return df
669
+
670
+
671
class ColorBondByOverlapScore(oegrapheme.OEBondGlyphBase):
    """
    Bond glyph that draws a colored line underneath each bond, where the color is looked up in a color gradient
    using the value stored on the bond under a data tag:
    https://docs.eyesopen.com/toolkits/cookbook/python/depiction/simcalc.html
    """
    def __init__(self, cg, tag):
        oegrapheme.OEBondGlyphBase.__init__(self)
        # Color gradient that maps the stored bond value to a color
        self.colorg = cg
        # Data tag under which the bond value is stored
        self.tag = tag

    # noinspection PyPep8Naming
    def RenderGlyph(self, disp, bond):
        """
        Draw the glyph for one bond
        :param disp: Molecule display being drawn
        :param bond: Bond to draw the glyph for
        :return: True if a line was drawn, False if the bond is not displayed/visible or has no value for the tag
        """
        bond_display = disp.GetBondDisplay(bond)
        if bond_display is None:
            return False
        if not bond_display.IsVisible():
            return False

        if not bond.HasData(self.tag):
            return False

        # Pen: line width is a third of the display scale, color comes from the gradient
        stroke_width = disp.GetScale() / 3.0
        stroke_color = self.colorg.GetColorAt(bond.GetData(self.tag))
        pen = oedepict.OEPen(stroke_color, stroke_color, oedepict.OEFill_Off, stroke_width)

        begin_display = disp.GetAtomDisplay(bond.GetBgn())
        end_display = disp.GetAtomDisplay(bond.GetEnd())

        # Draw on the layer below the molecule
        below = disp.GetLayer(oedepict.OELayerPosition_Below)
        below.DrawLine(begin_display.GetCoords(), end_display.GetCoords(), pen)

        return True

    # noinspection PyPep8Naming
    def ColorBondByOverlapScore(self):
        """
        Create a disowned copy of this glyph with the same color gradient and tag
        :return: Disowned copy of the glyph
        """
        duplicate = ColorBondByOverlapScore(self.colorg, self.tag)
        return duplicate.__disown__()
706
+
707
+
708
@register_dataframe_accessor("fingerprint_similarity")
class FingerprintSimilaritySeriesAccessor:
    """
    DataFrame accessor (``df.fingerprint_similarity(...)``) that adds a Tanimoto column plus two display columns
    (reference and target) in which bonds are colored by fingerprint overlap with a reference molecule.
    """
    def __init__(self, pandas_obj: pd.DataFrame):
        self._obj = pandas_obj
        # Data tag under which the per-bond overlap count is stored
        self._tag = oechem.OEGetTag("fingerprint_overlap")

    def __call__(
            self,
            molecule_column: str,
            ref: oechem.OEMolBase | None = None,
            *,
            tanimoto_column="fingerprint_tanimoto",
            reference_similarity_column="reference_similarity",
            target_similarity_column="target_similarity",
            fptype: str = "tree",
            num_bits: int = 4096,
            min_distance: int = 0,
            max_distance: int = 4,
            atom_type: str | int = oegraphsim.OEFPAtomType_DefaultTreeAtom,
            bond_type: str | int = oegraphsim.OEFPBondType_DefaultTreeBond,
            inplace: bool = False
    ) -> pd.DataFrame:
        """
        Color molecules by fingerprint similarity

        Rows whose molecule is missing/invalid, or whose fingerprint is invalid, get NaN in the Tanimoto column
        and a missing value in both display columns. If no usable reference molecule or reference fingerprint
        is available, a warning is logged and the DataFrame is returned without the new columns.

        :param molecule_column: Name of the molecule column
        :param ref: Reference molecule (default: first valid molecule in the molecule column)
        :param tanimoto_column: Name of the column that receives the Tanimoto similarity to the reference
        :param reference_similarity_column: Name of the column that receives the colored reference depictions
        :param target_similarity_column: Name of the column that receives the colored target depictions
        :param fptype: Fingerprint type
        :param num_bits: Number of bits in the fingerprint
        :param min_distance: Minimum distance/radius for path/circular/tree
        :param max_distance: Maximum distance/radius for path/circular/tree
        :param atom_type: Atom type string delimited by "|" OR int bitmask from the oegraphsim.OEFPAtomType_ namespace
        :param bond_type: Bond type string delimited by "|" OR int bitmask from the oegraphsim.OEFPBondType_ namespace
        :param inplace: Modify the DataFrame in place
        :return: Modified DataFrame
        :raises KeyError: If the molecule column is not in the DataFrame
        :raises TypeError: If the molecule column does not have a MoleculeDtype
        """
        # Preprocess
        df = self._obj if inplace else self._obj.copy()

        if molecule_column not in df.columns:
            raise KeyError(f'Molecule column not found in DataFrame: {molecule_column}')

        if not isinstance(df[molecule_column].dtype, oepd.MoleculeDtype):
            raise TypeError("Column {} does not have dtype oepd.MoleculeDtype ({})".format(
                molecule_column, str(df[molecule_column].dtype)))

        # Get the context
        # Direct assignment to help IDE understand this is a MoleculeArray
        arr = self._obj[molecule_column].array
        assert isinstance(arr, oepd.MoleculeArray)
        ctx = get_series_context(arr.metadata)

        # If we're using the first molecule as our reference (missing values are skipped)
        if ref is None:
            ref = next((mol for mol in arr if mol is not None and mol.IsValid()), None)

            if ref is None:
                log.warning(f'No valid reference molecules to use for alignment in column {molecule_column}')
                return df

        # Check reference molecule
        if not ref.IsValid():
            log.warning("Reference molecule is not valid")
            return df

        # Fingerprint maker
        make_fp = fingerprint_maker(
            fptype=fptype,
            num_bits=num_bits,
            min_distance=min_distance,
            max_distance=max_distance,
            atom_type=atom_type,
            bond_type=bond_type
        )

        # Make the reference fingerprint
        ref_fp = make_fp(ref)

        if not ref_fp.IsValid():
            log.warning("Fingerprint from reference molecule is invalid")
            return df

        # Per-row results: exactly one entry is appended to each of these three lists for every row
        ref_displays = []
        targ_displays = []
        tanimotos = []

        # FIXME: See note below regarding the fact we have to cache the reference and target molecule copies
        ref_molecules = []
        targ_molecules = []

        missing = float("nan")

        for mol in df[molecule_column]:  # type: oechem.OEMol

            # Molecule was missing or invalid
            if mol is None or not mol.IsValid():
                tanimotos.append(missing)
                ref_displays.append(None)
                targ_displays.append(None)
                continue

            # Copy the molecules, because we're modifying them
            targ_mol = oechem.OEMol(mol)
            ref_mol = oechem.OEMol(ref)

            # FIXME: See note below regarding the fact we have to cache the reference and target molecule copies
            targ_molecules.append(targ_mol)
            ref_molecules.append(ref_mol)

            # Create the fingerprint
            targ_fp = make_fp(targ_mol)

            # Fingerprint was invalid
            if not targ_fp.IsValid():
                tanimotos.append(missing)
                ref_displays.append(None)
                targ_displays.append(None)
                continue

            # Add the tanimoto
            tanimotos.append(oegraphsim.OETanimoto(ref_fp, targ_fp))

            # Per-bond overlap counters, indexed by bond index
            targ_bonds = oechem.OEUIntArray(targ_mol.GetMaxBondIdx())
            ref_bonds = oechem.OEUIntArray(ref_mol.GetMaxBondIdx())

            # Count how often each bond takes part in an overlapping fingerprint pattern
            overlaps = oegraphsim.OEGetFPOverlap(ref_mol, targ_mol, ref_fp.GetFPTypeBase())

            for match in overlaps:
                for bond in match.GetPatternBonds():
                    ref_bonds[bond.GetIdx()] += 1
                for bond in match.GetTargetBonds():
                    targ_bonds[bond.GetIdx()] += 1

            for bond in targ_mol.GetBonds():
                bond.SetData(self._tag, targ_bonds[bond.GetIdx()])

            for bond in ref_mol.GetBonds():
                bond.SetData(self._tag, ref_bonds[bond.GetIdx()])

            # ``default=0`` keeps this working for molecules without bonds (empty counter arrays)
            # noinspection PyTypeChecker
            maxvalue = max(0, max(targ_bonds, default=0), max(ref_bonds, default=0))

            # Create the color gradient
            colorg = oechem.OELinearColorGradient()
            colorg.AddStop(oechem.OEColorStop(0.0, oechem.OEPinkTint))
            colorg.AddStop(oechem.OEColorStop(1.0, oechem.OEYellow))
            colorg.AddStop(oechem.OEColorStop(maxvalue, oechem.OEDarkGreen))

            # Function that will color the bonds
            bondglyph = ColorBondByOverlapScore(colorg, self._tag)

            # Align the molecules (the overlaps are requested again because the first result was iterated above)
            overlaps = oegraphsim.OEGetFPOverlap(ref_mol, targ_mol, ref_fp.GetFPTypeBase())
            oedepict.OEPrepareMultiAlignedDepiction(targ_mol, ref_mol, overlaps)

            # Create the displays
            ref_disp = oemol_to_disp(ref_mol, ctx=ctx)
            targ_disp = oemol_to_disp(targ_mol, ctx=ctx)

            # Color the displays
            oegrapheme.OEAddGlyph(ref_disp, bondglyph, oechem.IsTrueBond())
            oegrapheme.OEAddGlyph(targ_disp, bondglyph, oechem.IsTrueBond())

            ref_displays.append(ref_disp)
            targ_displays.append(targ_disp)

        # Add the columns (the DataFrame's own index is reused so every new column lines up row-by-row)
        df[tanimoto_column] = pd.Series(
            tanimotos,
            index=df.index,
            dtype=float
        )

        # FIXME: Submitted to OpenEye as Case #00037423
        # We need to keep the copies of the molecules that we made above, or they will be garbage collected
        # and the OE2DMolDisplay objects will segfault. We'll keep those in the metadata now for the arrays.
        ref_arr = oepd.DisplayArray(ref_displays, metadata={"molecules": ref_molecules})
        targ_arr = oepd.DisplayArray(targ_displays, metadata={"molecules": targ_molecules})

        df[reference_similarity_column] = pd.Series(
            ref_arr,
            index=df.index,
            dtype=oepd.DisplayDtype()
        )

        df[target_similarity_column] = pd.Series(
            targ_arr,
            index=df.index,
            dtype=oepd.DisplayDtype()
        )

        return df