cnotebook 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1156 @@
1
+ import re
2
+ import logging
3
+ import typing
4
+ import pandas as pd
5
+ import oepandas as oepd
6
+ from typing import Iterable, Any, Literal, Hashable
7
+ from openeye import oechem, oedepict, oegraphsim, oegrapheme
8
+ from copy import copy as shallow_copy
9
+ from .context import pass_cnotebook_context, get_series_context
10
+ from .helpers import escape_brackets, create_structure_highlighter
11
+ from .align import create_aligner, fingerprint_maker
12
+ from .render import (
13
+ CNotebookContext, # noqa
14
+ oemol_to_disp,
15
+ oedisp_to_html,
16
+ render_invalid_molecule,
17
+ render_empty_molecule
18
+ )
19
+
20
+ # Only register iPython formatters if that is present
21
+ try:
22
+ # noinspection PyProtectedMember,PyPackageRequirements
23
+ from IPython import get_ipython
24
+ ipython_present = True
25
+ except ModuleNotFoundError:
26
+ ipython_present = False
27
+
28
+ if typing.TYPE_CHECKING:
29
+ from .context import CNotebookContext
30
+
31
+
32
# Delimiter used when one string holds several SMARTS patterns: a run of one or more
# "|", carriage-return, newline, or tab characters, together with any whitespace around it.
# Commas and plain spaces do not act as delimiters on their own.
SMARTS_DELIMITER_RE = re.compile(r'\s*[|\r\n\t]+\s*')

# Logger shared by this module, registered under the "cnotebook" name
log = logging.getLogger("cnotebook")
35
+
36
+
37
def create_mol_formatter(*, ctx: CNotebookContext) -> typing.Callable[[oechem.OEMolBase], str]:
    """
    Build a formatter that converts a molecule to HTML using a fixed rendering context
    :param ctx: CNotebook rendering context captured by the returned function
    :return: Function that renders molecules to HTML
    """
    def _oemol_to_html(mol: oechem.OEMolBase):
        # Values that are not molecules are shown using their plain string form
        if not isinstance(mol, oechem.OEMolBase):
            return str(mol)

        # Valid molecule: build the display, let the context callbacks decorate it, then render
        if mol.IsValid():
            disp = oemol_to_disp(mol, ctx=ctx)

            if ctx.callbacks is not None:
                for callback in ctx.callbacks:
                    callback(disp)

            return oedisp_to_html(disp)

        # Not valid and without atoms: empty-molecule placeholder
        if mol.NumAtoms() == 0:
            return render_empty_molecule(ctx=ctx)

        # Not valid but has atoms: invalid-molecule placeholder
        return render_invalid_molecule(ctx=ctx)

    return _oemol_to_html
70
+
71
+
72
@pass_cnotebook_context
def create_disp_formatter(
    *,
    callbacks: list[typing.Callable[[oedepict.OE2DMolDisplay], None]] | None = None,
    ctx: CNotebookContext
) -> typing.Callable[[oedepict.OE2DMolDisplay], str]:
    """
    Build a formatter that converts an OE2DMolDisplay to HTML
    :param ctx: Render context
    :param callbacks: List of callbacks to modify the rendering of the display
    :return: Function that renders display objects to HTML
    """

    def _oedisp_to_html(disp: oedepict.OE2DMolDisplay) -> str:
        renderable = isinstance(disp, oedepict.OE2DMolDisplay) and disp.IsValid()

        # Anything that is not a valid display is shown using its plain string form
        if not renderable:
            return str(disp)

        # The callbacks operate on a copy so that the caller's display is left unmodified
        # TODO: Update with ctx
        working_copy = oedepict.OE2DMolDisplay(disp)

        if callbacks is not None:
            for callback in callbacks:
                callback(working_copy)

        return oedisp_to_html(working_copy, ctx=ctx)

    return _oedisp_to_html
101
+
102
+
103
def escape_formatter(obj: Any) -> str:
    """
    Convert a value to its string form and pass that through escape_brackets
    :param obj: Value to format
    :return: Result of escape_brackets on the string form of the value
    """
    text = str(obj)
    return escape_brackets(text)
105
+
106
+
107
def render_dataframe(
    df: pd.DataFrame,
    formatters: dict | None = None,
    col_space: dict[str, float | int] | None = None,
    ctx: CNotebookContext | None = None,
    **kwargs
) -> str:
    """
    Render a DataFrame with molecule and/or display columns to HTML.

    The DataFrame, its molecules, and the ``formatters`` / ``col_space`` dictionaries passed in are not
    modified; rendering works on copies.

    :param df: DataFrame to render
    :param formatters: Custom formatters keyed by column. Entries for molecule and display columns are
        replaced by the image formatters. Entries for any other column are applied first and their result
        is then passed through escape_formatter.
    :param col_space: Custom column spacing keyed by column. Entries for molecule and display columns are
        replaced by values derived from the image widths.
    :param ctx: Local rendering context (optional). When omitted, the context of each column is obtained
        from its array metadata with get_series_context.
    :param kwargs: Additional keyword arguments for DataFrame.to_html
    :return: HTML of rendered DataFrame
    """
    # Private copies: both dictionaries are written to below and must not leak back to the caller
    formatters = dict(formatters) if formatters else {}
    col_space = dict(col_space) if col_space else {}

    def _escaped(custom):
        # Run the user formatter, then escape its output like every other plain column
        return lambda value: escape_formatter(custom(value))

    # Render columns with MoleculeDtype
    molecule_columns = set()

    # Capture metadata from ORIGINAL DataFrame BEFORE copying
    # (df.copy() may not preserve array metadata)
    original_metadata_by_col = {}

    for col in df.columns:
        if isinstance(df.dtypes[col], oepd.MoleculeDtype):
            molecule_columns.add(col)
            arr = df[col].array
            if hasattr(arr, 'metadata') and arr.metadata:
                original_metadata_by_col[col] = arr.metadata.copy()

    # We need to copy both the DataFrame and the molecules, because we modify them in-place to render them
    df = df.copy()

    for col in molecule_columns:
        arr = df[col].array
        assert isinstance(arr, oepd.MoleculeArray)
        # Use preserved metadata from original DataFrame (not the copy which may have lost it)
        new_arr = arr.deepcopy()
        new_arr.metadata.update(original_metadata_by_col.get(col, {}))
        df[col] = pd.Series(new_arr, index=df[col].index, dtype=oepd.MoleculeDtype())

    # ---------------------------------------------------
    # Molecule columns
    # ---------------------------------------------------

    if molecule_columns:
        # Column labels are not guaranteed to be strings, so convert before joining
        log.debug('Detected molecule columns: %s', ", ".join(map(str, molecule_columns)))

    for col in molecule_columns:

        if col in formatters:
            log.warning(f'Overwriting existing formatter for {col} with a molecule formatter')

        arr = df[col].array
        assert isinstance(arr, oepd.MoleculeArray)

        # An explicit context wins over the one stored with the column
        series_ctx = ctx if ctx is not None else get_series_context(arr.metadata)

        formatters[col] = create_mol_formatter(ctx=series_ctx)

        if col in col_space:
            log.warning(f'Column spacing for {col} already defined by overwriting with molecule image width')

        col_space[col] = float(series_ctx.width)

    # ---------------------------------------------------
    # Display columns
    # ---------------------------------------------------

    display_columns = {col for col in df.columns if isinstance(df.dtypes[col], oepd.DisplayDtype)}

    if display_columns:
        log.debug('Detected display columns: %s', ", ".join(map(str, display_columns)))

    for col in display_columns:

        arr = df[col].array
        assert isinstance(arr, oepd.DisplayArray)

        series_ctx = ctx if ctx is not None else get_series_context(arr.metadata)

        formatters[col] = create_disp_formatter(ctx=series_ctx)

        # Widest display in the column; 0 when the column is empty or holds no display objects at all
        widths = [disp.GetWidth() for disp in arr if isinstance(disp, oedepict.OE2DMolDisplay)]
        col_space[col] = max(0, max(widths, default=0))

    # ---------------------------------------------------
    # All other columns
    # ---------------------------------------------------

    for col in df.columns:
        if col in display_columns or col in molecule_columns:
            continue

        custom = formatters.get(col)
        formatters[col] = escape_formatter if custom is None else _escaped(custom)

    # escape=False because molecule/display cells are raw HTML; plain cells were escaped above
    return df.to_html(escape=False, formatters=formatters, col_space=col_space, **kwargs)
225
+
226
+
227
+ ########################################################################################################################
228
+ # Register Pandas formatters
229
+ ########################################################################################################################
230
+
231
if ipython_present:

    def register_pandas_formatters():
        """
        Register render_dataframe as the text/html formatter for Pandas DataFrames in the running
        iPython instance. This hooks into how the notebook displays DataFrames rather than replacing
        DataFrame.to_html(), so it still works with other custom changes to to_html().

        Note: Calls to this function are idempotent.
        """
        ipython_instance = get_ipython()

        if ipython_instance is None:
            log.debug("[cnotebook] iPython installed but not in use - cannot register pandas extension")
            return

        html_formatter = ipython_instance.display_formatter.formatters['text/html']

        # lookup() expects an instance; lookup_by_type() is the query that takes a class
        try:
            current = html_formatter.lookup_by_type(pd.DataFrame)
        except KeyError:
            # Nothing registered for DataFrame yet
            current = None

        if current is not render_dataframe:
            html_formatter.for_type(pd.DataFrame, render_dataframe)

else:

    # iPython is not present, so we do not register a Pandas formatter
    def register_pandas_formatters():
        """
        No-op used when iPython is not installed.
        """
        pass
258
+
259
+
260
+ ########################################################################################################################
261
+ # CNotebook Series accessor extensions for OEPandas .chem accessor
262
+ ########################################################################################################################
263
+
264
def _series_highlight(
    self,
    pattern: Iterable[str] | str | oechem.OESubSearch | Iterable[oechem.OESubSearch],
    *,
    color: oechem.OEColor | oechem.OEColorIter | None = None,
    style: int | Literal["overlay_default", "overlay_ball_and_stick"] = "overlay_default",
    ref: oechem.OESubSearch | oechem.OEMCSSearch | oechem.OEQMol | Literal["first"] | oechem.OEMolBase | None = None,
    method: Literal["ss", "substructure", "mcss", "fp", "fingerprint"] | None = None
) -> None:
    """
    Highlight chemical features in a structure.

    The pattern argument can be:
        - SMARTS pattern
        - oechem.OESubSearch, oechem.OEMCSSearch, or oechem.OEQMol object
        - Iterable of any of the above

    All patterns are validated before any highlight is registered, so a TypeError leaves the series'
    rendering context unchanged.

    :param pattern: Pattern(s) to highlight in the molecule.
    :param color: Highlight color(s). Can be a single oechem.OEColor or an oechem.OEColorIter
        (e.g., oechem.OEGetLightColors()). Passed through unchanged to create_structure_highlighter.
    :param style: Highlight style. Can be an int (OEHighlightStyle constant) or a string
        ("overlay_default", "overlay_ball_and_stick"). Defaults to "overlay_default".
    :param ref: Optional reference for alignment. When given, align_depictions is called on the series
        after the highlights are registered.
    :param method: Optional alignment method. Only used when ref is given.
    :raises TypeError: If the series is not a molecule column, or a pattern has an unsupported type.
    """
    if not isinstance(self._obj.dtype, oepd.MoleculeDtype):
        raise TypeError(
            "highlight only works on molecule columns (oepandas.MoleculeDtype). If this column has "
            "molecules, use series.chem.as_molecule() to convert to a molecule column first."
        )

    query_types = (str, oechem.OESubSearch, oechem.OEMCSSearch, oechem.OEQMol)

    # ********************************************************************************
    # Collect and validate the queries (nothing is modified until all of them are known to be usable)
    # ********************************************************************************

    # Case: Pattern is a single query (checked first because str is itself iterable)
    if isinstance(pattern, query_types):
        queries = [pattern]

    # Case: Pattern is an iterable of queries
    elif isinstance(pattern, Iterable):
        queries = list(pattern)
        for element in queries:
            if not isinstance(element, query_types):
                raise TypeError(f'Do not know how to add molecule highlight for type {type(element).__name__}')

    # Case: Pattern is an unknown type
    else:
        raise TypeError(f'Do not know how to add molecule highlight for type {type(pattern).__name__}')

    # ********************************************************************************
    # Highlighting
    # ********************************************************************************

    arr = self._obj.array
    assert isinstance(arr, oepd.MoleculeArray)

    # Get / create a series context and save it (because we are modifying it locally)
    ctx = get_series_context(arr.metadata, save=True)

    for query in queries:
        ctx.add_callback(
            create_structure_highlighter(
                query=query,
                color=color,
                style=style
            )
        )

    # ********************************************************************************
    # Alignment
    # ********************************************************************************

    if ref is not None:
        self._obj.chem.align_depictions(ref=ref, method=method)
344
+
345
+
346
def _series_recalculate_depiction_coordinates(
    self,
    *,
    clear_coords: bool = True,
    add_depiction_hydrogens: bool = True,
    perceive_bond_stereo: bool = True,
    suppress_explicit_hydrogens: bool = True,
    orientation: int = oedepict.OEDepictOrientation_Default
) -> None:
    """
    Run oedepict.OEPrepareDepiction on every molecule of a molecule series, in place.

    See the following link for more information:
    https://docs.eyesopen.com/toolkits/python/depicttk/OEDepictClasses/OEPrepareDepictionOptions.html

    :param clear_coords: Clear existing 2D coordinates
    :param add_depiction_hydrogens: Add explicit depiction hydrogens for faithful stereo depiction, etc.
    :param perceive_bond_stereo: Perceive wedge/hash bond stereo
    :param suppress_explicit_hydrogens: Suppress explicit hydrogens
    :param orientation: Preferred 2D orientation
    :raises TypeError: If the series is not a molecule column
    """
    series = self._obj

    if not isinstance(series.dtype, oepd.MoleculeDtype):
        raise TypeError(
            "recalculate_depiction_coordinates only works on molecule columns (oepandas.MoleculeDtype). If this "
            "column has molecules, use series.chem.as_molecule() to convert to a molecule column first."
        )

    # One options object is shared by every molecule in the series
    opts = oedepict.OEPrepareDepictionOptions()
    opts.SetClearCoords(clear_coords)
    opts.SetAddDepictionHydrogens(add_depiction_hydrogens)
    opts.SetPerceiveBondStereo(perceive_bond_stereo)
    opts.SetSuppressHydrogens(suppress_explicit_hydrogens)
    opts.SetDepictOrientation(orientation)

    # Entries that are not molecules (e.g. missing values) are skipped
    molecules = (entry for entry in series.array if isinstance(entry, oechem.OEMolBase))
    for mol in molecules:
        oedepict.OEPrepareDepiction(mol, opts)
384
+
385
+
386
+ def _series_reset_depictions(self) -> None:
387
+ """
388
+ Reset depiction callbacks for a molecule series
389
+ """
390
+ # Check if array has metadata attribute (should be true for oepandas arrays)
391
+ if hasattr(self._obj.array, "metadata"):
392
+ arr = self._obj.array
393
+ assert isinstance(arr, oepd.MoleculeArray)
394
+ _ = arr.metadata.pop("cnotebook", None)
395
+
396
+
397
+ def _series_clear_formatting_rules(self) -> None:
398
+ """
399
+ Clear all formatting rule callbacks from a molecule series.
400
+
401
+ This removes any callbacks applied to the molecule prior to rendering,
402
+ such as highlighting. Unlike reset_depictions which removes the entire
403
+ rendering context, this method only clears the callbacks while preserving
404
+ other context settings like image dimensions and styling.
405
+ """
406
+ if hasattr(self._obj.array, "metadata"):
407
+ arr = self._obj.array
408
+ assert isinstance(arr, oepd.MoleculeArray)
409
+ ctx = arr.metadata.get("cnotebook", None)
410
+ if ctx is not None and isinstance(ctx, CNotebookContext):
411
+ ctx.reset_callbacks()
412
+
413
+
414
def _series_align_depictions(
    self,
    ref: oechem.OESubSearch | oechem.OEMCSSearch | oechem.OEMolBase | oechem.OEQMol | Literal["first"],
    method: Literal["substructure", "ss", "mcss", "fp", "fingerprint"] | None = None,
    **kwargs
) -> None:
    """
    Align the 2D coordinates of the molecules in a molecule series, in place
    :param ref: Alignment reference. The string "first" selects a copy of the first valid molecule
        in the series
    :param method: Alignment method
    :param kwargs: Accepted for call compatibility; not used in this function body
    :return: None
    :raises TypeError: If the series is not a molecule column
    """
    if not isinstance(self._obj.dtype, oepd.MoleculeDtype):
        raise TypeError(
            "align_depictions only works on molecule columns (oepandas.MoleculeDtype). If this "
            "column has molecules, use series.chem.as_molecule() to convert to a molecule column first."
        )

    arr = self._obj.array
    assert isinstance(arr, oepd.MoleculeArray)

    # Resolve the "first" shortcut to a concrete reference molecule
    if isinstance(ref, str) and ref == "first":
        first_valid = next((entry for entry in arr if entry is not None and entry.IsValid()), None)

        if first_valid is None:
            log.warning("No valid molecule found in series for depiction alignment")
            return

        ref = first_valid.CreateCopy()

    # Suppress alignment warnings (there are lots of needless warnings)
    previous_level = oechem.OEThrow.GetLevel()
    oechem.OEThrow.SetLevel(oechem.OEErrorLevel_Error)

    # noinspection PyBroadException
    try:
        aligner = create_aligner(ref=ref, method=method)

        for mol in arr:
            aligner(mol)

    except Exception as ex:
        # Best effort: a failure is logged at debug level and ends the alignment pass
        log.debug("Error aligning molecules: %s", ex)

    finally:
        # Always put the OEThrow level back, whether or not alignment succeeded
        oechem.OEThrow.SetLevel(previous_level)
464
+
465
+
466
+ ########################################################################################################################
467
+ # CNotebook DataFrame accessor extensions for OEPandas .chem accessor
468
+ ########################################################################################################################
469
+
470
def _dataframe_recalculate_depiction_coordinates(
    self,
    *,
    molecule_columns: str | Iterable[str] | None = None,
    clear_coords: bool = True,
    add_depction_hydrogens: bool = True,
    perceive_bond_stereo: bool = True,
    suppress_explicit_hydrogens: bool = True,
    orientation: int = oedepict.OEDepictOrientation_Default,
    add_depiction_hydrogens: bool | None = None
) -> None:
    """
    Recalculate the depictions for a one or more molecule series in a DataFrame. If molecule_columns is None,
    which is the default, then all molecule columns will have their depictions recalculated.

    Requested columns that do not exist or are not molecule columns are skipped with a warning.

    See the following link for more information:
    https://docs.eyesopen.com/toolkits/python/depicttk/OEDepictClasses/OEPrepareDepictionOptions.html

    :param molecule_columns: Optional molecule column(s) to have depictions recalculated
    :param clear_coords: Clear existing 2D coordinates
    :param add_depction_hydrogens: Add explicit depiction hydrogens for faithful stereo depiction, etc.
        (historical spelling, kept so existing callers continue to work)
    :param perceive_bond_stereo: Perceive wedge/hash bond stereo
    :param suppress_explicit_hydrogens: Suppress explicit hydrogens
    :param orientation: Preferred 2D orientation
    :param add_depiction_hydrogens: Correctly spelled alias of add_depction_hydrogens; takes precedence
        when it is not None
    """
    df = self._obj

    # The correctly spelled alias wins when the caller supplied it
    if add_depiction_hydrogens is None:
        add_depiction_hydrogens = add_depction_hydrogens

    if molecule_columns is None:
        targets = [col for col in df.columns if isinstance(df.dtypes[col], oepd.MoleculeDtype)]

    elif isinstance(molecule_columns, str):
        targets = [molecule_columns]

    else:
        # De-duplicate while keeping the order given by the caller
        targets = list(dict.fromkeys(molecule_columns))

    # Recalculate the column depictions
    for col in targets:

        if col not in df.columns:
            # Column labels are not guaranteed to be strings, so convert before joining
            log.warning(f'{col} not found in DataFrame columns: ({", ".join(map(str, df.columns))})')
            continue

        if not isinstance(df.dtypes[col], oepd.MoleculeDtype):
            log.warning(f'Column {col} does not have a MoleculeDtype')
            continue

        # The series-level function spells this keyword add_depiction_hydrogens
        df[col].chem.recalculate_depiction_coordinates(
            clear_coords=clear_coords,
            add_depiction_hydrogens=add_depiction_hydrogens,
            perceive_bond_stereo=perceive_bond_stereo,
            suppress_explicit_hydrogens=suppress_explicit_hydrogens,
            orientation=orientation
        )
526
+
527
+
528
+ def _dataframe_reset_depictions(self, *, molecule_columns: str | Iterable[str] | None = None) -> None:
529
+ """
530
+ Reset depiction callbacks for one or more columns
531
+ """
532
+ columns = set()
533
+ if molecule_columns is None:
534
+ columns.update(self._obj.columns)
535
+
536
+ elif isinstance(molecule_columns, str):
537
+ columns.add(molecule_columns)
538
+
539
+ else:
540
+ columns.update(molecule_columns)
541
+
542
+ # Filter invalid and non-molecule columns
543
+ for col in filter(
544
+ lambda c: c in self._obj.columns and isinstance(self._obj[c].dtype, oepd.MoleculeDtype),
545
+ columns
546
+ ):
547
+ self._obj[col].chem.reset_depictions()
548
+
549
+
550
+ def _dataframe_clear_formatting_rules(self, molecule_columns: str | Iterable[str] | None = None) -> None:
551
+ """
552
+ Clear all formatting rule callbacks from one or more molecule columns.
553
+
554
+ This removes any callbacks applied to molecules prior to rendering,
555
+ such as highlighting. Unlike reset_depictions which removes the entire
556
+ rendering context, this method only clears the callbacks while preserving
557
+ other context settings like image dimensions and styling.
558
+
559
+ :param molecule_columns: Optional molecule column(s) to clear formatting rules from.
560
+ If None, clears formatting rules from all molecule columns.
561
+
562
+ Example::
563
+
564
+ # Clear formatting rules from all molecule columns
565
+ df.chem.clear_formatting_rules()
566
+
567
+ # Clear formatting rules from a specific column
568
+ df.chem.clear_formatting_rules("smiles")
569
+
570
+ # Clear formatting rules from multiple columns
571
+ df.chem.clear_formatting_rules(["mol1", "mol2"])
572
+ """
573
+ columns = set()
574
+ if molecule_columns is None:
575
+ columns.update(self._obj.columns)
576
+
577
+ elif isinstance(molecule_columns, str):
578
+ columns.add(molecule_columns)
579
+
580
+ else:
581
+ columns.update(molecule_columns)
582
+
583
+ # Filter invalid and non-molecule columns
584
+ for col in filter(
585
+ lambda c: c in self._obj.columns and isinstance(self._obj[c].dtype, oepd.MoleculeDtype),
586
+ columns
587
+ ):
588
+ self._obj[col].chem.clear_formatting_rules()
589
+
590
+
591
def _dataframe_highlight(
    self,
    molecule_column: str,
    pattern: Iterable[str] | str | oechem.OESubSearch | Iterable[oechem.OESubSearch],
    *,
    color: oechem.OEColor | oechem.OEColorIter | None = None,
    style: int | Literal["overlay_default", "overlay_ball_and_stick"] = "overlay_default",
) -> None:
    """
    Highlight chemical features in molecules within a specified column.

    Validates the column and then delegates to the series-level highlight, which defines the
    accepted pattern types.

    :param molecule_column: Name of the molecule column to highlight.
    :param pattern: Pattern(s) to highlight in the molecules.
    :param color: Highlight color(s). Can be a single oechem.OEColor or an oechem.OEColorIter
        (e.g., oechem.OEGetLightColors()).
    :param style: Highlight style. Can be an int (OEHighlightStyle constant) or a string
        ("overlay_default", "overlay_ball_and_stick"). Defaults to "overlay_default".
    :raises ValueError: If molecule_column is not a column of the DataFrame.
    :raises TypeError: If molecule_column is not a molecule column.

    Example::

        # Highlight benzene rings in the 'smiles' column
        df.chem.highlight("smiles", "c1ccccc1")

        # Highlight multiple patterns
        df.chem.highlight("smiles", ["c1ccccc1", "[OH]"])
    """
    df = self._obj

    if molecule_column not in df.columns:
        # Column labels are not guaranteed to be strings, so convert before joining
        available = ", ".join(map(str, df.columns))
        raise ValueError(f'Column {molecule_column} not found in DataFrame columns: ({available})')

    column = df[molecule_column]

    if not isinstance(column.dtype, oepd.MoleculeDtype):
        raise TypeError(
            f"highlight only works on molecule columns (oepandas.MoleculeDtype). Column '{molecule_column}' "
            f"has type {column.dtype}."
        )

    # Delegate to the series-level highlight
    column.chem.highlight(pattern, color=color, style=style)
633
+
634
+
635
+ def _dataframe_copy_molecules(
636
+ self,
637
+ source_column: str,
638
+ dest_column: str,
639
+ ) -> pd.DataFrame:
640
+ """
641
+ Create a deep copy of molecules from one column to a new column.
642
+
643
+ This creates independent copies of all molecules, allowing modifications
644
+ (such as highlighting or alignment) to the new column without affecting
645
+ the original.
646
+
647
+ :param source_column: Name of the source molecule column.
648
+ :param dest_column: Name of the new column to create with copied molecules.
649
+ :returns: DataFrame with the new molecule column added.
650
+
651
+ Example::
652
+
653
+ # Create a copy of molecules for alignment
654
+ df = df.chem.copy_molecules("Original", "Aligned")
655
+ df.chem.highlight("Aligned", "c1ccccc1")
656
+ """
657
+ if source_column not in self._obj.columns:
658
+ raise ValueError(f'Column {source_column} not found in DataFrame columns: ({", ".join(self._obj.columns)})')
659
+
660
+ if not isinstance(self._obj[source_column].dtype, oepd.MoleculeDtype):
661
+ raise TypeError(
662
+ f"copy_molecules only works on molecule columns (oepandas.MoleculeDtype). Column '{source_column}' "
663
+ f"has type {self._obj[source_column].dtype}."
664
+ )
665
+
666
+ # Use the series-level copy_molecules and assign to the new column
667
+ self._obj[dest_column] = self._obj[source_column].chem.copy_molecules()
668
+ return self._obj
669
+
670
+
671
def _smarts_to_subsearches(text: str) -> list:
    """Split a delimited SMARTS string and return a substructure search for every valid pattern."""
    searches = []
    for smarts in SMARTS_DELIMITER_RE.split(text):
        # Leading/trailing/repeated delimiters produce empty tokens, which cannot be patterns
        if not smarts.strip():
            continue
        ss = oechem.OESubSearch(smarts)
        if ss.IsValid():
            searches.append(ss)
    return searches


def _collect_substructure_searches(patterns) -> list:
    """Turn one cell of a pattern column into a list of valid oechem.OESubSearch objects."""
    # A string is checked before the generic iterable case because str is itself iterable
    if isinstance(patterns, str):
        return _smarts_to_subsearches(patterns)

    if isinstance(patterns, oechem.OESubSearch):
        return [patterns] if patterns.IsValid() else []

    if isinstance(patterns, Iterable):
        searches = []
        for p in patterns:
            if isinstance(p, str):
                searches.extend(_smarts_to_subsearches(p))
            elif isinstance(p, oechem.OESubSearch):
                if p.IsValid():
                    searches.append(p)
            else:
                log.warning(f'Do not know how to highlight using: {type(p).__name__}')
        return searches

    log.warning(f'Do not know how to highlight using: {type(patterns).__name__}')
    return []


def _dataframe_highlight_using_column(
    self,
    molecule_column: str,
    pattern_column: str,
    *,
    highlighted_column: str = "highlighted_substructures",
    color: oechem.OEColor | oechem.OEColorIter | None = None,
    style: int | Literal["overlay_default", "overlay_ball_and_stick"] = "overlay_default",
    inplace: bool = False
) -> pd.DataFrame:
    """
    Highlight molecules based on the value of another column. The column produced is a DisplayArray column, so
    the results are not suitable for other molecular calculations.

    The other column can contain:
        - A string with one or more SMARTS patterns separated by "|", newline, carriage return, or tab
        - oechem.OESubSearch object
        - Iterable of such strings and/or oechem.OESubSearch objects

    Invalid patterns are skipped. Values of any other type are skipped with a warning. Rows whose
    molecule cell is not a molecule get a missing value in the highlighted column.

    :param molecule_column: Name of the molecule column.
    :param pattern_column: Name of the pattern column.
    :param highlighted_column: Optional name of the column with highlighted structures.
    :param color: Highlight color(s). Can be a single oechem.OEColor or an oechem.OEColorIter
        (e.g., oechem.OEGetLightColors()). Defaults to oechem.OEGetLightColors().
    :param style: Highlight style. Can be an int (OEHighlightStyle constant) or a string
        ("overlay_default", "overlay_ball_and_stick"). Defaults to "overlay_default".
    :param inplace: Modify the DataFrame in place.
    :returns: Modified DataFrame.
    :raises KeyError: If molecule_column or pattern_column is not a column of the DataFrame.
    :raises TypeError: If molecule_column is not a molecule column.
    """
    source = self._obj

    # Validate before copying, so an invalid request does not pay for a DataFrame copy
    if molecule_column not in source.columns:
        available = ", ".join(map(str, source.columns))
        raise KeyError(f'{molecule_column} not found in DataFrame columns: ({available})')

    if not isinstance(source[molecule_column].dtype, oepd.MoleculeDtype):
        raise TypeError(
            f"highlight_using_column only works on molecule columns (oepandas.MoleculeDtype). If {molecule_column}"
            " has molecules, use df.chem.as_molecule() to convert to a molecule column first."
        )

    if pattern_column not in source.columns:
        available = ", ".join(map(str, source.columns))
        raise KeyError(f'{pattern_column} not found in DataFrame columns: ({available})')

    # Object we are operating on
    df = source if inplace else source.copy()

    # Default color
    if color is None:
        color = oechem.OEGetLightColors()

    # Determine highlighting approach based on style
    use_overlay = isinstance(style, str) and style in ("overlay_default", "overlay_ball_and_stick")

    # Check if color is compatible with overlay
    if use_overlay and isinstance(color, oechem.OEColor):
        log.warning(
            "Overlay coloring is not compatible with a single oechem.OEColor. Falling back to standard highlighting")
        use_overlay = False
        style = oedepict.OEHighlightStyle_BallAndStick

    # Traditional highlighting uses a single color: resolve it once so every row gets the same one
    highlight_color = None
    if not use_overlay:
        if isinstance(color, oechem.OEColor):
            highlight_color = color
        else:
            highlight_color = next(iter(color), oechem.OELightBlue)

    # Get the rendering context for creating the displays
    arr = df[molecule_column].array
    assert isinstance(arr, oepd.MoleculeArray)
    ctx = get_series_context(arr.metadata)

    # One entry per row, in row order
    displays = []

    for _, row in df.iterrows():

        mol = row[molecule_column]
        if not isinstance(mol, oechem.OEMolBase):
            displays.append(None)
            continue

        # Create the display
        disp = oemol_to_disp(mol, ctx=ctx)

        substructures = _collect_substructure_searches(row[pattern_column])

        if use_overlay:
            highlight = oedepict.OEHighlightOverlayByBallAndStick(color)
            for ss in substructures:
                oedepict.OEAddHighlightOverlay(disp, highlight, ss.Match(mol, True))

        else:
            for ss in substructures:
                for match in ss.Match(mol, True):
                    oedepict.OEAddHighlighting(disp, highlight_color, style, match)

        displays.append(disp)

    # Reuse the DataFrame's own index so the assignment needs no label alignment
    df[highlighted_column] = pd.Series(displays, index=df.index, dtype=oepd.DisplayDtype())
    return df
808
+
809
+
810
class ColorBondByOverlapScore(oegrapheme.OEBondGlyphBase):
    """Bond glyph that colors bonds by fingerprint overlap score.

    Used internally by fingerprint similarity visualization to highlight
    bonds based on their contribution to molecular similarity.

    See: https://docs.eyesopen.com/toolkits/cookbook/python/depiction/simcalc.html
    """

    def __init__(self, cg: oechem.OELinearColorGradient, tag: int):
        """Create a bond coloring glyph.

        :param cg: Color gradient to map overlap scores to colors.
        :param tag: OEChem data tag containing overlap scores on bonds.
        """
        oegrapheme.OEBondGlyphBase.__init__(self)
        self.colorg = cg
        self.tag = tag

    # noinspection PyPep8Naming
    def RenderGlyph(self, disp, bond):
        """Draw a colored line under a bond.

        :param disp: Molecule display being drawn on.
        :param bond: Bond to decorate.
        :return: False when the bond has no visible display or carries no data under the tag,
            True once the line has been drawn.
        """
        bdisp = disp.GetBondDisplay(bond)
        if bdisp is None or not bdisp.IsVisible():
            return False

        if not bond.HasData(self.tag):
            return False

        # Line width scales with the depiction; color comes from the value stored on the bond
        linewidth = disp.GetScale() / 3.0
        color = self.colorg.GetColorAt(bond.GetData(self.tag))
        pen = oedepict.OEPen(color, color, oedepict.OEFill_Off, linewidth)

        adispB = disp.GetAtomDisplay(bond.GetBgn())
        adispE = disp.GetAtomDisplay(bond.GetEnd())

        # Drawn on the layer below the molecule so the bond itself stays visible on top
        layer = disp.GetLayer(oedepict.OELayerPosition_Below)
        layer.DrawLine(adispB.GetCoords(), adispE.GetCoords(), pen)

        return True

    # noinspection PyPep8Naming
    def CreateCopy(self):
        """Return a new glyph with the same gradient and tag.

        OpenEye's Python glyph examples implement the copy hook under this name; ownership of the
        new object is released with __disown__().
        """
        return ColorBondByOverlapScore(self.colorg, self.tag).__disown__()

    # noinspection PyPep8Naming
    def ColorBondByOverlapScore(self):
        """Legacy name of the copy method, kept for existing callers."""
        return self.CreateCopy()
854
+
855
+
856
# OEChem data tag under which fingerprint_similarity stores the per-bond
# overlap counts that ColorBondByOverlapScore later reads back
_fingerprint_overlap_tag = oechem.OEGetTag("fingerprint_overlap")
858
+
859
+
860
def _dataframe_fingerprint_similarity(
        self,
        molecule_column: str,
        ref: oechem.OEMolBase | None = None,
        *,
        tanimoto_column="fingerprint_tanimoto",
        reference_similarity_column="reference_similarity",
        target_similarity_column="target_similarity",
        fptype: str = "tree",
        num_bits: int = 4096,
        min_distance: int = 0,
        max_distance: int = 4,
        atom_type: str | int = oegraphsim.OEFPAtomType_DefaultTreeAtom,
        bond_type: str | int = oegraphsim.OEFPBondType_DefaultTreeBond,
        inplace: bool = False
) -> pd.DataFrame:
    """
    Color molecules by fingerprint similarity to a reference molecule

    Three columns are added to the DataFrame: the Tanimoto similarity between each molecule and the reference,
    a depiction of the reference with its bonds colored by fingerprint overlap, and the matching depiction of
    the target molecule. Rows whose molecule is missing or invalid, or whose fingerprint is invalid, receive
    NaN in the Tanimoto column and None in both display columns.

    If no usable reference molecule or reference fingerprint is available, a warning is logged and the
    DataFrame is returned without the new columns.

    :param molecule_column: Name of the molecule column
    :param ref: Reference molecule (defaults to the first valid molecule in the column)
    :param tanimoto_column: Name of the tanimoto column
    :param reference_similarity_column: Name of the reference similarity column
    :param target_similarity_column: Name of the target similarity column
    :param fptype: Fingerprint type
    :param num_bits: Number of bits in the fingerprint
    :param min_distance: Minimum distance/radius for path/circular/tree
    :param max_distance: Maximum distance/radius for path/circular/tree
    :param atom_type: Atom type string delimited by "|" OR int bitmask from the oegraphsim.OEFPAtomType_ namespace
    :param bond_type: Bond type string delimited by "|" OR int bitmask from the oegraphsim.OEFPBondType_ namespace
    :param inplace: Modify the DataFrame in place
    :return: DataFrame with similarity columns
    :raises KeyError: If the molecule column is not in the DataFrame
    :raises TypeError: If the molecule column does not have dtype oepd.MoleculeDtype
    """
    tag = _fingerprint_overlap_tag

    # Preprocess
    df = self._obj if inplace else self._obj.copy()

    if molecule_column not in df.columns:
        raise KeyError(f'Molecule column not found in DataFrame: {molecule_column}')

    if not isinstance(df[molecule_column].dtype, oepd.MoleculeDtype):
        raise TypeError("Column {} does not have dtype oepd.MoleculeDtype ({})".format(
            molecule_column, str(df[molecule_column].dtype)))

    # Get the context
    arr = self._obj[molecule_column].array
    assert isinstance(arr, oepd.MoleculeArray)
    ctx = get_series_context(arr.metadata)

    # If we're using the first molecule as our reference
    if ref is None:
        for mol in arr:  # type: oechem.OEMol
            # Missing entries are skipped, consistent with the per-row check in the main loop below
            if mol is not None and mol.IsValid():
                ref = mol
                break
        else:
            log.warning(f'No valid reference molecules to use for alignment in column {molecule_column}')
            return df

    # Check reference molecule
    if not ref.IsValid():
        log.warning("Reference molecule is not valid")
        return df

    # Fingerprint maker
    make_fp = fingerprint_maker(
        fptype=fptype,
        num_bits=num_bits,
        min_distance=min_distance,
        max_distance=max_distance,
        atom_type=atom_type,
        bond_type=bond_type
    )

    # Make the reference fingerprint
    ref_fp = make_fp(ref)

    if not ref_fp.IsValid():
        log.warning("Fingerprint from reference molecule is invalid")
        return df

    # Create the display objects
    ref_displays = []
    targ_displays = []

    # FIXME: See note below regarding the fact we have to cache the reference and target molecule copies
    ref_molecules = []
    targ_molecules = []

    tanimotos = []
    index = []

    # Placeholder similarity for rows that cannot be processed. Every row must contribute exactly one
    # entry to tanimotos / ref_displays / targ_displays so that they stay the same length as index.
    missing_tanimoto = float("nan")

    for idx, mol in df[molecule_column].items():  # type: Hashable, oechem.OEMol
        index.append(idx)

        # Molecule was missing or invalid
        if mol is None or not mol.IsValid():
            tanimotos.append(missing_tanimoto)
            ref_displays.append(None)
            targ_displays.append(None)
            continue

        # Copy the molecules, because we're modifying them
        targ_mol = oechem.OEMol(mol)
        ref_mol = oechem.OEMol(ref)

        # FIXME: See note below regarding the fact we have to cache the reference and target molecule copies
        targ_molecules.append(targ_mol)
        ref_molecules.append(ref_mol)

        # Create the fingerprint
        targ_fp = make_fp(targ_mol)

        # Fingerprint was invalid
        if not targ_fp.IsValid():
            tanimotos.append(missing_tanimoto)
            ref_displays.append(None)
            targ_displays.append(None)
            continue

        # Add the tanimoto
        tanimotos.append(oegraphsim.OETanimoto(ref_fp, targ_fp))

        # Per-bond overlap counters, indexed by bond index
        targ_bonds = oechem.OEUIntArray(targ_mol.GetMaxBondIdx())
        ref_bonds = oechem.OEUIntArray(ref_mol.GetMaxBondIdx())

        # Count how many overlapping fingerprint fragments each bond takes part in
        overlaps = oegraphsim.OEGetFPOverlap(ref_mol, targ_mol, ref_fp.GetFPTypeBase())

        for match in overlaps:
            for bond in match.GetPatternBonds():
                ref_bonds[bond.GetIdx()] += 1
            for bond in match.GetTargetBonds():
                targ_bonds[bond.GetIdx()] += 1

        for bond in targ_mol.GetBonds():
            bond.SetData(tag, targ_bonds[bond.GetIdx()])

        for bond in ref_mol.GetBonds():
            bond.SetData(tag, ref_bonds[bond.GetIdx()])

        # default=0 keeps max() from raising when a counter array is empty (molecule without bonds)
        # noinspection PyTypeChecker
        maxvalue = max(0, max(targ_bonds, default=0), max(ref_bonds, default=0))

        # Create the color gradient
        colorg = oechem.OELinearColorGradient()
        colorg.AddStop(oechem.OEColorStop(0.0, oechem.OEPinkTint))
        colorg.AddStop(oechem.OEColorStop(1.0, oechem.OEYellow))
        colorg.AddStop(oechem.OEColorStop(maxvalue, oechem.OEDarkGreen))

        # Function that will color the bonds
        bondglyph = ColorBondByOverlapScore(colorg, tag)

        # Align the molecules
        oedepict.OEPrepareDepiction(ref_mol, False)
        oedepict.OEPrepareDepiction(targ_mol, False)

        # The first overlap iterator was consumed by the counting loop above, so fetch a fresh one
        overlaps = oegraphsim.OEGetFPOverlap(ref_mol, targ_mol, ref_fp.GetFPTypeBase())
        oedepict.OEPrepareMultiAlignedDepiction(targ_mol, ref_mol, overlaps)

        # Create the displays
        ref_disp = oemol_to_disp(ref_mol, ctx=ctx)
        targ_disp = oemol_to_disp(targ_mol, ctx=ctx)

        # Color the displays
        oegrapheme.OEAddGlyph(ref_disp, bondglyph, oechem.IsTrueBond())
        oegrapheme.OEAddGlyph(targ_disp, bondglyph, oechem.IsTrueBond())

        ref_displays.append(ref_disp)
        targ_displays.append(targ_disp)

    # Add the columns
    df[tanimoto_column] = pd.Series(
        tanimotos,
        index=index,
        dtype=float
    )

    # FIXME: Submitted to OpenEye as Case #00037423
    #        We need to keep the copies of the molecules that we made above, or they will be garbage collected
    #        and the OE2DMolDisplay objects will segfault. We'll keep those in the metadata now for the arrays.
    ref_arr = oepd.DisplayArray(ref_displays, metadata={"molecules": ref_molecules})
    targ_arr = oepd.DisplayArray(targ_displays, metadata={"molecules": targ_molecules})

    df[reference_similarity_column] = pd.Series(
        ref_arr,
        index=shallow_copy(index),
        dtype=oepd.DisplayDtype()
    )

    df[target_similarity_column] = pd.Series(
        targ_arr,
        index=shallow_copy(index),
        dtype=oepd.DisplayDtype()
    )

    return df
1056
+
1057
+
1058
+ ########################################################################################################################
1059
+ # Monkey-patch CNotebook methods onto OEPandas accessors
1060
+ ########################################################################################################################
1061
+
1062
# OEPandas accessor classes that receive the CNotebook methods below
from oepandas.pandas_extensions import OESeriesAccessor, OEDataFrameAccessor

# Series accessor: depiction handling
OESeriesAccessor.align_depictions = _series_align_depictions
OESeriesAccessor.recalculate_depiction_coordinates = _series_recalculate_depiction_coordinates
OESeriesAccessor.reset_depictions = _series_reset_depictions

# Series accessor: highlighting and formatting rules
OESeriesAccessor.highlight = _series_highlight
OESeriesAccessor.clear_formatting_rules = _series_clear_formatting_rules

# DataFrame accessor: depiction handling and molecule copies
OEDataFrameAccessor.recalculate_depiction_coordinates = _dataframe_recalculate_depiction_coordinates
OEDataFrameAccessor.reset_depictions = _dataframe_reset_depictions
OEDataFrameAccessor.copy_molecules = _dataframe_copy_molecules

# DataFrame accessor: highlighting, formatting rules and similarity
OEDataFrameAccessor.highlight = _dataframe_highlight
OEDataFrameAccessor.highlight_using_column = _dataframe_highlight_using_column
OEDataFrameAccessor.clear_formatting_rules = _dataframe_clear_formatting_rules
OEDataFrameAccessor.fingerprint_similarity = _dataframe_fingerprint_similarity
1080
+
1081
+
1082
+ ########################################################################################################################
1083
+ # MolGrid accessor methods for Series and DataFrame
1084
+ ########################################################################################################################
1085
+
1086
def _series_molgrid(
        self,
        title_field: str = "Title",
        tooltip_fields: list = None,
        **kwargs
):
    """Show the molecules of this Series in an interactive grid.

    When the Series can be traced back to a parent DataFrame, that DataFrame
    is handed to the grid as well; otherwise the grid is built from the
    molecules alone.

    :param title_field: Field for title (molecule property or DataFrame column).
    :param tooltip_fields: Fields for tooltip.
    :param kwargs: Additional arguments passed to MolGrid.
    :returns: MolGrid instance.
    """
    from cnotebook import MolGrid

    source = self._obj
    molecules = list(source)

    # Try to recover the parent DataFrame through pandas' private cache
    # reference. NOTE(review): presumably a (name, weakref) pair - verify
    # against the supported pandas versions, the attribute is not public API.
    parent_frame = None
    # noinspection PyProtectedMember
    cacher = getattr(source, '_cacher', None)
    if cacher is not None:
        try:
            parent_frame = cacher[1]()
        except (TypeError, KeyError):
            # Best effort only: fall back to a grid without a DataFrame
            parent_frame = None

    return MolGrid(
        molecules,
        dataframe=parent_frame,
        mol_col=source.name,
        title_field=title_field,
        tooltip_fields=tooltip_fields,
        **kwargs
    )
1122
+
1123
+
1124
def _dataframe_molgrid(
        self,
        mol_col: str,
        title_field: str = "Title",
        tooltip_fields: list = None,
        **kwargs
):
    """Show the molecules of one DataFrame column in an interactive grid.

    The whole DataFrame is passed along to the grid together with the
    molecules taken from ``mol_col``.

    :param mol_col: Column containing molecules.
    :param title_field: Column for title display.
    :param tooltip_fields: Columns for tooltip.
    :param kwargs: Additional arguments passed to MolGrid.
    :returns: MolGrid instance.
    """
    from cnotebook import MolGrid

    frame = self._obj

    grid_options = dict(
        dataframe=frame,
        mol_col=mol_col,
        title_field=title_field,
        tooltip_fields=tooltip_fields,
    )

    # A duplicate keyword in kwargs raises TypeError in the call below
    return MolGrid(list(frame[mol_col]), **grid_options, **kwargs)
1152
+
1153
+
1154
# Expose the interactive grid as .molgrid() on both OEPandas accessors
OEDataFrameAccessor.molgrid = _dataframe_molgrid
OESeriesAccessor.molgrid = _series_molgrid