cnotebook 1.2.0__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnotebook/__init__.py +365 -67
- cnotebook/align.py +231 -167
- cnotebook/context.py +50 -18
- cnotebook/grid/__init__.py +56 -0
- cnotebook/grid/grid.py +1655 -0
- cnotebook/helpers.py +147 -15
- cnotebook/ipython_ext.py +0 -3
- cnotebook/marimo_ext.py +67 -0
- cnotebook/pandas_ext.py +760 -514
- cnotebook/polars_ext.py +1237 -0
- cnotebook/render.py +0 -195
- cnotebook-2.1.1.dist-info/METADATA +338 -0
- cnotebook-2.1.1.dist-info/RECORD +16 -0
- {cnotebook-1.2.0.dist-info → cnotebook-2.1.1.dist-info}/WHEEL +1 -1
- cnotebook-1.2.0.dist-info/METADATA +0 -280
- cnotebook-1.2.0.dist-info/RECORD +0 -13
- {cnotebook-1.2.0.dist-info → cnotebook-2.1.1.dist-info}/licenses/LICENSE +0 -0
- {cnotebook-1.2.0.dist-info → cnotebook-2.1.1.dist-info}/top_level.txt +0 -0
cnotebook/polars_ext.py
ADDED
|
@@ -0,0 +1,1237 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import typing
|
|
3
|
+
import weakref
|
|
4
|
+
import polars as pl
|
|
5
|
+
import oepolars as oeplr
|
|
6
|
+
from openeye import oechem, oedepict, oegraphsim, oegrapheme
|
|
7
|
+
from .context import pass_cnotebook_context, get_series_context, create_local_context
|
|
8
|
+
from typing import Iterable, Literal
|
|
9
|
+
from .helpers import escape_brackets, create_structure_highlighter
|
|
10
|
+
from .align import fingerprint_maker
|
|
11
|
+
from .render import (
|
|
12
|
+
CNotebookContext, # noqa
|
|
13
|
+
oemol_to_disp,
|
|
14
|
+
oedisp_to_html,
|
|
15
|
+
render_invalid_molecule,
|
|
16
|
+
render_empty_molecule
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# Only register IPython formatters if IPython is installed
|
|
20
|
+
try:
|
|
21
|
+
# noinspection PyProtectedMember,PyPackageRequirements
|
|
22
|
+
from IPython import get_ipython
|
|
23
|
+
ipython_present = True
|
|
24
|
+
except ModuleNotFoundError:
|
|
25
|
+
ipython_present = False
|
|
26
|
+
|
|
27
|
+
if typing.TYPE_CHECKING:
|
|
28
|
+
from .context import CNotebookContext
|
|
29
|
+
|
|
30
|
+
# Logger registered under the name "cnotebook"
log = logging.getLogger("cnotebook")

# Global storage for DataFrame column contexts
# Structure: {id(DataFrame): (weakref(DataFrame), {column_name: CNotebookContext})}
# We store a weak reference to the DataFrame to allow cleanup: once the DataFrame has
# been garbage collected the weak reference returns None and the entry can be removed
_dataframe_column_contexts: dict[int, tuple[weakref.ref, dict[str, CNotebookContext]]] = {}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _cleanup_dead_contexts() -> None:
    """
    Purge registry entries whose DataFrame weak reference no longer resolves.
    """
    # Iterate over a snapshot so the registry is never resized while it is being walked
    for key, (frame_ref, _contexts) in list(_dataframe_column_contexts.items()):
        if frame_ref() is None:
            del _dataframe_column_contexts[key]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def get_dataframe_column_context(df: pl.DataFrame, column: str) -> CNotebookContext | None:
    """
    Look up the rendering context registered for one column of a DataFrame.

    Expired registry entries are purged before the lookup.

    :param df: The DataFrame the context was registered against.
    :param column: The column name.
    :returns: The stored CNotebookContext, or None when nothing is registered for this
        DataFrame / column pair.
    """
    _cleanup_dead_contexts()

    entry = _dataframe_column_contexts.get(id(df))
    if entry is None:
        return None

    frame_ref, contexts_by_column = entry

    # The registry is keyed by id(); only trust the entry when its weak reference
    # still resolves to this exact DataFrame object
    if frame_ref() is not df:
        return None

    return contexts_by_column.get(column)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def set_dataframe_column_context(df: pl.DataFrame, column: str, ctx: CNotebookContext) -> None:
    """
    Register the rendering context to use for one column of a DataFrame.

    Expired registry entries are purged first. The DataFrame itself is only held
    through a weak reference.

    :param df: The DataFrame.
    :param column: The column name.
    :param ctx: The CNotebookContext to store.
    """
    _cleanup_dead_contexts()

    key = id(df)
    entry = _dataframe_column_contexts.get(key)

    # First context for this DataFrame: create its registry entry
    if entry is None:
        entry = (weakref.ref(df), {})
        _dataframe_column_contexts[key] = entry

    _, contexts_by_column = entry
    contexts_by_column[column] = ctx
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def create_mol_formatter(*, ctx: CNotebookContext) -> typing.Callable[[oechem.OEMolBase], str]:
    """
    Closure that creates a function that renders an OEMol to HTML.

    The returned formatter handles four cases:

    - valid molecule: a display is created with ``ctx``, every callback in
      ``ctx.callbacks`` is applied to it, and it is rendered to HTML with ``ctx``
    - molecule that is not valid and has no atoms: rendered as an empty molecule
    - any other molecule that is not valid: rendered as an invalid molecule
    - anything that is not an ``oechem.OEMolBase``: returned as ``str(value)``

    :param ctx: CNotebook rendering context
    :return: Function that renders molecules to HTML
    """
    def _oemol_to_html(mol: oechem.OEMolBase) -> str:
        # Non-molecule cell values fall back to their plain string form
        if not isinstance(mol, oechem.OEMolBase):
            return str(mol)

        # Render valid molecules
        if mol.IsValid():
            disp = oemol_to_disp(mol, ctx=ctx)

            # Apply display callbacks (e.g. highlighting) before rendering
            if ctx.callbacks is not None:
                for callback in ctx.callbacks:
                    callback(disp)

            # Render with the same context that built the display, consistent with
            # create_disp_formatter, rather than leaving the context unspecified
            return oedisp_to_html(disp, ctx=ctx)

        # Empty molecule
        if mol.NumAtoms() == 0:
            return render_empty_molecule(ctx=ctx)

        # Invalid molecule
        return render_invalid_molecule(ctx=ctx)

    return _oemol_to_html
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@pass_cnotebook_context
def create_disp_formatter(
        *,
        callbacks: list[typing.Callable[[oedepict.OE2DMolDisplay], None]] | None = None,
        ctx: CNotebookContext
) -> typing.Callable[[oedepict.OE2DMolDisplay], str]:
    """
    Build a formatter that turns an OE2DMolDisplay into HTML.

    :param ctx: Render context
    :param callbacks: Optional callbacks, each applied to a copy of the display before it is rendered
    :return: Function that renders display objects to HTML
    """

    def _oedisp_to_html(disp: oedepict.OE2DMolDisplay) -> str:
        # Anything that is not a valid display object is shown as plain text
        if not (isinstance(disp, oedepict.OE2DMolDisplay) and disp.IsValid()):
            return str(disp)

        # Callbacks operate on a copy so the caller's display object stays untouched
        working_copy = oedepict.OE2DMolDisplay(disp)

        if callbacks is not None:
            for modify in callbacks:
                modify(working_copy)

        return oedisp_to_html(working_copy, ctx=ctx)

    return _oedisp_to_html
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def escape_formatter(obj: typing.Any) -> str:
    """
    Default cell formatter: stringify the value and pass it through escape_brackets.

    :param obj: Cell value of any type
    :return: Escaped text for the cell
    """
    text = str(obj)
    return escape_brackets(text)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def render_polars_dataframe(
        df: pl.DataFrame,
        formatters: dict | None = None,
        col_space: dict[str, float | int] | None = None,
        ctx: CNotebookContext | None = None,
        **kwargs
) -> str:
    """
    Render a Polars DataFrame with molecules to HTML.

    This is a native Polars implementation that renders molecule and display
    columns directly without converting to pandas.

    Molecule (oepolars.MoleculeType) and display (oepolars.DisplayType) columns always
    get their own formatter and column width; entries supplied for those columns in
    ``formatters`` / ``col_space`` are overridden. The dictionaries passed in by the
    caller are copied and never modified.

    :param df: Polars DataFrame to render
    :param formatters: Custom formatters for displaying columns (column name -> callable
        taking the cell value and returning the cell HTML, which is inserted as-is)
    :param col_space: Custom column spacing (column name -> minimum width in pixels)
    :param ctx: Local rendering context (optional). When given it takes precedence over
        the context stored for a DataFrame column and over the series metadata.
    :param kwargs: Additional keyword arguments (currently unused, kept for API compatibility)
    :return: HTML of rendered DataFrame
    """
    # Work on private copies so the caller's dictionaries are never modified
    formatters = dict(formatters) if formatters else {}
    col_space = dict(col_space) if col_space else {}

    # Identify molecule and display columns
    molecule_columns: set[str] = set()
    display_columns: set[str] = set()

    # Capture metadata from ORIGINAL DataFrame and create formatters
    for col in df.columns:
        dtype = df.schema[col]

        if isinstance(dtype, oeplr.MoleculeType):
            molecule_columns.add(col)

            # DataFrame-level column context (persists across column accesses)
            df_col_ctx = get_dataframe_column_context(df, col)

            # Precedence: explicit ctx > DataFrame-level context > series metadata
            if ctx is not None:
                series_ctx = ctx
            elif df_col_ctx is not None:
                series_ctx = df_col_ctx
            else:
                # Series metadata might be empty due to the ephemeral nature of Polars Series
                series = df.get_column(col)
                metadata = series.chem.metadata if hasattr(series, 'chem') else {}
                series_ctx = get_series_context(metadata)

            if col in formatters:
                log.warning(f'Overwriting existing formatter for {col} with a molecule formatter')

            formatters[col] = create_mol_formatter(ctx=series_ctx)

            # Record the column width
            if col in col_space:
                log.warning(f'Column spacing for {col} already defined, overwriting with molecule image width')

            col_space[col] = float(series_ctx.width)

        elif isinstance(dtype, oeplr.DisplayType):
            display_columns.add(col)

            # Get metadata from the original series via .chem.metadata
            series = df.get_column(col)
            metadata = series.chem.metadata if hasattr(series, 'chem') else {}

            # Get the cnotebook options for this column (use passed ctx if provided)
            series_ctx = ctx if ctx is not None else get_series_context(metadata)

            if col in formatters:
                log.warning(f'Overwriting existing formatter for {col} with a display formatter')

            formatters[col] = create_disp_formatter(ctx=series_ctx)

            # Column width is the widest display object in the column (0 when there is none)
            col_space[col] = max(
                [0, *(disp.GetWidth() for disp in series if isinstance(disp, oedepict.OE2DMolDisplay))]
            )

    if len(molecule_columns) > 0:
        log.debug(f'Detected molecule columns: {", ".join(molecule_columns)}')

    if len(display_columns) > 0:
        log.debug(f'Detected display columns: {", ".join(display_columns)}')

    # All other columns get the escape formatter unless the caller supplied one
    for col in df.columns:
        if col not in display_columns and col not in molecule_columns:
            formatters.setdefault(col, escape_formatter)

    # Deep copy molecule columns to avoid modifying originals during rendering
    copied_molecule_series: dict[str, pl.Series] = {}
    for col in molecule_columns:
        series = df.get_column(col)
        if hasattr(series, 'chem') and hasattr(series.chem, 'deepcopy'):
            # Use oepolars deepcopy to create copies of molecules
            copied_series = series.chem.deepcopy()

            # Preserve metadata from original
            if hasattr(series.chem, 'metadata'):
                original_metadata = series.chem.metadata
                if original_metadata and hasattr(copied_series, 'chem'):
                    copied_series.chem.metadata.update(original_metadata)

            copied_molecule_series[col] = copied_series

    # Build HTML table natively
    html_parts = ['<table border="1" class="dataframe">', '<thead><tr style="text-align: right;">']

    # Header
    columns = df.columns
    for col in columns:
        width_style = ""
        if col in col_space:
            width_style = f' style="min-width: {col_space[col]}px;"'
        html_parts.append(f'<th{width_style}>{escape_brackets(str(col))}</th>')
    html_parts.append('</tr></thead>')

    # Resolve the series and formatter of every column once, instead of per cell.
    # Copied series are used for molecule columns, the original series for all others.
    render_series = [
        copied_molecule_series[col] if col in copied_molecule_series else df.get_column(col)
        for col in columns
    ]
    cell_formatters = [formatters.get(col, escape_formatter) for col in columns]

    # Body
    html_parts.append('<tbody>')
    for row_idx in range(len(df)):
        html_parts.append('<tr>')
        for series, formatter in zip(render_series, cell_formatters):
            html_parts.append(f'<td>{formatter(series[row_idx])}</td>')
        html_parts.append('</tr>')
    html_parts.append('</tbody>')

    html_parts.append('</table>')

    return ''.join(html_parts)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
########################################################################################################################
|
|
294
|
+
# Series accessor methods (monkey-patched onto oepolars)
|
|
295
|
+
########################################################################################################################
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _series_highlight(
        self,
        pattern: Iterable[str] | str | oechem.OESubSearch | Iterable[oechem.OESubSearch],
        *,
        color: oechem.OEColor | oechem.OEColorIter | None = None,
        style: int | Literal["overlay_default", "overlay_ball_and_stick"] = "overlay_default",
        ref: oechem.OESubSearch | oechem.OEMCSSearch | oechem.OEQMol | Literal["first"] | oechem.OEMolBase | None = None,
        method: Literal["ss", "substructure", "mcss", "fp", "fingerprint"] | None = None
) -> None:
    """
    Register highlighting of chemical features for a molecule series.

    One highlight callback per pattern is added to the series' cnotebook context,
    which is saved into the series metadata.

    Accepted patterns:
        - a SMARTS string
        - an oechem.OESubSearch, oechem.OEMCSSearch or oechem.OEQMol object
        - an iterable of any of the above

    :param pattern: Pattern(s) to highlight in the molecule.
    :param color: Highlight color(s). Can be a single oechem.OEColor or an oechem.OEColorIter
        (e.g., oechem.OEGetLightColors()). Defaults to oechem.OEGetLightColors().
    :param style: Highlight style. Can be an int (OEHighlightStyle constant) or a string
        ("overlay_default", "overlay_ball_and_stick"). Defaults to "overlay_default".
    :param ref: Optional reference for alignment.
    :param method: Optional alignment method.
    :raises TypeError: If the series is not a molecule column, or a pattern (or one of
        its elements) has an unsupported type.
    """
    if not isinstance(self._series.dtype, oeplr.MoleculeType):
        raise TypeError(
            "highlight only works on molecule columns (oepolars.MoleculeType). If this column has "
            "molecules, use series.chem.as_molecule() to convert to a molecule column first."
        )

    # The context is modified below, so it is created if needed and stored on the series
    ctx = get_series_context(self.metadata, save=True)

    # ********************************************************************************
    # Highlighting
    # ********************************************************************************

    query_types = (str, oechem.OESubSearch, oechem.OEMCSSearch, oechem.OEQMol)

    # Normalize a single pattern into a one-element sequence so both cases share one loop
    if isinstance(pattern, query_types):
        queries = (pattern,)
    elif isinstance(pattern, Iterable):
        queries = pattern
    else:
        raise TypeError(f'Do not know how to add molecule highlight for type {type(pattern).__name__}')

    for query in queries:
        if not isinstance(query, query_types):
            raise TypeError(f'Do not know how to add molecule highlight for type {type(query).__name__}')

        highlighter = create_structure_highlighter(
            query=query,
            color=color,
            style=style
        )
        ctx.add_callback(highlighter)

    # ********************************************************************************
    # Alignment
    # ********************************************************************************

    if ref is None:
        return

    # Alignment is optional: it is only applied when align_depictions is available
    if hasattr(self, 'align_depictions'):
        self.align_depictions(ref=ref, method=method)
    else:
        log.warning("align_depictions not available; ref parameter ignored")
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def _series_reset_depictions(self) -> None:
    """
    Reset depiction callbacks for a molecule series.

    The "cnotebook" entry is dropped from the series metadata, which discards any
    highlight callbacks stored there. Nothing happens when no such entry exists.
    """
    metadata = self.metadata
    metadata.pop("cnotebook", None)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def _series_clear_formatting_rules(self) -> None:
    """
    Remove all formatting rule callbacks from a molecule series.

    Callbacks are what gets applied to a molecule prior to rendering (such as
    highlighting). In contrast to reset_depictions, which discards the whole
    rendering context, only the callbacks are cleared here; other context settings
    like image dimensions and styling are preserved.
    """
    stored = self.metadata.get("cnotebook", None)

    # Nothing to clear unless a real context object is stored in the metadata
    if isinstance(stored, CNotebookContext):
        stored.reset_callbacks()
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def _series_recalculate_depiction_coordinates(
        self,
        *,
        clear_coords: bool = True,
        add_depiction_hydrogens: bool = True,
        perceive_bond_stereo: bool = True,
        suppress_explicit_hydrogens: bool = True,
        orientation: int = oedepict.OEDepictOrientation_Default
) -> None:
    """
    Run OEPrepareDepiction on every molecule of a molecule series.

    See the following link for more information:
    https://docs.eyesopen.com/toolkits/python/depicttk/OEDepictClasses/OEPrepareDepictionOptions.html

    :param clear_coords: Clear existing 2D coordinates
    :param add_depiction_hydrogens: Add explicit depiction hydrogens for faithful stereo depiction, etc.
    :param perceive_bond_stereo: Perceive wedge/hash bond stereo
    :param suppress_explicit_hydrogens: Suppress explicit hydrogens
    :param orientation: Preferred 2D orientation
    :raises TypeError: If the series is not a molecule column
    """
    series = self._series
    if not isinstance(series.dtype, oeplr.MoleculeType):
        raise TypeError(
            "recalculate_depiction_coordinates only works on molecule columns (oepolars.MoleculeType). If this "
            "column has molecules, use series.chem.as_molecule() to convert to a molecule column first."
        )

    # Translate the keyword arguments into OpenEye depiction options
    depiction_options = oedepict.OEPrepareDepictionOptions()
    depiction_options.SetClearCoords(clear_coords)
    depiction_options.SetAddDepictionHydrogens(add_depiction_hydrogens)
    depiction_options.SetPerceiveBondStereo(perceive_bond_stereo)
    depiction_options.SetSuppressHydrogens(suppress_explicit_hydrogens)
    depiction_options.SetDepictOrientation(orientation)

    # Entries that are not molecule objects (e.g. missing values) are skipped
    for entry in series.to_list():
        if not isinstance(entry, oechem.OEMolBase):
            continue
        oedepict.OEPrepareDepiction(entry, depiction_options)
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def _series_align_depictions(
        self,
        ref: oechem.OESubSearch | oechem.OEMCSSearch | oechem.OEMolBase | oechem.OEQMol | Literal["first"],
        method: Literal["substructure", "ss", "mcss", "fp", "fingerprint"] | None = None,
        **kwargs
) -> None:
    """
    Align the 2D coordinates of molecules in a series.

    Alignment is best effort: if the aligner cannot be created or fails on a molecule,
    the error is logged at debug level and the remaining molecules are left unaligned.

    :param ref: Alignment reference (molecule, "first", or search object). With "first",
        a copy of the first valid molecule in the series is used.
    :param method: Alignment method
    :param kwargs: Keyword arguments for aligner
    :raises TypeError: If the series is not a molecule column
    """
    if not isinstance(self._series.dtype, oeplr.MoleculeType):
        raise TypeError(
            "align_depictions only works on molecule columns (oepolars.MoleculeType). If this "
            "column has molecules, use series.chem.as_molecule() to convert to a molecule column first."
        )

    # Get molecule list from series
    mols = self._series.to_list()

    # Handle "first" reference
    if isinstance(ref, str) and ref == "first":
        first_valid = next((mol for mol in mols if mol is not None and mol.IsValid()), None)
        if first_valid is None:
            log.warning("No valid molecule found in series for depiction alignment")
            return
        ref = first_valid.CreateCopy()

    # Make sure the reference has 2D coordinates. Search objects are not molecules,
    # so there is nothing to prepare for them.
    if not isinstance(ref, (oechem.OESubSearch, oechem.OEMCSSearch)):
        oedepict.OEPrepareDepiction(ref, False)

    # Suppress alignment warnings (there are lots of needless warnings)
    level = oechem.OEThrow.GetLevel()
    oechem.OEThrow.SetLevel(oechem.OEErrorLevel_Error)

    # noinspection PyBroadException
    try:
        # Create the aligner
        from .align import create_aligner
        aligner = create_aligner(ref=ref, method=method, **kwargs)

        for mol in mols:
            if mol is not None:
                _ = aligner(mol)

    except Exception:
        # We don't care if the aligners fail - it just results in unaligned structures (NBD).
        # Leave a trace for debugging instead of dropping the error entirely.
        log.debug("Depiction alignment failed; structures are left unaligned", exc_info=True)

    finally:
        # Always restore the OEThrow level, even if something unexpected propagates
        oechem.OEThrow.SetLevel(level)
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
# Monkey-patch onto oepolars SeriesChemNamespace
# Each assignment below attaches one of the module-level functions defined above as a
# method of the oepolars series namespace class.
# Note: Series-level highlight is not registered because Polars Series are ephemeral and
# metadata doesn't persist across column accesses. Use df.chem.highlight() instead.
from oepolars.namespaces.series import SeriesChemNamespace
SeriesChemNamespace.reset_depictions = _series_reset_depictions
SeriesChemNamespace.clear_formatting_rules = _series_clear_formatting_rules
SeriesChemNamespace.recalculate_depiction_coordinates = _series_recalculate_depiction_coordinates
SeriesChemNamespace.align_depictions = _series_align_depictions
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
########################################################################################################################
|
|
513
|
+
# DataFrame accessor methods (monkey-patched onto oepolars)
|
|
514
|
+
########################################################################################################################
|
|
515
|
+
|
|
516
|
+
# Regular expression for splitting SMARTS patterns: a delimiter is a run of one or more
# pipe, carriage-return, newline or tab characters, together with any whitespace around it
import re
SMARTS_DELIMITER_RE = re.compile(r'\s*[|\r\n\t]+\s*')

# Store the fingerprint tag for fingerprint_similarity
_fingerprint_overlap_tag = oechem.OEGetTag("fingerprint_overlap")
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
class ColorBondByOverlapScore(oegrapheme.OEBondGlyphBase):
    """Bond glyph that colors bonds by fingerprint overlap score.

    Used internally by fingerprint similarity visualization to highlight
    bonds based on their contribution to molecular similarity.

    See: https://docs.eyesopen.com/toolkits/cookbook/python/depiction/simcalc.html
    """

    def __init__(self, cg: oechem.OELinearColorGradient, tag: int):
        """Create a bond coloring glyph.

        :param cg: Color gradient to map overlap scores to colors.
        :param tag: OEChem data tag containing overlap scores on bonds.
        """
        oegrapheme.OEBondGlyphBase.__init__(self)
        self.colorg = cg
        self.tag = tag

    # noinspection PyPep8Naming
    def RenderGlyph(self, disp, bond):
        """Draw a colored line underneath a bond.

        :param disp: Molecule display the bond belongs to.
        :param bond: Bond to decorate.
        :return: False when the bond is not visible or carries no score under the tag,
            True once the line has been drawn.
        """
        bdisp = disp.GetBondDisplay(bond)
        if bdisp is None or not bdisp.IsVisible():
            return False

        if not bond.HasData(self.tag):
            return False

        # Line width scales with the display; color comes from the bond's stored score
        linewidth = disp.GetScale() / 3.0
        color = self.colorg.GetColorAt(bond.GetData(self.tag))
        pen = oedepict.OEPen(color, color, oedepict.OEFill_Off, linewidth)

        adispB = disp.GetAtomDisplay(bond.GetBgn())
        adispE = disp.GetAtomDisplay(bond.GetEnd())

        # Draw on the layer below the molecule so the bond itself stays on top
        layer = disp.GetLayer(oedepict.OELayerPosition_Below)
        layer.DrawLine(adispB.GetCoords(), adispE.GetCoords(), pen)

        return True

    # noinspection PyPep8Naming
    def CreateCopy(self):
        """Return a disowned copy of this glyph with the same gradient and tag.

        This is the copy hook of the OEBondGlyphBase interface; the original code only
        provided it under the class's own name.
        """
        return ColorBondByOverlapScore(self.colorg, self.tag).__disown__()

    # noinspection PyPep8Naming
    def ColorBondByOverlapScore(self):
        """Legacy name for CreateCopy, kept so existing callers keep working."""
        return self.CreateCopy()
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
def _dataframe_reset_depictions(self, *, molecule_columns: str | Iterable[str] | None = None) -> None:
    """
    Reset depiction callbacks for one or more molecule columns in the DataFrame.

    For every selected molecule column the cnotebook context is removed from the series
    metadata, and the context stored for that column at the DataFrame level is dropped
    as well. The DataFrame-level context is the one rendering prefers, so without
    removing it a reset would have no visible effect on it.

    Names that are not columns of the DataFrame, and columns that are not molecule
    columns, are silently ignored.

    :param molecule_columns: Optional molecule column(s) to reset. If None, resets all molecule columns.
    """
    if molecule_columns is None:
        columns = set(self._df.columns)
    elif isinstance(molecule_columns, str):
        columns = {molecule_columns}
    else:
        columns = set(molecule_columns)

    # Locate the DataFrame-level contexts registered for this exact DataFrame, if any
    _cleanup_dead_contexts()
    entry = _dataframe_column_contexts.get(id(self._df))
    if entry is not None and entry[0]() is self._df:
        df_level_contexts = entry[1]
    else:
        df_level_contexts = None

    existing_columns = set(self._df.columns)
    schema = self._df.schema

    for col in columns:
        # Filter invalid and non-molecule columns
        if col not in existing_columns or not isinstance(schema[col], oeplr.MoleculeType):
            continue

        # Series-level context (stored in the series metadata)
        self._df.get_column(col).chem.reset_depictions()

        # DataFrame-level context (stored in the module registry)
        if df_level_contexts is not None:
            df_level_contexts.pop(col, None)
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
def _dataframe_clear_formatting_rules(self, molecule_columns: str | Iterable[str] | None = None) -> None:
    """
    Remove all formatting rule callbacks from one or more molecule columns.

    Callbacks are what gets applied to molecules prior to rendering (such as
    highlighting). In contrast to reset_depictions, which discards the whole
    rendering context, only the callbacks are cleared here; other context settings
    like image dimensions and styling are preserved.

    :param molecule_columns: Optional molecule column(s) to clear formatting rules from.
        If None, clears formatting rules from all molecule columns.

    Example::

        # Clear formatting rules from all molecule columns
        df.chem.clear_formatting_rules()

        # Clear formatting rules from a specific column
        df.chem.clear_formatting_rules("smiles")

        # Clear formatting rules from multiple columns
        df.chem.clear_formatting_rules(["mol1", "mol2"])
    """
    # Normalize the selection into a set of candidate column names
    if molecule_columns is None:
        candidates = set(self._df.columns)
    elif isinstance(molecule_columns, str):
        candidates = {molecule_columns}
    else:
        candidates = set(molecule_columns)

    for name in candidates:
        # Unknown names and non-molecule columns are skipped
        if name not in self._df.columns:
            continue
        if not isinstance(self._df.schema[name], oeplr.MoleculeType):
            continue

        # Clear the callbacks of the DataFrame-level column context, if one exists
        column_ctx = get_dataframe_column_context(self._df, name)
        if column_ctx is not None:
            column_ctx.reset_callbacks()
|
|
636
|
+
|
|
637
|
+
|
|
638
|
+
def _dataframe_recalculate_depiction_coordinates(
        self,
        *,
        molecule_columns: str | Iterable[str] | None = None,
        clear_coords: bool = True,
        add_depiction_hydrogens: bool = True,
        perceive_bond_stereo: bool = True,
        suppress_explicit_hydrogens: bool = True,
        orientation: int = oedepict.OEDepictOrientation_Default
) -> None:
    """
    Recalculate the depictions for one or more molecule series in a DataFrame. With the default
    molecule_columns=None, every molecule column has its depictions recalculated.

    Requested columns that are missing from the DataFrame, or that are not molecule columns,
    are reported with a warning and skipped.

    See the following link for more information:
    https://docs.eyesopen.com/toolkits/python/depicttk/OEDepictClasses/OEPrepareDepictionOptions.html

    :param molecule_columns: Optional molecule column(s) to have depictions recalculated
    :param clear_coords: Clear existing 2D coordinates
    :param add_depiction_hydrogens: Add explicit depiction hydrogens for faithful stereo depiction, etc.
    :param perceive_bond_stereo: Perceive wedge/hash bond stereo
    :param suppress_explicit_hydrogens: Suppress explicit hydrogens
    :param orientation: Preferred 2D orientation
    """
    frame = self._df

    # Normalize the selection into a set of column names
    if molecule_columns is None:
        molecule_columns = {
            name for name in frame.columns
            if isinstance(frame.schema[name], oeplr.MoleculeType)
        }
    elif isinstance(molecule_columns, str):
        molecule_columns = {molecule_columns}
    else:
        molecule_columns = set(molecule_columns)

    # Recalculate the column depictions
    for col in molecule_columns:

        if col not in frame.columns:
            log.warning(f'{col} not found in DataFrame columns: ({", ".join(self._df.columns)})')
            continue

        if not isinstance(frame.schema[col], oeplr.MoleculeType):
            log.warning(f'Column {col} does not have a MoleculeType')
            continue

        frame.get_column(col).chem.recalculate_depiction_coordinates(
            clear_coords=clear_coords,
            add_depiction_hydrogens=add_depiction_hydrogens,
            perceive_bond_stereo=perceive_bond_stereo,
            suppress_explicit_hydrogens=suppress_explicit_hydrogens,
            orientation=orientation
        )
|
|
693
|
+
|
|
694
|
+
|
|
695
|
+
def _dataframe_highlight(
    self,
    molecule_column: str,
    pattern: Iterable[str] | str | oechem.OESubSearch | Iterable[oechem.OESubSearch],
    *,
    color: oechem.OEColor | oechem.OEColorIter | None = None,
    style: int | Literal["overlay_default", "overlay_ball_and_stick"] = "overlay_default",
) -> None:
    """
    Register substructure highlighting for one molecule column of the DataFrame.

    One highlighting callback is created per pattern and added to the rendering context that is
    looked up (or created and stored) for the DataFrame/column pair, rather than on a Series object.

    Accepted values for ``pattern``:
        - a SMARTS string
        - an oechem.OESubSearch, oechem.OEMCSSearch or oechem.OEQMol object
        - an iterable of any of the above (one callback is added per element, in iteration order)

    :param molecule_column: Name of the molecule column to highlight.
    :param pattern: Pattern(s) to highlight in the molecules.
    :param color: Highlight color(s), either a single oechem.OEColor or an oechem.OEColorIter
        (e.g., oechem.OEGetLightColors()). Passed through unchanged to the highlighter factory.
    :param style: Highlight style, either an int (OEHighlightStyle constant) or one of the strings
        "overlay_default" / "overlay_ball_and_stick". Passed through unchanged to the highlighter factory.
    :raises ValueError: If ``molecule_column`` is not a column of the DataFrame.
    :raises TypeError: If the column is not an oepolars.MoleculeType column, or if ``pattern`` is
        neither a supported single pattern nor an iterable.
    """
    frame = self._df

    # The target must exist and must hold molecules
    if molecule_column not in frame.columns:
        raise ValueError(f'Column {molecule_column} not found in DataFrame columns: ({", ".join(frame.columns)})')

    column_type = frame.schema[molecule_column]
    if not isinstance(column_type, oeplr.MoleculeType):
        raise TypeError(
            f"highlight only works on molecule columns (oepolars.MoleculeType). Column '{molecule_column}' "
            f"has type {column_type}."
        )

    # Reuse the context already attached to this DataFrame column, or attach a fresh one
    column_ctx = get_dataframe_column_context(frame, molecule_column)
    if column_ctx is None:
        column_ctx = create_local_context()
        set_dataframe_column_context(frame, molecule_column, column_ctx)

    # A single pattern is treated as a one-element sequence so both cases share the loop below
    single_pattern_types = (str, oechem.OESubSearch, oechem.OEMCSSearch, oechem.OEQMol)
    if isinstance(pattern, single_pattern_types):
        queries = (pattern,)
    elif isinstance(pattern, Iterable):
        queries = pattern
    else:
        raise TypeError(f'Unsupported type for pattern: {type(pattern).__name__}')

    for query in queries:
        column_ctx.add_callback(create_structure_highlighter(query, color=color, style=style))
|
|
748
|
+
|
|
749
|
+
|
|
750
|
+
def _dataframe_copy_molecules(
    self,
    source_column: str,
    dest_column: str,
) -> pl.DataFrame:
    """
    Add a new column holding copies of the molecules from an existing molecule column.

    The copying itself is delegated to the Series-level ``chem.copy_molecules()`` accessor; the
    resulting Series is renamed to ``dest_column`` and attached with ``with_columns``. The intent is
    that the new column can be modified (e.g. highlighted or aligned) without touching the source.

    :param source_column: Name of the source molecule column.
    :param dest_column: Name of the column that receives the copied molecules.
    :returns: New DataFrame with the copied molecule column added.
    :raises ValueError: If ``source_column`` is not a column of the DataFrame.
    :raises TypeError: If ``source_column`` is not an oepolars.MoleculeType column.

    Example::

        # Create a copy of molecules for alignment
        df = df.chem.copy_molecules("Original", "Aligned")
        df.chem.highlight("Aligned", "c1ccccc1")
    """
    frame = self._df

    if source_column not in frame.columns:
        raise ValueError(f'Column {source_column} not found in DataFrame columns: ({", ".join(frame.columns)})')

    source_type = frame.schema[source_column]
    if not isinstance(source_type, oeplr.MoleculeType):
        raise TypeError(
            f"copy_molecules only works on molecule columns (oepolars.MoleculeType). Column '{source_column}' "
            f"has type {source_type}."
        )

    # Copy at the Series level, then attach the result under the requested name
    duplicated = frame.get_column(source_column).chem.copy_molecules()
    return frame.with_columns(duplicated.alias(dest_column))
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
def _dataframe_highlight_using_column(
    self,
    molecule_column: str,
    pattern_column: str,
    *,
    highlighted_column: str = "highlighted_substructures",
    color: oechem.OEColor | oechem.OEColorIter | None = None,
    style: int | Literal["overlay_default", "overlay_ball_and_stick"] = "overlay_default",
    ref: oechem.OESubSearch | oechem.OEMCSSearch | oechem.OEMolBase | None = None,
    alignment_opts: oedepict.OEAlignmentOptions | None = None,
    prepare_opts: oedepict.OEPrepareDepictionOptions | None = None,
    inplace: bool = False
) -> pl.DataFrame:
    """
    Highlight molecules based on the value of another column. The column produced is a DisplayType column, so
    the results are not suitable for other molecular calculations.

    Each row's value in the pattern column can be:
        - Comma or whitespace delimited string of SMARTS patterns
        - oechem.OESubSearch or oechem.OEMCSSearch object
        - Iterable of SMARTS patterns, oechem.OESubSearch, and/or oechem.OEMCSSearch objects
        - None (the molecule is depicted without highlighting)

    Invalid SMARTS / search objects are skipped. Rows whose molecule is missing or invalid produce a
    null entry in the highlighted column.

    :param molecule_column: Name of the molecule column.
    :param pattern_column: Name of the pattern column.
    :param highlighted_column: Optional name of the column with highlighted structures.
    :param color: Highlight color(s). Can be a single oechem.OEColor or an oechem.OEColorIter
        (e.g., oechem.OEGetLightColors()). Defaults to oechem.OEGetLightColors(). With a non-overlay
        style only the first color of an iterator is used (oechem.OELightBlue if it yields nothing).
    :param style: Highlight style. Can be an int (OEHighlightStyle constant) or a string
        ("overlay_default", "overlay_ball_and_stick"). Defaults to "overlay_default". Overlay styles
        combined with a single oechem.OEColor fall back to ball-and-stick highlighting with a warning.
    :param ref: Accepted for API compatibility; not used by this implementation.
    :param alignment_opts: Accepted for API compatibility; not used by this implementation.
    :param prepare_opts: Accepted for API compatibility; not used by this implementation.
    :param inplace: Accepted for API compatibility; not used (a new DataFrame is always returned).
    :returns: New DataFrame with the highlighted column added.
    :raises KeyError: If ``molecule_column`` or ``pattern_column`` is not a column of the DataFrame.
    :raises TypeError: If ``molecule_column`` is not an oepolars.MoleculeType column.
    """
    df = self._df

    if molecule_column not in df.columns:
        raise KeyError(f'{molecule_column} not found in DataFrame columns: ({", ".join(df.columns)})')

    if not isinstance(df.schema[molecule_column], oeplr.MoleculeType):
        raise TypeError(
            f"highlight_using_column only works on molecule columns (oepolars.MoleculeType). If {molecule_column}"
            " has molecules, use df.chem.as_molecule() to convert to a molecule column first."
        )

    if pattern_column not in df.columns:
        raise KeyError(f'{pattern_column} not found in DataFrame columns: ({", ".join(df.columns)})')

    # Default color
    if color is None:
        color = oechem.OEGetLightColors()

    # Determine highlighting approach based on style
    use_overlay = isinstance(style, str) and style in ("overlay_default", "overlay_ball_and_stick")

    # Check if color is compatible with overlay
    if use_overlay and isinstance(color, oechem.OEColor):
        log.warning(
            "Overlay coloring is not compatible with a single oechem.OEColor. Falling back to standard highlighting")
        use_overlay = False
        style = oedepict.OEHighlightStyle_BallAndStick

    # Resolve the highlight object / color once, up front. Doing this per row would read from the same
    # color iterator again for every row (NOTE(review): OpenEye iterators appear to be single-pass,
    # which would make later rows see different or no colors -- confirm against the toolkit).
    overlay_highlight = None
    highlight_color = None
    if use_overlay:
        overlay_highlight = oedepict.OEHighlightOverlayByBallAndStick(color)
    elif isinstance(color, oechem.OEColor):
        highlight_color = color
    else:
        highlight_color = next(iter(color), oechem.OELightBlue)

    # Get the rendering context for creating the displays
    series = df.get_column(molecule_column)
    metadata = series.chem.metadata if hasattr(series, 'chem') else {}
    ctx = get_series_context(metadata)

    # Create the display objects, one per row (None for rows without a valid molecule)
    displays = []
    row_values = zip(series.to_list(), df.get_column(pattern_column).to_list())

    for mol, patterns in row_values:
        if not (isinstance(mol, oechem.OEMolBase) and mol.IsValid()):
            displays.append(None)
            continue

        disp = oemol_to_disp(mol, ctx=ctx)

        for ss in _highlight_searches_from_value(patterns):
            if use_overlay:
                oedepict.OEAddHighlightOverlay(disp, overlay_highlight, ss.Match(mol, True))
            else:
                for match in ss.Match(mol, True):
                    oedepict.OEAddHighlighting(disp, highlight_color, style, match)

        displays.append(disp)

    # Create the new column with DisplayType (must instantiate the type) and attach it
    display_series = pl.Series(highlighted_column, displays, dtype=oeplr.DisplayType())
    return df.with_columns(display_series)


def _highlight_searches_from_value(patterns) -> list:
    """
    Turn one pattern-column value into a list of valid search objects.

    Strings are split with SMARTS_DELIMITER_RE and each non-empty token is compiled into an
    oechem.OESubSearch; oechem.OESubSearch / oechem.OEMCSSearch objects are used as-is. Anything
    that is invalid is dropped, and unsupported types are reported with a warning.

    :param patterns: A SMARTS string, a search object, an iterable of those, or None.
    :returns: List of valid search objects (possibly empty).
    """
    search_types = (oechem.OESubSearch, oechem.OEMCSSearch)

    if patterns is None:
        return []

    # Treat a single string / search object as a one-element sequence
    if isinstance(patterns, (str, *search_types)):
        items = (patterns,)
    elif isinstance(patterns, Iterable):
        items = patterns
    else:
        log.warning(f'Do not know how to highlight using: {type(patterns).__name__}')
        return []

    searches = []
    for item in items:
        if isinstance(item, str):
            for smarts in re.split(SMARTS_DELIMITER_RE, item):
                # Leading/trailing/repeated delimiters produce empty tokens; they carry no pattern
                if not smarts:
                    continue
                ss = oechem.OESubSearch(smarts)
                if ss.IsValid():
                    searches.append(ss)

        elif isinstance(item, search_types):
            if item.IsValid():
                searches.append(item)

        else:
            log.warning(f'Do not know how to highlight using: {type(item).__name__}')

    return searches
|
|
931
|
+
|
|
932
|
+
|
|
933
|
+
def _dataframe_fingerprint_similarity(
    self,
    molecule_column: str,
    ref: oechem.OEMolBase | None = None,
    *,
    tanimoto_column: str = "fingerprint_tanimoto",
    reference_similarity_column: str = "reference_similarity",
    target_similarity_column: str = "target_similarity",
    fptype: str = "tree",
    num_bits: int = 4096,
    min_distance: int = 0,
    max_distance: int = 4,
    atom_type: str | int = oegraphsim.OEFPAtomType_DefaultTreeAtom,
    bond_type: str | int = oegraphsim.OEFPBondType_DefaultTreeBond,
    inplace: bool = False
) -> pl.DataFrame:
    """
    Color molecules by fingerprint similarity.

    For every valid molecule in ``molecule_column`` this computes the Tanimoto score against the
    reference fingerprint and builds two depictions (reference and target) whose bonds are colored
    by how many fingerprint overlaps they take part in. Rows with a missing/invalid molecule or an
    invalid fingerprint get null in all three new columns.

    If no usable reference is available (no valid molecule in the column, an invalid ``ref``, or an
    invalid reference fingerprint) a warning is logged and the DataFrame is returned without the
    new columns.

    :param molecule_column: Name of the molecule column
    :param ref: Reference molecule (if None, uses first valid molecule)
    :param tanimoto_column: Name of the tanimoto score column
    :param reference_similarity_column: Name of the reference display column
    :param target_similarity_column: Name of the target display column
    :param fptype: Fingerprint type
    :param num_bits: Number of bits in the fingerprint
    :param min_distance: Minimum distance/radius for path/circular/tree
    :param max_distance: Maximum distance/radius for path/circular/tree
    :param atom_type: Atom type bitmask
    :param bond_type: Bond type bitmask
    :param inplace: Not used (Polars DataFrames are immutable), kept for API compatibility
    :return: DataFrame with similarity columns
    :raises KeyError: If ``molecule_column`` is not a column of the DataFrame.
    :raises TypeError: If ``molecule_column`` is not an oepolars.MoleculeType column.
    """
    tag = _fingerprint_overlap_tag
    df = self._df

    if molecule_column not in df.columns:
        raise KeyError(f'Molecule column not found in DataFrame: {molecule_column}')

    if not isinstance(df.schema[molecule_column], oeplr.MoleculeType):
        raise TypeError(
            f"Column {molecule_column} does not have MoleculeType ({df.schema[molecule_column]})"
        )

    # Get the context for rendering
    series = df.get_column(molecule_column)
    metadata = series.chem.metadata if hasattr(series, 'chem') else {}
    ctx = get_series_context(metadata)

    # Get molecule list
    mols = series.to_list()

    # If we're using the first valid molecule as our reference
    if ref is None:
        for mol in mols:
            if mol is not None and mol.IsValid():
                ref = mol
                break
        else:
            log.warning(f'No valid reference molecules to use for alignment in column {molecule_column}')
            return df

    # Check reference molecule
    if not ref.IsValid():
        log.warning("Reference molecule is not valid")
        return df

    # Fingerprint maker
    make_fp = fingerprint_maker(
        fptype=fptype,
        num_bits=num_bits,
        min_distance=min_distance,
        max_distance=max_distance,
        atom_type=atom_type,
        bond_type=bond_type
    )

    # Make the reference fingerprint
    ref_fp = make_fp(ref)

    if not ref_fp.IsValid():
        log.warning("Fingerprint from reference molecule is invalid")
        return df

    # The fingerprint type is the same for every row
    fp_type = ref_fp.GetFPTypeBase()

    # Per-row results
    ref_displays = []
    targ_displays = []
    tanimotos = []

    # Molecule copies are kept alive alongside the displays (cache to prevent GC)
    ref_molecules = []
    targ_molecules = []

    for mol in mols:

        # Molecule was missing or invalid
        if mol is None or not mol.IsValid():
            tanimotos.append(None)
            ref_displays.append(None)
            targ_displays.append(None)
            continue

        # Copy the molecules, because we're modifying them
        targ_mol = oechem.OEMol(mol)
        ref_mol = oechem.OEMol(ref)

        # Cache molecules to prevent GC
        targ_molecules.append(targ_mol)
        ref_molecules.append(ref_mol)

        # Create the fingerprint
        targ_fp = make_fp(targ_mol)

        # Fingerprint was invalid
        if not targ_fp.IsValid():
            tanimotos.append(None)
            ref_displays.append(None)
            targ_displays.append(None)
            continue

        # Add the tanimoto
        tanimotos.append(oegraphsim.OETanimoto(ref_fp, targ_fp))

        # Per-bond overlap counters, indexed by bond index
        targ_bonds = oechem.OEUIntArray(targ_mol.GetMaxBondIdx())
        ref_bonds = oechem.OEUIntArray(ref_mol.GetMaxBondIdx())

        # Count how many overlaps each bond participates in
        for match in oegraphsim.OEGetFPOverlap(ref_mol, targ_mol, fp_type):
            for bond in match.GetPatternBonds():
                ref_bonds[bond.GetIdx()] += 1
            for bond in match.GetTargetBonds():
                targ_bonds[bond.GetIdx()] += 1

        for bond in targ_mol.GetBonds():
            bond.SetData(tag, targ_bonds[bond.GetIdx()])

        for bond in ref_mol.GetBonds():
            bond.SetData(tag, ref_bonds[bond.GetIdx()])

        # A molecule without bonds (e.g. a single atom) has an empty counter array; default=0 keeps
        # max() from raising ValueError and aborting the whole DataFrame operation.
        # noinspection PyTypeChecker
        maxvalue = max(max(targ_bonds, default=0), max(ref_bonds, default=0))

        # Create the color gradient
        colorg = oechem.OELinearColorGradient()
        colorg.AddStop(oechem.OEColorStop(0.0, oechem.OEPinkTint))
        colorg.AddStop(oechem.OEColorStop(1.0, oechem.OEYellow))
        colorg.AddStop(oechem.OEColorStop(maxvalue, oechem.OEDarkGreen))

        # Function that will color the bonds
        bondglyph = ColorBondByOverlapScore(colorg, tag)

        # Align the molecules (the overlaps are requested again because the first pass iterated them)
        overlaps = oegraphsim.OEGetFPOverlap(ref_mol, targ_mol, fp_type)
        oedepict.OEPrepareMultiAlignedDepiction(targ_mol, ref_mol, overlaps)

        # Create the displays
        ref_disp = oemol_to_disp(ref_mol, ctx=ctx)
        targ_disp = oemol_to_disp(targ_mol, ctx=ctx)

        # Color the displays
        oegrapheme.OEAddGlyph(ref_disp, bondglyph, oechem.IsTrueBond())
        oegrapheme.OEAddGlyph(targ_disp, bondglyph, oechem.IsTrueBond())

        ref_displays.append(ref_disp)
        targ_displays.append(targ_disp)

    # Create the columns
    tanimoto_series = pl.Series(tanimoto_column, tanimotos, dtype=pl.Float64)
    ref_series = pl.Series(reference_similarity_column, ref_displays, dtype=oeplr.DisplayType())
    targ_series = pl.Series(target_similarity_column, targ_displays, dtype=oeplr.DisplayType())

    # Store molecule references in metadata to prevent GC (same as pandas version)
    ref_series.chem.metadata["molecules"] = ref_molecules  # noqa
    targ_series.chem.metadata["molecules"] = targ_molecules  # noqa

    # Add the columns to the DataFrame
    return df.with_columns([tanimoto_series, ref_series, targ_series])
|
|
1113
|
+
|
|
1114
|
+
|
|
1115
|
+
# Monkey-patch the DataFrame-level helpers defined above onto the oepolars DataFrameChemNamespace
from oepolars.namespaces.dataframe import DataFrameChemNamespace

for _accessor_name, _accessor_impl in (
    ("reset_depictions", _dataframe_reset_depictions),
    ("clear_formatting_rules", _dataframe_clear_formatting_rules),
    ("recalculate_depiction_coordinates", _dataframe_recalculate_depiction_coordinates),
    ("highlight", _dataframe_highlight),
    ("highlight_using_column", _dataframe_highlight_using_column),
    ("fingerprint_similarity", _dataframe_fingerprint_similarity),
    ("copy_molecules", _dataframe_copy_molecules),
):
    setattr(DataFrameChemNamespace, _accessor_name, _accessor_impl)

# Do not leave the loop temporaries behind in the module namespace
del _accessor_name, _accessor_impl
|
|
1124
|
+
|
|
1125
|
+
|
|
1126
|
+
########################################################################################################################
|
|
1127
|
+
# MolGrid accessor methods for Series and DataFrame
|
|
1128
|
+
########################################################################################################################
|
|
1129
|
+
|
|
1130
|
+
|
|
1131
|
+
def _polars_series_molgrid(
    self,
    title: bool | str | None = True,
    tooltip_fields: list[str] | None = None,
    **kwargs
) -> "MolGrid":
    """Build an interactive MolGrid from the molecules of this Series.

    The Series values are materialized into a Python list and handed to MolGrid together with
    the display options.

    :param title: Title display mode. True uses molecule's title, a string
        specifies a field name, None/False hides titles.
    :param tooltip_fields: Fields for tooltip.
    :param kwargs: Additional arguments passed to MolGrid.
    :returns: MolGrid instance.
    """
    # Imported here rather than at module import time
    from cnotebook import MolGrid

    molecules = list(self._series.to_list())
    return MolGrid(molecules, title=title, tooltip_fields=tooltip_fields, **kwargs)
|
|
1156
|
+
|
|
1157
|
+
|
|
1158
|
+
def _polars_dataframe_molgrid(
    self,
    mol_col: str,
    title: bool | str | None = True,
    tooltip_fields: list[str] | None = None,
    **kwargs
) -> "MolGrid":
    """Build an interactive MolGrid from one molecule column of this DataFrame.

    The molecules of ``mol_col`` are passed as the grid items; every other column is converted to
    plain Python values and bundled into a pandas DataFrame that MolGrid receives as its data source.

    :param mol_col: Column containing molecules.
    :param title: Title display mode. True uses molecule's title, a string
        specifies a field name, None/False hides titles.
    :param tooltip_fields: Columns for tooltip.
    :param kwargs: Additional arguments passed to MolGrid.
    :returns: MolGrid instance.
    """
    # Imported here rather than at module import time
    from cnotebook import MolGrid
    import pandas as pd

    frame = self._df
    molecules = list(frame[mol_col].to_list())

    # Companion pandas DataFrame built from the non-molecule columns. Values are pulled out as
    # Python objects (instead of converting the frame wholesale) to avoid pyarrow dependency issues.
    companion = pd.DataFrame({
        name: frame[name].to_list()
        for name in frame.columns
        if name != mol_col
    })

    return MolGrid(
        molecules,
        dataframe=companion,
        mol_col=mol_col,
        title=title,
        tooltip_fields=tooltip_fields,
        **kwargs
    )
|
|
1198
|
+
|
|
1199
|
+
|
|
1200
|
+
# Expose ``molgrid`` on both the Series and the DataFrame ``chem`` accessors
setattr(SeriesChemNamespace, "molgrid", _polars_series_molgrid)
setattr(DataFrameChemNamespace, "molgrid", _polars_dataframe_molgrid)
|
|
1203
|
+
|
|
1204
|
+
|
|
1205
|
+
########################################################################################################################
|
|
1206
|
+
# Register Polars formatters
|
|
1207
|
+
########################################################################################################################
|
|
1208
|
+
|
|
1209
|
+
if ipython_present:

    def register_polars_formatters():
        """
        Register Polars DataFrame formatters for iPython/Jupyter display.

        This registers render_polars_dataframe as the HTML formatter for
        Polars DataFrames in iPython environments. If no iPython instance is
        active, nothing is registered and a debug message is logged.

        Note: Calls to this function are idempotent.
        """
        ipython_instance = get_ipython()

        if ipython_instance is None:
            log.debug("[cnotebook] iPython installed but not in use - cannot register polars extension")
            return

        html_formatter = ipython_instance.display_formatter.formatters['text/html']

        # Query by *type*: ``lookup`` is the instance-based lookup, so passing the DataFrame class to
        # it resolves a printer for the class object rather than for DataFrame instances.
        try:
            current = html_formatter.lookup_by_type(pl.DataFrame)
        except KeyError:
            # No formatter registered for DataFrame (or any of its base classes) yet
            current = None

        if current is not render_polars_dataframe:
            html_formatter.for_type(pl.DataFrame, render_polars_dataframe)

else:

    # iPython is not present, so we do not register a Polars formatter
    def register_polars_formatters():
        """No-op stand-in used when ``ipython_present`` is false."""
        pass
|