fucciphase 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fucciphase/phase.py ADDED
@@ -0,0 +1,501 @@
1
+ from enum import Enum
2
+ from typing import List
3
+
4
+ import dtaidistance.preprocessing
5
+ import numpy as np
6
+ import pandas as pd
7
+ from dtaidistance.dtw import warping_amount
8
+ from dtaidistance.subsequence.dtw import subsequence_alignment
9
+ from scipy import interpolate, stats
10
+
11
+ from .sensor import FUCCISensor
12
+ from .utils import (
13
+ check_channels,
14
+ check_thresholds,
15
+ get_norm_channel_name,
16
+ get_time_distortion_coefficient,
17
+ )
18
+
19
+
20
class NewColumns(str, Enum):
    """Names of the DataFrame columns written by the analysis in this module.

    Each member's value is the literal column name. Because the enum mixes
    in ``str``, members compare equal to their column name.

    Attributes
    ----------
    CELL_CYCLE_PERC_DTW : str
        Cell cycle percentage estimated by subsequence alignment (DTW)
    CELL_CYCLE_PERC : str
        Cell cycle percentage estimated from the intensities
    PHASE : str
        Phase of the cell cycle
    DISCRETE_PHASE_MAX : str
        Discrete phase obtained by thresholding against the maximum intensity
    DISCRETE_PHASE_BG : str
        Discrete phase obtained by comparing against the background
    DISCRETE_PHASE_DIFF : str
        Discrete phase column with the ``DIFF`` suffix
    DTW_DISTORTION : str
        Time distortion score of the DTW warping path
    DTW_DISTORTION_REL : str
        Time distortion score divided by the track length
    DTW_DISTANCE : str
        DTW distance of the best match
    DTW_WARPING : str
        Warping amount of the DTW path
    REL_DTW_WARPING : str
        Warping amount divided by the track length
    """

    CELL_CYCLE_PERC_DTW = "CELL_CYCLE_PERC_DTW"
    CELL_CYCLE_PERC = "CELL_CYCLE_PERC"
    PHASE = "PHASE"
    DISCRETE_PHASE_MAX = "DISCRETE_PHASE_MAX"
    DISCRETE_PHASE_BG = "DISCRETE_PHASE_BG"
    DISCRETE_PHASE_DIFF = "DISCRETE_PHASE_DIFF"
    DTW_DISTORTION = "DTW_DISTORTION"
    DTW_DISTORTION_REL = "DTW_DISTORTION_REL"
    DTW_DISTANCE = "DTW_DISTANCE"
    DTW_WARPING = "DTW_WARP"
    REL_DTW_WARPING = "DTW_WARP_REL"

    # --- cell cycle percentage and phase columns ---------------------------

    @staticmethod
    def cell_cycle() -> str:
        """Return the name of the cell cycle percentage column."""
        return NewColumns.CELL_CYCLE_PERC.value

    @staticmethod
    def cell_cycle_dtw() -> str:
        """Return the name of the DTW-based cell cycle percentage column."""
        return NewColumns.CELL_CYCLE_PERC_DTW.value

    @staticmethod
    def phase() -> str:
        """Return the name of the phase column."""
        return NewColumns.PHASE.value

    # --- discrete phase columns --------------------------------------------

    @staticmethod
    def discrete_phase_max() -> str:
        """Return the name of the max-intensity based discrete phase column."""
        return NewColumns.DISCRETE_PHASE_MAX.value

    @staticmethod
    def discrete_phase_bg() -> str:
        """Return the name of the background based discrete phase column."""
        return NewColumns.DISCRETE_PHASE_BG.value

    @staticmethod
    def discrete_phase_diff() -> str:
        """Return the name of the ``DISCRETE_PHASE_DIFF`` column."""
        return NewColumns.DISCRETE_PHASE_DIFF.value

    # --- DTW quality metrics -----------------------------------------------

    @staticmethod
    def dtw_distance() -> str:
        """Return the name of the DTW distance column."""
        return NewColumns.DTW_DISTANCE.value

    @staticmethod
    def dtw_distortion() -> str:
        """Return the name of the DTW distortion column."""
        return NewColumns.DTW_DISTORTION.value

    @staticmethod
    def dtw_distortion_norm() -> str:
        """Return the name of the relative (normalized) DTW distortion column."""
        return NewColumns.DTW_DISTORTION_REL.value

    @staticmethod
    def dtw_warping_amount() -> str:
        """Return the name of the DTW warping amount column."""
        return NewColumns.DTW_WARPING.value

    @staticmethod
    def rel_dtw_warping_amount() -> str:
        """Return the name of the relative DTW warping amount column."""
        return NewColumns.REL_DTW_WARPING.value
98
+
99
+
100
def generate_cycle_phases(
    df: pd.DataFrame,
    channels: List[str],
    sensor: FUCCISensor,
    thresholds: List[float],
    estimate_percentage: bool = False,
) -> None:
    """Add the discrete cell cycle phase to the dataframe, in place.

    The phase is obtained by thresholding the normalized channel
    intensities, assuming a FUCCI sensor. Optionally, a cell cycle
    percentage column is added as well.

    The thresholds per channel must be between 0 and 1. They are given as a
    ratio of the maximum intensity and should reflect the expected
    background noise and the uncertainty of the intensity computation:
    a threshold of 0.1 means that every intensity below 0.1 times the
    maximum intensity is treated as background signal.

    Example:
        channels = ["CH1", "CH2"]
        thresholds = [0.1, 0.1]

    The sensor needs to be calibrated for each cell line. For that, record
    the FUCCI intensities of multiple cell cycles by live-cell fluorescence
    microscopy. See the examples for more details.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with columns holding normalized intensities
    channels: List[str]
        Names of channels (the normalized column names are derived from them)
    sensor: FUCCISensor
        FUCCI sensor with phase specifics
    thresholds: List[float]
        Thresholds to separate phases
    estimate_percentage: bool
        Estimate cell cycle percentages

    Raises
    ------
    ValueError
        If a normalized channel column is missing in the dataframe
    ValueError
        Raised by the channel / threshold checks that are called; the
        original documentation lists: number of thresholds is not 2, phases
        are not unique, thresholds are not between 0 and 1 (one excluded)
    """
    # collect the normalized column names, failing on the first missing one
    norm_channel_names = []
    for channel in channels:
        norm_name = get_norm_channel_name(channel)
        if norm_name in df.columns:
            norm_channel_names.append(norm_name)
            continue
        raise ValueError(
            f"Column {norm_name} not found, call "
            f"normalize_channel({channel}) on the dataframe."
        )

    # the number of channels has to match the sensor
    check_channels(sensor.fluorophores, channels)

    # discrete phases; normalized intensities need no background subtraction
    zero_background = [0] * sensor.fluorophores
    estimate_cell_phase_from_max_intensity(
        df,
        norm_channel_names,
        sensor,
        background=zero_background,
        thresholds=thresholds,
    )

    if not estimate_percentage:
        return

    # percentages are derived from the phase column written above
    estimate_cell_cycle_percentage(
        df, norm_channel_names, sensor, NewColumns.discrete_phase_max()
    )
180
+
181
+
182
def estimate_cell_cycle_percentage(
    df: pd.DataFrame, channels: List[str], sensor: FUCCISensor, phase_column: str
) -> None:
    """Estimate the cell cycle percentage of every row from its intensities.

    For each row, the intensities of ``channels`` and the phase stored in
    ``phase_column`` are passed to ``sensor.get_estimated_cycle_percentage``.
    The results are written in place to the ``CELL_CYCLE_PERC`` column.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with columns holding normalized intensities
    channels: List[str]
        Names of channels
    sensor: FUCCISensor
        FUCCI sensor with phase specifics
    phase_column: str
        Name of phase column
    """

    def _row_percentage(row: pd.Series) -> float:
        # intensities are gathered in the order given by ``channels``
        row_intensities = [row[name] for name in channels]
        return sensor.get_estimated_cycle_percentage(
            row[phase_column], row_intensities
        )

    # a plain list is assigned, i.e. values are matched to rows by position
    df[NewColumns.cell_cycle()] = [
        _row_percentage(row) for _, row in df.iterrows()
    ]
209
+
210
+
211
def estimate_cell_phase_from_max_intensity(
    df: pd.DataFrame,
    channels: List[str],
    sensor: FUCCISensor,
    background: List[float],
    thresholds: List[float],
) -> None:
    """Add the estimated cell cycle phase to the dataframe, in place.

    The phase is determined by thresholding the channel intensities and is
    written to the ``DISCRETE_PHASE_MAX`` column.

    The provided thresholds decide whether a channel is switched on (ON).
    For that, the background is subtracted from the intensity and the
    result is compared to the threshold times the maximum
    background-corrected intensity of that channel in the DataFrame.
    Hence, the threshold values should be between 0 and 1.
    This method will not work reliably if not enough cells from different
    phases are contained in the DataFrame.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe containing one intensity column per channel
    channels: List[str]
        Names of channels
    sensor: FUCCISensor
        FUCCI sensor with specific phase analysis information
    background: List[float]
        Single value per channel representing background
    thresholds: List[float]
        Thresholds to separate phases

    Raises
    ------
    ValueError
        If the dataframe does not contain one of the channels.
    ValueError
        If the number of background values differs from the number of
        channels.
    """
    # every requested channel has to be a column of the dataframe
    for name in channels:
        if name in df.columns:
            continue
        raise ValueError(
            f"Column {name} not found, provide correct input parameters."
        )

    if len(background) != len(channels):
        raise ValueError("Provide one background value per channel.")

    check_channels(sensor.fluorophores, channels)
    check_thresholds(sensor.fluorophores, thresholds)

    # one boolean ON / OFF series per channel
    on_off_per_channel = []
    for name, offset, ratio in zip(channels, background, thresholds):
        corrected = df[name] - offset
        on_off_per_channel.append(corrected > ratio * corrected.max())

    # transpose so that each entry holds the markers of a single row
    markers_per_row = np.transpose(np.array(on_off_per_channel))

    # a plain list is assigned, i.e. phases are matched to rows by position
    df[NewColumns.discrete_phase_max()] = [
        sensor.get_phase(markers) for markers in markers_per_row
    ]
274
+
275
+
276
def estimate_cell_phase_from_background(
    df: pd.DataFrame,
    channels: List[str],
    sensor: FUCCISensor,
    background: List[float],
    thresholds: List[float],
) -> None:
    """Add the estimated cell cycle phase to the dataframe, in place.

    The phase is determined by comparing the channel intensities to the
    respective background intensities and is written to the
    ``DISCRETE_PHASE_BG`` column.

    The provided factors (``thresholds``) decide whether a channel is
    switched on (ON): if the intensity exceeds the background level times
    the factor, the channel is ON. Hence, the factors should be greater
    than 0.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe containing one intensity column per channel
    channels: List[str]
        Names of channels
    sensor: FUCCISensor
        FUCCI sensor with specific phase analysis information
    background: List[float]
        Single value per channel representing background
    thresholds: List[float]
        Factor per channel applied to the background to separate phases

    Raises
    ------
    ValueError
        If the dataframe does not contain one of the channels.
    ValueError
        If the number of background values or thresholds differs from the
        number of channels.
    """
    # sanity check: check that channels are present
    for channel in channels:
        if channel not in df.columns:
            raise ValueError(
                f"Column {channel} not found, provide correct input parameters."
            )

    if len(channels) != len(background):
        raise ValueError("Provide one background value per channel.")

    check_channels(sensor.fluorophores, channels)

    # zip() would silently drop channels if a threshold were missing
    if len(channels) != len(thresholds):
        raise ValueError("Provide one threshold value per channel.")

    # threshold channels to decide if ON / OFF, one boolean series per channel
    phase_markers_list = [
        df[channel] > threshold * bg_value
        for channel, bg_value, threshold in zip(channels, background, thresholds)
    ]
    # transpose so that each entry holds the markers of a single row
    phase_markers_per_row = np.array(phase_markers_list).T

    phase_names = [sensor.get_phase(markers) for markers in phase_markers_per_row]

    # Reuse the dataframe index: assigning a Series aligns on index labels,
    # so a default RangeIndex would misplace the phases (or yield NaN) for
    # dataframes that were filtered, sorted or otherwise re-indexed.
    df[NewColumns.discrete_phase_bg()] = pd.Series(
        phase_names, dtype=str, index=df.index
    )
334
+
335
+
336
+ # flake8: noqa: C901
337
def _zscore_or_zero(values: np.ndarray, use_zscore_norm: bool) -> np.ndarray:
    """Return a float copy of ``values``, z-score normalized if requested."""
    values = np.array(values, dtype=float)
    if not use_zscore_norm:
        return values
    normalized = stats.zscore(values)
    # if all values are the same, the z-score is NaN everywhere; use zeros
    # (as an array of the same length) to avoid numerical issues
    if np.all(np.isnan(normalized)):
        return np.zeros_like(values)
    return normalized


def _difference_channel(values: np.ndarray, smooth: float, channel: str) -> np.ndarray:
    """Difference ``values``, falling back to no smoothing if smoothing fails."""
    try:
        return dtaidistance.preprocessing.differencing(values, smooth=smooth)
    except ValueError:
        print(
            "WARNING: The smoothing failed, continue without smoothing"
            f" for channel {channel}"
        )
        return dtaidistance.preprocessing.differencing(values)


def _prepare_alignment_series(
    curves,
    channels: List[str],
    smooth: float,
    use_zscore_norm: bool,
    use_derivative: bool,
) -> np.ndarray:
    """Preprocess one curve per channel and stack them as (time, channel)."""
    prepared = []
    for channel, curve in zip(channels, curves):
        curve = _zscore_or_zero(curve, use_zscore_norm)
        if use_derivative:
            curve = _difference_channel(curve, smooth, channel)
        prepared.append(curve)
    return np.swapaxes(np.array(prepared), 0, 1)


def estimate_percentage_by_subsequence_alignment(
    df: pd.DataFrame,
    dt: float,
    channels: List[str],
    reference_data: pd.DataFrame,
    smooth: float = 0.1,
    penalty: float = 0.05,
    track_id_name: str = "TRACK_ID",
    minimum_track_length: int = 10,
    use_zscore_norm: bool = True,
    use_derivative: bool = True,
) -> None:
    """Use subsequence alignment to estimate percentage.

    The reference curves are interpolated onto a time grid with spacing
    ``dt`` starting at 0. Each sufficiently long track is then aligned to
    the reference with ``subsequence_alignment`` and the reference
    percentages along the best warping path are written, in place, to the
    ``CELL_CYCLE_PERC_DTW`` column. For those tracks the DTW distance,
    distortion (absolute and relative) and warping amount (absolute and
    relative) columns are written as well. Shorter tracks keep NaN.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame with tracks
    dt: float
        Timestep between frames in hours
    channels: List[str]
        List of channels to be matched with reference data
    reference_data: pd.DataFrame
        Containing reference intensities over time; needs the columns
        ``time``, ``percentage`` and one column per channel
    smooth: float
        Smoothing factor, see dtaidistance documentation
    penalty: float
        Penalty for DTW algorithm, enforces diagonal warping path
    track_id_name: str
        Name of column with track IDs
    minimum_track_length: int
        Tracks shorter than this are skipped (their percentage stays NaN)
    use_zscore_norm: bool
        Use z-score normalization before differencing curves
        Probably not needed if intensities of reference and measured
        curve are similar
    use_derivative: bool
        Take derivative to perform alignment independent of intensity
        baseline (in default mode also after normalization)

    Raises
    ------
    ValueError
        If ``reference_data`` lacks the ``time`` or ``percentage`` column or
        one of the channels, or if the reference covers less than one
        timestep ``dt``.
    """
    if "time" not in reference_data:
        raise ValueError("Need to provide time column in reference_data.")
    if "percentage" not in reference_data:
        raise ValueError("Need to provide percentage column in reference_data.")

    if not set(channels).issubset(reference_data.columns):
        raise ValueError("Provide channel names in reference_data.")

    # resample the reference onto the time step of the provided tracks
    time_scale = reference_data["time"].to_numpy()
    num_time = int(time_scale[-1] / dt)
    if num_time < 1:
        raise ValueError(
            "The reference data must cover at least one timestep dt."
        )
    new_time_scale = np.linspace(0, dt * num_time, num=num_time + 1)

    f_percentage = interpolate.interp1d(
        time_scale, reference_data["percentage"].to_numpy()
    )
    percentage_ref = f_percentage(new_time_scale)

    reference_curves = [
        interpolate.interp1d(time_scale, reference_data[channel].to_numpy())(
            new_time_scale
        )
        for channel in channels
    ]
    series = _prepare_alignment_series(
        reference_curves, channels, smooth, use_zscore_norm, use_derivative
    )

    percentage_column = NewColumns.cell_cycle_dtw()
    # default for all rows; tracks that are too short keep this NaN
    df.loc[:, percentage_column] = np.nan

    for track_id in df[track_id_name].unique():
        # build the row selector once per track and reuse it for all writes
        track_mask = df[track_id_name] == track_id
        track_df = df.loc[track_mask]
        track_length = len(track_df)
        # the algorithm does not work for short tracks
        if track_length < minimum_track_length:
            continue

        # float conversion keeps z-scores from being truncated when the
        # intensity columns are stored as integers
        queries = track_df[channels].to_numpy(dtype=float)
        query = _prepare_alignment_series(
            queries.T, channels, smooth, use_zscore_norm, use_derivative
        )

        sa = subsequence_alignment(query, series, penalty=penalty)
        best_match = sa.best_match()
        path = best_match.path

        # differencing shortens the query by one sample, so one extra slot
        # is needed to cover every row of the track
        if use_derivative:
            new_percentage = np.zeros(query.shape[0] + 1)
        else:
            new_percentage = np.zeros(query.shape[0])
        for query_idx, ref_idx in path:
            new_percentage[query_idx] = percentage_ref[ref_idx]

        # the last row gets the reference value one step after the end of
        # the match, clamped to the end of the reference
        # NOTE(review): this also overwrites the last matched value when
        # use_derivative is False - confirm that this is intended
        last_percentage = min(path[-1][1] + 1, len(percentage_ref) - 1)
        new_percentage[-1] = percentage_ref[last_percentage]

        # save estimated cell cycle percentages
        df.loc[track_mask, percentage_column] = new_percentage[:]
        # save DTW distance
        df.loc[track_mask, NewColumns.dtw_distance()] = best_match.value

        # save DTW distortion (absolute and relative to the track length)
        _, distortion_score, _, _ = get_time_distortion_coefficient(path)
        df.loc[track_mask, NewColumns.dtw_distortion()] = distortion_score
        df.loc[track_mask, NewColumns.dtw_distortion_norm()] = (
            distortion_score / track_length
        )

        # save DTW warping amount (absolute and relative to the track length)
        warping = warping_amount(path)
        df.loc[track_mask, NewColumns.dtw_warping_amount()] = warping
        df.loc[track_mask, NewColumns.rel_dtw_warping_amount()] = (
            warping / track_length
        )