dclab 0.62.16__cp39-cp39-macosx_11_0_arm64.whl → 0.63.0__cp39-cp39-macosx_11_0_arm64.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dclab might be problematic. Click here for more details.

dclab/kde_methods.py CHANGED
@@ -1,303 +1,11 @@
1
- """Kernel Density Estimation methods"""
1
+ import warnings
2
2
 
3
- import numpy as np
4
- from scipy.interpolate import RectBivariateSpline
5
- from scipy.stats import gaussian_kde, skew
3
+ from .kde.methods import ( # noqa: F401
4
+ bin_num_doane, bin_width_doane, bin_width_percentile, get_bad_vals,
5
+ ignore_nan_inf, kde_gauss, kde_histogram, kde_multivariate, kde_none,
6
+ methods
7
+ )
6
8
 
7
- from .cached import Cache
8
- from .external.statsmodels.nonparametric.kernel_density import KDEMultivariate
9
-
10
-
11
- def bin_num_doane(a):
12
- """Compute number of bins based on Doane's formula
13
-
14
- Notes
15
- -----
16
- If the bin width cannot be determined, then a bin
17
- number of 5 is returned.
18
-
19
- See Also
20
- --------
21
- bin_width_doane: method used to compute the bin width
22
- """
23
- bad = np.isnan(a) | np.isinf(a)
24
- data = a[~bad]
25
- acc = bin_width_doane(a)
26
- if acc == 0 or np.isnan(acc):
27
- num = 5
28
- else:
29
- num = int(np.round((data.max() - data.min()) / acc))
30
- return num
31
-
32
-
33
- def bin_width_doane(a):
34
- """Compute contour spacing based on Doane's formula
35
-
36
- References
37
- ----------
38
- - `<https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width>`_
39
- - `<https://stats.stackexchange.com/questions/55134/
40
- doanes-formula-for-histogram-binning>`_
41
-
42
- Notes
43
- -----
44
- Doane's formula is actually designed for histograms. This
45
- function is kept here for backwards-compatibility reasons.
46
- It is highly recommended to use :func:`bin_width_percentile`
47
- instead.
48
- """
49
- bad = np.isnan(a) | np.isinf(a)
50
- data = a[~bad]
51
- n = data.size
52
- g1 = skew(data)
53
- sigma_g1 = np.sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))
54
- k = 1 + np.log2(n) + np.log2(1 + np.abs(g1) / sigma_g1)
55
- acc = (data.max() - data.min()) / k
56
- return acc
57
-
58
-
59
- def bin_width_percentile(a):
60
- """Compute contour spacing based on data percentiles
61
-
62
- The 10th and the 90th percentile of the input data are taken.
63
- The spacing then computes to the difference between those
64
- two percentiles divided by 23.
65
-
66
- Notes
67
- -----
68
- The Freedman–Diaconis rule uses the interquartile range and
69
- normalizes to the third root of len(a). Such things do not
70
- work very well for RT-DC data, because len(a) is huge. Here
71
- we use just the top and bottom 10th percentiles with a fixed
72
- normalization.
73
- """
74
- bad = np.isnan(a) | np.isinf(a)
75
- data = a[~bad]
76
- start = np.percentile(data, 10)
77
- end = np.percentile(data, 90)
78
- acc = (end - start) / 23
79
- return acc
80
-
81
-
82
- def get_bad_vals(x, y):
83
- return np.isnan(x) | np.isinf(x) | np.isnan(y) | np.isinf(y)
84
-
85
-
86
- def ignore_nan_inf(kde_method):
87
- """Ignores nans and infs from the input data
88
-
89
- Invalid positions in the resulting density are set to nan.
90
- """
91
- def new_kde_method(events_x, events_y, xout=None, yout=None,
92
- *args, **kwargs):
93
- bad_in = get_bad_vals(events_x, events_y)
94
- if xout is None:
95
- density = np.zeros_like(events_x, dtype=np.float64)
96
- bad_out = bad_in
97
- xo = yo = None
98
- else:
99
- density = np.zeros_like(xout, dtype=np.float64)
100
- bad_out = get_bad_vals(xout, yout)
101
- xo = xout[~bad_out]
102
- yo = yout[~bad_out]
103
- # Filter events
104
- ev_x = events_x[~bad_in]
105
- ev_y = events_y[~bad_in]
106
- density[~bad_out] = kde_method(ev_x, ev_y,
107
- xo, yo,
108
- *args, **kwargs)
109
- density[bad_out] = np.nan
110
- return density
111
-
112
- doc_add = "\n Notes\n" +\
113
- " -----\n" +\
114
- " This is a wrapped version that ignores nan and inf values."
115
- new_kde_method.__doc__ = kde_method.__doc__ + doc_add
116
-
117
- return new_kde_method
118
-
119
-
120
- @ignore_nan_inf
121
- @Cache
122
- def kde_gauss(events_x, events_y, xout=None, yout=None):
123
- """ Gaussian Kernel Density Estimation
124
-
125
- Parameters
126
- ----------
127
- events_x, events_y: 1D ndarray
128
- The input points for kernel density estimation. Input
129
- is flattened automatically.
130
- xout, yout: ndarray
131
- The coordinates at which the KDE should be computed.
132
- If set to none, input coordinates are used.
133
-
134
- Returns
135
- -------
136
- density: ndarray, same shape as `xout`
137
- The KDE for the points in (xout, yout)
138
-
139
- See Also
140
- --------
141
- `scipy.stats.gaussian_kde`
142
- """
143
- valid_combi = ((xout is None and yout is None) or
144
- (xout is not None and yout is not None)
145
- )
146
- if not valid_combi:
147
- raise ValueError("Both `xout` and `yout` must be (un)set.")
148
-
149
- if xout is None and yout is None:
150
- xout = events_x
151
- yout = events_y
152
-
153
- try:
154
- estimator = gaussian_kde([events_x.flatten(), events_y.flatten()])
155
- density = estimator.evaluate([xout.flatten(), yout.flatten()])
156
- except np.linalg.LinAlgError:
157
- # LinAlgError occurs when matrix to solve is singular (issue #117)
158
- density = np.zeros(xout.shape)*np.nan
159
- return density.reshape(xout.shape)
160
-
161
-
162
- @ignore_nan_inf
163
- @Cache
164
- def kde_histogram(events_x, events_y, xout=None, yout=None, bins=None):
165
- """ Histogram-based Kernel Density Estimation
166
-
167
- Parameters
168
- ----------
169
- events_x, events_y: 1D ndarray
170
- The input points for kernel density estimation. Input
171
- is flattened automatically.
172
- xout, yout: ndarray
173
- The coordinates at which the KDE should be computed.
174
- If set to none, input coordinates are used.
175
- bins: tuple (binsx, binsy)
176
- The number of bins to use for the histogram.
177
-
178
- Returns
179
- -------
180
- density: ndarray, same shape as `xout`
181
- The KDE for the points in (xout, yout)
182
-
183
- See Also
184
- --------
185
- `numpy.histogram2d`
186
- `scipy.interpolate.RectBivariateSpline`
187
- """
188
- valid_combi = ((xout is None and yout is None) or
189
- (xout is not None and yout is not None)
190
- )
191
- if not valid_combi:
192
- raise ValueError("Both `xout` and `yout` must be (un)set.")
193
-
194
- if xout is None and yout is None:
195
- xout = events_x
196
- yout = events_y
197
-
198
- if bins is None:
199
- bins = (max(5, bin_num_doane(events_x)),
200
- max(5, bin_num_doane(events_y)))
201
-
202
- # Compute the histogram
203
- hist2d, xedges, yedges = np.histogram2d(x=events_x,
204
- y=events_y,
205
- bins=bins,
206
- density=True)
207
- xip = xedges[1:]-(xedges[1]-xedges[0])/2
208
- yip = yedges[1:]-(yedges[1]-yedges[0])/2
209
-
210
- estimator = RectBivariateSpline(x=xip, y=yip, z=hist2d)
211
- density = estimator.ev(xout, yout)
212
- density[density < 0] = 0
213
-
214
- return density.reshape(xout.shape)
215
-
216
-
217
- def kde_none(events_x, events_y, xout=None, yout=None):
218
- """No Kernel Density Estimation
219
-
220
- Parameters
221
- ----------
222
- events_x, events_y: 1D ndarray
223
- The input points for kernel density estimation. Input
224
- is flattened automatically.
225
- xout, yout: ndarray
226
- The coordinates at which the KDE should be computed.
227
- If set to none, input coordinates are used.
228
-
229
- Returns
230
- -------
231
- density: ndarray, same shape as `xout`
232
- The KDE for the points in (xout, yout)
233
-
234
- Notes
235
- -----
236
- This method is a convenience method that always returns ones in the shape
237
- that the other methods in this module produce.
238
- """
239
- valid_combi = ((xout is None and yout is None) or
240
- (xout is not None and yout is not None)
241
- )
242
- if not valid_combi:
243
- raise ValueError("Both `xout` and `yout` must be (un)set.")
244
-
245
- if xout is None and yout is None:
246
- xout = events_x
247
- _ = events_y
248
-
249
- return np.ones(xout.shape)
250
-
251
-
252
- @ignore_nan_inf
253
- @Cache
254
- def kde_multivariate(events_x, events_y, xout=None, yout=None, bw=None):
255
- """ Multivariate Kernel Density Estimation
256
-
257
- Parameters
258
- ----------
259
- events_x, events_y: 1D ndarray
260
- The input points for kernel density estimation. Input
261
- is flattened automatically.
262
- bw: tuple (bwx, bwy) or None
263
- The bandwith for kernel density estimation.
264
- xout, yout: ndarray
265
- The coordinates at which the KDE should be computed.
266
- If set to none, input coordinates are used.
267
-
268
- Returns
269
- -------
270
- density: ndarray, same shape as `xout`
271
- The KDE for the points in (xout, yout)
272
-
273
- See Also
274
- --------
275
- `statsmodels.nonparametric.kernel_density.KDEMultivariate`
276
- """
277
- valid_combi = ((xout is None and yout is None) or
278
- (xout is not None and yout is not None)
279
- )
280
- if not valid_combi:
281
- raise ValueError("Both `xout` and `yout` must be (un)set.")
282
-
283
- if xout is None and yout is None:
284
- xout = events_x
285
- yout = events_y
286
- if bw is None:
287
- # divide by 2 to make it comparable to histogram KDE
288
- bw = (bin_width_doane(events_x) / 2,
289
- bin_width_doane(events_y) / 2)
290
-
291
- positions = np.vstack([xout.flatten(), yout.flatten()])
292
- estimator_ly = KDEMultivariate(data=[events_x.flatten(),
293
- events_y.flatten()],
294
- var_type='cc', bw=bw)
295
-
296
- density = estimator_ly.pdf(positions)
297
- return density.reshape(xout.shape)
298
-
299
-
300
- methods = {"gauss": kde_gauss,
301
- "histogram": kde_histogram,
302
- "none": kde_none,
303
- "multivariate": kde_multivariate}
9
+ warnings.warn("`dclab.kde_methods` is deprecated; please use "
10
+ "the `dclab.kde.methods` instead",
11
+ DeprecationWarning)
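[Note: the file header for the following hunks is missing from this listing. They do not belong to dclab/kde_methods.py (whose new version is only 11 lines long) but to a different file, the module that defines `RTDCBase`.]
@@ -4,23 +4,23 @@ import hashlib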
4
4
  import json
5
5
  import os.path
6
6
  import pathlib
7
+ import random
7
8
  import traceback
8
9
  from typing import Literal
9
10
  import uuid
10
- import random
11
11
  import warnings
12
12
 
13
13
  import numpy as np
14
14
 
15
15
  from .. import definitions as dfn
16
16
  from .. import downsampling
17
+ from ..kde import KernelDensityEstimator
18
+ from ..kde import methods as kde_methods
17
19
  from ..polygon_filter import PolygonFilter
18
- from .. import kde_methods
19
20
  from ..util import hashobj
20
-
21
- from .feat_anc_core import AncillaryFeature, FEATURES_RAPID
22
21
  from . import feat_basin
23
22
  from .export import Export
23
+ from .feat_anc_core import FEATURES_RAPID, AncillaryFeature
24
24
  from .filter import Filter
25
25
 
26
26
 
@@ -28,6 +28,10 @@ class FeatureShouldExistButNotFoundWarning(UserWarning):
28
28
  pass
29
29
 
30
30
 
31
+ class LocalBasinForbiddenWarning(UserWarning):
32
+ pass
33
+
34
+
31
35
  class LogTransformWarning(UserWarning):
32
36
  pass
33
37
 
@@ -322,47 +326,6 @@ class RTDCBase(abc.ABC):
322
326
  pass
323
327
  return data
324
328
 
325
- @staticmethod
326
- def _apply_scale(a, scale, feat):
327
- """Helper function for transforming an aray to log-scale
328
-
329
- Parameters
330
- ----------
331
- a: np.ndarray
332
- Input array
333
- scale: str
334
- If set to "log", take the logarithm of `a`; if set to
335
- "linear" return `a` unchanged.
336
- feat: str
337
- Feature name (required for debugging)
338
-
339
- Returns
340
- -------
341
- b: np.ndarray
342
- The scaled array
343
-
344
- Notes
345
- -----
346
- If the scale is not "linear", then a new array is returned.
347
- All warnings are suppressed when computing `np.log(a)`, as
348
- `a` may have negative or nan values.
349
- """
350
- if scale == "linear":
351
- b = a
352
- elif scale == "log":
353
- with warnings.catch_warnings(record=True) as w:
354
- warnings.simplefilter("always")
355
- b = np.log(a)
356
- if len(w):
357
- # Tell the user that the log-transformation issued
358
- # a warning.
359
- warnings.warn("Invalid values encounterd in np.log "
360
- "while scaling feature '{}'!".format(feat))
361
- else:
362
- raise ValueError("`scale` must be either 'linear' or 'log', "
363
- + "got '{}'!".format(scale))
364
- return b
365
-
366
329
  @staticmethod
367
330
  def get_kde_spacing(a, scale="linear", method=kde_methods.bin_width_doane,
368
331
  method_kw=None, feat="undefined", ret_scaled=False):
@@ -383,16 +346,14 @@ class RTDCBase(abc.ABC):
383
346
  ret_scaled: bool
384
347
  whether to return the scaled array of `a`
385
348
  """
386
- if method_kw is None:
387
- method_kw = {}
388
- # Apply scale (no change for linear scale)
389
- asc = RTDCBase._apply_scale(a, scale, feat)
390
- # Apply multiplicator
391
- acc = method(asc, **method_kw)
392
- if ret_scaled:
393
- return acc, asc
394
- else:
395
- return acc
349
+ return KernelDensityEstimator.get_spacing(
350
+ a=a,
351
+ scale=scale,
352
+ method=method,
353
+ method_kw=method_kw,
354
+ feat=feat,
355
+ ret_scaled=ret_scaled,
356
+ )
396
357
 
397
358
  @property
398
359
  def _feature_candidates(self):
@@ -625,8 +586,8 @@ class RTDCBase(abc.ABC):
625
586
  y = self[yax][self.filter.all]
626
587
 
627
588
  # Apply scale (no change for linear scale)
628
- xs = RTDCBase._apply_scale(x, xscale, xax)
629
- ys = RTDCBase._apply_scale(y, yscale, yax)
589
+ xs = KernelDensityEstimator.apply_scale(x, xscale, xax)
590
+ ys = KernelDensityEstimator.apply_scale(y, yscale, yax)
630
591
 
631
592
  _, _, idx = downsampling.downsample_grid(xs, ys,
632
593
  samples=downsample,
@@ -673,64 +634,11 @@ class RTDCBase(abc.ABC):
673
634
  X, Y, Z : coordinates
674
635
  The kernel density Z evaluated on a rectangular grid (X,Y).
675
636
  """
676
- if kde_kwargs is None:
677
- kde_kwargs = {}
678
- xax = xax.lower()
679
- yax = yax.lower()
680
- kde_type = kde_type.lower()
681
- if kde_type not in kde_methods.methods:
682
- raise ValueError("Not a valid kde type: {}!".format(kde_type))
683
-
684
- # Get data
685
- x = self[xax][self.filter.all]
686
- y = self[yax][self.filter.all]
687
-
688
- xacc_sc, xs = RTDCBase.get_kde_spacing(
689
- a=x,
690
- feat=xax,
691
- scale=xscale,
692
- method=kde_methods.bin_width_doane,
693
- ret_scaled=True)
694
-
695
- yacc_sc, ys = RTDCBase.get_kde_spacing(
696
- a=y,
697
- feat=yax,
698
- scale=yscale,
699
- method=kde_methods.bin_width_doane,
700
- ret_scaled=True)
701
-
702
- if xacc is None or xacc == 0:
703
- xacc = xacc_sc / 5
704
-
705
- if yacc is None or yacc == 0:
706
- yacc = yacc_sc / 5
707
-
708
- # Ignore infs and nans
709
- bad = kde_methods.get_bad_vals(xs, ys)
710
- xc = xs[~bad]
711
- yc = ys[~bad]
712
-
713
- xnum = int(np.ceil((xc.max() - xc.min()) / xacc))
714
- ynum = int(np.ceil((yc.max() - yc.min()) / yacc))
715
-
716
- xlin = np.linspace(xc.min(), xc.max(), xnum, endpoint=True)
717
- ylin = np.linspace(yc.min(), yc.max(), ynum, endpoint=True)
718
-
719
- xmesh, ymesh = np.meshgrid(xlin, ylin, indexing="ij")
720
-
721
- kde_fct = kde_methods.methods[kde_type]
722
- if len(x):
723
- density = kde_fct(events_x=xs, events_y=ys,
724
- xout=xmesh, yout=ymesh,
725
- **kde_kwargs)
726
- else:
727
- density = np.array([])
728
-
729
- # Convert mesh back to linear scale if applicable
730
- if xscale == "log":
731
- xmesh = np.exp(xmesh)
732
- if yscale == "log":
733
- ymesh = np.exp(ymesh)
637
+ kde_instance = KernelDensityEstimator(rtdc_ds=self)
638
+ xmesh, ymesh, density = kde_instance.get_contour(
639
+ xax=xax, yax=yax, xacc=xacc, yacc=yacc, kde_type=kde_type,
640
+ kde_kwargs=kde_kwargs, xscale=xscale, yscale=yscale
641
+ )
734
642
 
735
643
  return xmesh, ymesh, density
736
644
 
@@ -765,36 +673,11 @@ class RTDCBase(abc.ABC):
765
673
  density : 1d ndarray
766
674
  The kernel density evaluated for the filtered data points.
767
675
  """
768
- if kde_kwargs is None:
769
- kde_kwargs = {}
770
- xax = xax.lower()
771
- yax = yax.lower()
772
- kde_type = kde_type.lower()
773
- if kde_type not in kde_methods.methods:
774
- raise ValueError("Not a valid kde type: {}!".format(kde_type))
775
-
776
- # Get data
777
- x = self[xax][self.filter.all]
778
- y = self[yax][self.filter.all]
779
-
780
- # Apply scale (no change for linear scale)
781
- xs = RTDCBase._apply_scale(x, xscale, xax)
782
- ys = RTDCBase._apply_scale(y, yscale, yax)
783
-
784
- if positions is None:
785
- posx = None
786
- posy = None
787
- else:
788
- posx = RTDCBase._apply_scale(positions[0], xscale, xax)
789
- posy = RTDCBase._apply_scale(positions[1], yscale, yax)
790
-
791
- kde_fct = kde_methods.methods[kde_type]
792
- if len(x):
793
- density = kde_fct(events_x=xs, events_y=ys,
794
- xout=posx, yout=posy,
795
- **kde_kwargs)
796
- else:
797
- density = np.array([])
676
+ kde_instance = KernelDensityEstimator(rtdc_ds=self)
677
+ density = kde_instance.get_scatter(
678
+ xax=xax, yax=yax, positions=positions, kde_type=kde_type,
679
+ kde_kwargs=kde_kwargs, xscale=xscale, yscale=yscale
680
+ )
798
681
 
799
682
  return density
800
683
 
@@ -879,7 +762,8 @@ class RTDCBase(abc.ABC):
879
762
  elif bdict["type"] == "file":
880
763
  if not self._local_basins_allowed:
881
764
  warnings.warn(f"Basin type 'file' not allowed for format "
882
- f"'{self.format}'")
765
+ f"'{self.format}'",
766
+ LocalBasinForbiddenWarning)
883
767
  # stop processing this basin
884
768
  continue
885
769
  p_paths = list(bdict["paths"])