copulas 0.12.4.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,661 @@
1
+ """Base Univariate class."""
2
+
3
+ import pickle
4
+ from abc import ABC
5
+ from enum import Enum
6
+
7
+ import numpy as np
8
+
9
+ from copulas.errors import NotFittedError
10
+ from copulas.univariate.selection import select_univariate
11
+ from copulas.utils import (
12
+ get_instance,
13
+ get_qualified_name,
14
+ random_state,
15
+ store_args,
16
+ validate_random_state,
17
+ )
18
+
19
+
20
class ParametricType(Enum):
    """Flag telling whether a distribution is parametric.

    ``Univariate`` subclasses expose one of these members through their
    ``PARAMETRIC`` class attribute, which is used to filter candidates.
    """

    # Distribution is not described by a fixed set of parameters.
    NON_PARAMETRIC = 0
    # Distribution is described by a fixed set of parameters.
    PARAMETRIC = 1
25
+
26
+
27
class BoundedType(Enum):
    """Flag describing the kind of bounds of a distribution.

    ``Univariate`` subclasses expose one of these members through their
    ``BOUNDED`` class attribute, which is used to filter candidates.
    """

    # No bounds.
    UNBOUNDED = 0
    # Partially bounded.
    SEMI_BOUNDED = 1
    # Fully bounded.
    BOUNDED = 2
33
+
34
+
35
class Univariate:
    """Univariate Distribution.

    When used directly, ``fit`` selects one of the candidates through
    ``select_univariate``, stores it in ``self._instance`` and the distribution
    methods of this class delegate to that selected instance.

    Args:
        candidates (list[str or type or Univariate]):
            List of candidates to select the best univariate from.
            It can be a list of strings representing Univariate FQNs,
            or a list of Univariate subclasses or a list of instances.
            If falsy, the candidates are collected from the subclasses
            of this class using ``parametric`` and ``bounded`` as filters.
        parametric (ParametricType):
            If not ``None``, only select subclasses of this type.
            Ignored if ``candidates`` is passed.
        bounded (BoundedType):
            If not ``None``, only select subclasses of this type.
            Ignored if ``candidates`` is passed.
        random_state (int or np.random.RandomState):
            Random seed or RandomState to use.
        selection_sample_size (int):
            Size of the subsample to use for candidate selection.
            If ``None``, all the data is used.
    """

    PARAMETRIC = ParametricType.NON_PARAMETRIC
    BOUNDED = BoundedType.UNBOUNDED

    # Class-level defaults, shadowed by instance attributes once the model is fitted.
    fitted = False
    _constant_value = None
    _instance = None

    @classmethod
    def _select_candidates(cls, parametric=None, bounded=None):
        """Select which subclasses fulfill the specified constraints.

        The subclass tree is walked recursively. Subclasses that list ``ABC``
        directly among their bases are never selected themselves, but their
        own subclasses are still explored.

        Args:
            parametric (ParametricType):
                If not ``None``, only select subclasses of this type.
            bounded (BoundedType):
                If not ``None``, only select subclasses of this type.

        Returns:
            list:
                Selected subclasses.
        """
        candidates = []
        for subclass in cls.__subclasses__():
            # Recurse first, so descendants come before the subclass itself.
            candidates.extend(subclass._select_candidates(parametric, bounded))
            if ABC in subclass.__bases__:
                continue
            if parametric is not None and subclass.PARAMETRIC != parametric:
                continue
            if bounded is not None and subclass.BOUNDED != bounded:
                continue

            candidates.append(subclass)

        return candidates

    @store_args
    def __init__(
        self,
        candidates=None,
        parametric=None,
        bounded=None,
        random_state=None,
        selection_sample_size=None,
    ):
        # An empty or missing candidate list falls back to automatic selection.
        self.candidates = candidates or self._select_candidates(parametric, bounded)
        self.random_state = validate_random_state(random_state)
        self.selection_sample_size = selection_sample_size

    @classmethod
    def __repr__(cls):
        """Return class name."""
        return cls.__name__

    def check_fit(self):
        """Check whether this model has already been fit to a random variable.

        Raise a ``NotFittedError`` if it has not.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        if not self.fitted:
            raise NotFittedError('This model is not fitted.')

    def _constant_sample(self, num_samples=1):
        """Sample values for a constant distribution.

        This method replaces ``sample`` on constant distributions, so it
        mirrors its default of one sample.

        Args:
            num_samples (int):
                Number of rows to sample. Defaults to 1.

        Returns:
            numpy.ndarray:
                Sampled values. Array of shape (num_samples,).
        """
        return np.full(num_samples, self._constant_value)

    def _constant_cumulative_distribution(self, X):
        """Cumulative distribution for the degenerate case of constant distribution.

        The output is 0 for the points below the constant value and 1 elsewhere.
        More information can be found here: https://en.wikipedia.org/wiki/Degenerate_distribution

        Arguments:
            X (numpy.ndarray):
                Values for which the cumulative distribution will be computed.

        Returns:
            numpy.ndarray:
                Cumulative distribution values for points in X, with the shape of X.
        """
        result = np.ones(X.shape)
        result[np.nonzero(X < self._constant_value)] = 0

        return result

    def _constant_probability_density(self, X):
        """Probability density for the degenerate case of constant distribution.

        The output is 1 for the points equal to the constant value and 0 elsewhere.
        More information can be found here: https://en.wikipedia.org/wiki/Degenerate_distribution

        Arguments:
            X (numpy.ndarray):
                Values for which the probability density will be computed.

        Returns:
            numpy.ndarray:
                Probability density values for points in X, with the shape of X.
        """
        result = np.zeros(X.shape)
        result[np.nonzero(X == self._constant_value)] = 1

        return result

    def _constant_percent_point(self, X):
        """Percent point for the degenerate case of constant distribution.

        Every entry of the output is ``self._constant_value``; the input values
        are only used for their shape.
        More information can be found here: https://en.wikipedia.org/wiki/Degenerate_distribution

        Arguments:
            X (numpy.ndarray):
                Values for which the inverse cumulative distribution will be computed.

        Returns:
            numpy.ndarray:
                Array with the shape of X filled with the constant value.
        """
        return np.full(X.shape, self._constant_value)

    def _replace_constant_methods(self):
        """Replace conventional distribution methods by its constant counterparts."""
        # Instance attributes shadow the methods defined on the class.
        self.cumulative_distribution = self._constant_cumulative_distribution
        self.percent_point = self._constant_percent_point
        self.probability_density = self._constant_probability_density
        self.sample = self._constant_sample

    def _set_constant_value(self, constant_value):
        """Set the distribution up to behave as a degenerate distribution.

        The constant value is stored as ``self._constant_value`` and all
        the methods are replaced by their degenerate counterparts.

        Args:
            constant_value (float):
                Value to set as the constant one.
        """
        self._constant_value = constant_value
        self._replace_constant_methods()

    def _check_constant_value(self, X):
        """Check if a Series or array contains only one unique value.

        If it contains only one value, set the instance up to behave accordingly.

        Args:
            X (numpy.ndarray):
                Data to analyze.

        Returns:
            bool:
                Whether the input data had only one value or not.
        """
        uniques = np.unique(X)
        if len(uniques) == 1:
            self._set_constant_value(uniques[0])

            return True

        return False

    def fit(self, X):
        """Fit the model to a random variable.

        The best candidate is selected on ``X``, or on a random subsample of
        it when ``selection_sample_size`` is set and smaller than ``len(X)``,
        and the selected instance is then fitted on the whole ``X``.

        Arguments:
            X (numpy.ndarray):
                Values of the random variable.
        """
        if self.selection_sample_size and self.selection_sample_size < len(X):
            # NOTE(review): np.random.choice only accepts 1-D input, draws with
            # replacement by default and uses the module-level NumPy RNG; this
            # method does not reference self.random_state. Confirm this is intended.
            selection_sample = np.random.choice(X, size=self.selection_sample_size)
        else:
            selection_sample = X

        self._instance = select_univariate(selection_sample, self.candidates)
        self._instance.fit(X)

        self.fitted = True

    def probability_density(self, X):
        """Compute the probability density for each point in X.

        Arguments:
            X (numpy.ndarray):
                Values for which the probability density will be computed.
                It must have shape (n, 1).

        Returns:
            numpy.ndarray:
                Probability density values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        return self._instance.probability_density(X)

    def log_probability_density(self, X):
        """Compute the log of the probability density for each point in X.

        It should be overridden with numerically stable variants whenever possible.

        Arguments:
            X (numpy.ndarray):
                Values for which the log probability density will be computed.
                It must have shape (n, 1).

        Returns:
            numpy.ndarray:
                Log probability density values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        # Delegate when a selected instance exists; otherwise fall back to the
        # plain logarithm of this object's own density.
        if self._instance is not None:
            return self._instance.log_probability_density(X)

        return np.log(self.probability_density(X))

    def pdf(self, X):
        """Compute the probability density for each point in X.

        Alias of ``probability_density``.

        Arguments:
            X (numpy.ndarray):
                Values for which the probability density will be computed.
                It must have shape (n, 1).

        Returns:
            numpy.ndarray:
                Probability density values for points in X.
        """
        return self.probability_density(X)

    def cumulative_distribution(self, X):
        """Compute the cumulative distribution value for each point in X.

        Arguments:
            X (numpy.ndarray):
                Values for which the cumulative distribution will be computed.
                It must have shape (n, 1).

        Returns:
            numpy.ndarray:
                Cumulative distribution values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        return self._instance.cumulative_distribution(X)

    def cdf(self, X):
        """Compute the cumulative distribution value for each point in X.

        Alias of ``cumulative_distribution``.

        Arguments:
            X (numpy.ndarray):
                Values for which the cumulative distribution will be computed.
                It must have shape (n, 1).

        Returns:
            numpy.ndarray:
                Cumulative distribution values for points in X.
        """
        return self.cumulative_distribution(X)

    def percent_point(self, U):
        """Compute the inverse cumulative distribution value for each point in U.

        Arguments:
            U (numpy.ndarray):
                Values for which the inverse cumulative distribution will be computed.
                It must have shape (n, 1) and values must be in [0,1].

        Returns:
            numpy.ndarray:
                Inverse cumulative distribution values for points in U.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        return self._instance.percent_point(U)

    def ppf(self, U):
        """Compute the inverse cumulative distribution value for each point in U.

        Alias of ``percent_point``.

        Arguments:
            U (numpy.ndarray):
                Values for which the inverse cumulative distribution will be computed.
                It must have shape (n, 1) and values must be in [0,1].

        Returns:
            numpy.ndarray:
                Inverse cumulative distribution values for points in U.
        """
        return self.percent_point(U)

    def set_random_state(self, random_state):
        """Set the random state.

        Args:
            random_state (int, np.random.RandomState, or None):
                Seed or RandomState for the random generator.
        """
        self.random_state = validate_random_state(random_state)

    def sample(self, n_samples=1):
        """Sample values from this model.

        Argument:
            n_samples (int):
                Number of values to sample

        Returns:
            numpy.ndarray:
                Array of shape (n_samples, 1) with values randomly
                sampled from this model distribution.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        return self._instance.sample(n_samples)

    def _get_params(self):
        """Return the parameters of the selected instance to serialize.

        Returns:
            dict:
                Parameters of the underlying distribution.
        """
        return self._instance._get_params()

    def _set_params(self, params):
        """Set the parameters of this univariate.

        Must be implemented in all the subclasses.

        Args:
            params (dict):
                Parameters to recreate this instance.

        Raises:
            NotImplementedError:
                always, in this base implementation.
        """
        raise NotImplementedError()

    def to_dict(self):
        """Return the parameters of this model in a dict.

        Returns:
            dict:
                Dictionary containing the distribution type and all
                the parameters that define the distribution.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()

        params = self._get_params()
        if self.__class__ is Univariate:
            # The base class only wraps the selected instance, so the
            # serialized type is the one of that instance.
            params['type'] = get_qualified_name(self._instance)
        else:
            params['type'] = get_qualified_name(self)

        return params

    @classmethod
    def from_dict(cls, params):
        """Build a distribution from its params dict.

        Args:
            params (dict):
                Dictionary containing the FQN of the distribution and the
                necessary parameters to rebuild it.
                The input format is exactly the same that is outputted by
                the distribution class ``to_dict`` method.

        Returns:
            Univariate:
                Distribution instance.
        """
        # Work on a copy so that popping ``type`` does not alter the caller's dict.
        params = params.copy()
        distribution = get_instance(params.pop('type'))
        distribution._set_params(params)
        distribution.fitted = True

        return distribution

    def save(self, path):
        """Serialize this univariate instance using pickle.

        Args:
            path (str):
                Path to where this distribution will be serialized.
        """
        with open(path, 'wb') as pickle_file:
            pickle.dump(self, pickle_file)

    @classmethod
    def load(cls, path):
        """Load a Univariate instance from a pickle file.

        Args:
            path (str):
                Path to the pickle file where the distribution has been serialized.

        Returns:
            Univariate:
                Loaded instance.
        """
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(path, 'rb') as pickle_file:
            return pickle.load(pickle_file)
489
+
490
+
491
class ScipyModel(Univariate, ABC):
    """Base wrapper around a scipy distribution.

    ``probability_density``, ``cumulative_distribution``, ``percent_point`` and
    ``sample`` forward to the ``pdf``, ``cdf``, ``ppf`` and ``rvs`` methods of
    ``MODEL_CLASS`` respectively, passing the fitted ``_params`` as keyword
    arguments.

    ``_fit``, ``_fit_constant``, ``_is_constant`` and ``_extract_constant`` are
    called by this class but not implemented here, so concrete subclasses are
    expected to provide them.
    """

    # scipy distribution object wrapped by the subclass.
    MODEL_CLASS = None

    # Keyword arguments passed to the MODEL_CLASS methods once fitted.
    _params = None

    def __init__(self, random_state=None):
        """Initialize Scipy model.

        Replaces the ``Univariate`` initializer, so no candidates are set up.

        Args:
            random_state (int, np.random.RandomState, or None): seed
                or RandomState for random generator.
        """
        self.random_state = validate_random_state(random_state)

    def probability_density(self, X):
        """Evaluate the wrapped distribution ``pdf`` on X.

        Arguments:
            X (numpy.ndarray):
                Points at which the probability density is evaluated.
                It must have shape (n, 1).

        Returns:
            numpy.ndarray:
                Probability density values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        scipy_model = self.MODEL_CLASS
        return scipy_model.pdf(X, **self._params)

    def log_probability_density(self, X):
        """Evaluate the log of the probability density on X.

        The ``logpdf`` method of the wrapped distribution is used when it
        exists; otherwise the logarithm of ``probability_density`` is taken.

        Arguments:
            X (numpy.ndarray):
                Points at which the log probability density is evaluated.
                It must have shape (n, 1).

        Returns:
            numpy.ndarray:
                Log probability density values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        scipy_model = self.MODEL_CLASS
        if not hasattr(scipy_model, 'logpdf'):
            return np.log(self.probability_density(X))

        return scipy_model.logpdf(X, **self._params)

    def cumulative_distribution(self, X):
        """Evaluate the wrapped distribution ``cdf`` on X.

        Arguments:
            X (numpy.ndarray):
                Points at which the cumulative distribution is evaluated.
                It must have shape (n, 1).

        Returns:
            numpy.ndarray:
                Cumulative distribution values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        scipy_model = self.MODEL_CLASS
        return scipy_model.cdf(X, **self._params)

    def percent_point(self, U):
        """Evaluate the wrapped distribution ``ppf`` on U.

        Arguments:
            U (numpy.ndarray):
                Points at which the inverse cumulative distribution is evaluated.
                It must have shape (n, 1) and values must be in [0,1].

        Returns:
            numpy.ndarray:
                Inverse cumulative distribution values for points in U.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        scipy_model = self.MODEL_CLASS
        return scipy_model.ppf(U, **self._params)

    @random_state
    def sample(self, n_samples=1):
        """Draw random values through the wrapped distribution ``rvs``.

        Argument:
            n_samples (int):
                Number of values to draw.

        Returns:
            numpy.ndarray:
                Values randomly sampled from this model distribution,
                as returned by ``rvs(size=n_samples)``.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        scipy_model = self.MODEL_CLASS
        return scipy_model.rvs(size=n_samples, **self._params)

    def _fit(self, X):
        """Fit the model to a non-constant random variable.

        Hook to be implemented by every subclass.

        Arguments:
            X (numpy.ndarray):
                Values of the random variable. It must have shape (n, 1).

        Raises:
            NotImplementedError:
                always, in this base implementation.
        """
        raise NotImplementedError()

    def fit(self, X):
        """Fit the model to a random variable.

        Data holding a single unique value is handled by ``_fit_constant``;
        any other data is handled by ``_fit``.

        Arguments:
            X (numpy.ndarray):
                Values of the random variable. It must have shape (n, 1).
        """
        # The constant check runs first; it also swaps in the constant methods.
        fit_method = self._fit_constant if self._check_constant_value(X) else self._fit
        fit_method(X)

        self.fitted = True

    def _get_params(self):
        """Return the parameters needed to serialize this model.

        Returns:
            dict:
                Copy of the parameters stored in ``self._params``.
        """
        return self._params.copy()

    def _set_params(self, params):
        """Set the parameters of this univariate.

        A copy of ``params`` is stored. If those parameters describe a
        constant distribution, the instance is also switched to its
        constant behavior.

        Args:
            params (dict):
                Parameters to recreate this instance.
        """
        self._params = params.copy()
        if not self._is_constant():
            return

        self._set_constant_value(self._extract_constant())
@@ -0,0 +1,48 @@
1
+ """BetaUnivariate module."""
2
+
3
+ import numpy as np
4
+ from scipy.stats import beta
5
+
6
+ from copulas.univariate.base import BoundedType, ParametricType, ScipyModel
7
+ from copulas.utils import EPSILON
8
+
9
+
10
class BetaUnivariate(ScipyModel):
    """Beta distribution backed by ``scipy.stats.beta``.

    Documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.beta.html
    """

    PARAMETRIC = ParametricType.PARAMETRIC
    BOUNDED = BoundedType.BOUNDED
    MODEL_CLASS = beta

    def _fit_constant(self, X):
        """Store the parameters that describe constant data.

        The single unique value of X becomes ``loc`` and ``scale`` is zero.
        """
        constant = np.unique(X)[0]
        self._params = {
            'a': 1.0,
            'b': 1.0,
            'loc': constant,
            'scale': 0.0,
        }

    def _fit(self, X):
        """Fit the beta parameters to non-constant data.

        The optimization starts from the data minimum as ``loc`` and the
        data range as ``scale``.

        Raises:
            ValueError:
                if the fitted support lies outside the data range, or if
                the fitted ``scale`` is below ``EPSILON``.
        """
        lower = np.min(X)
        upper = np.max(X)
        a, b, loc, scale = beta.fit(X, loc=lower, scale=upper - lower)

        # Fitted support [loc, loc + scale] must overlap the observed range.
        if loc > upper or scale + loc < lower:
            raise ValueError(
                'Converged parameters for beta distribution are '
                'outside the min/max range of the data.'
            )

        if scale < EPSILON:
            raise ValueError('Converged parameters for beta distribution have a near-zero range.')

        self._params = {'loc': loc, 'scale': scale, 'a': a, 'b': b}

    def _is_constant(self):
        """Tell whether the stored parameters have a zero ``scale``."""
        return self._params['scale'] == 0

    def _extract_constant(self):
        """Return the stored ``loc``, used as the constant value."""
        return self._params['loc']