copulas 0.12.4.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- copulas/__init__.py +91 -0
- copulas/bivariate/__init__.py +175 -0
- copulas/bivariate/base.py +448 -0
- copulas/bivariate/clayton.py +163 -0
- copulas/bivariate/frank.py +170 -0
- copulas/bivariate/gumbel.py +144 -0
- copulas/bivariate/independence.py +81 -0
- copulas/bivariate/utils.py +19 -0
- copulas/datasets.py +214 -0
- copulas/errors.py +5 -0
- copulas/multivariate/__init__.py +8 -0
- copulas/multivariate/base.py +200 -0
- copulas/multivariate/gaussian.py +345 -0
- copulas/multivariate/tree.py +691 -0
- copulas/multivariate/vine.py +359 -0
- copulas/optimize/__init__.py +154 -0
- copulas/univariate/__init__.py +25 -0
- copulas/univariate/base.py +661 -0
- copulas/univariate/beta.py +48 -0
- copulas/univariate/gamma.py +38 -0
- copulas/univariate/gaussian.py +27 -0
- copulas/univariate/gaussian_kde.py +192 -0
- copulas/univariate/log_laplace.py +38 -0
- copulas/univariate/selection.py +36 -0
- copulas/univariate/student_t.py +31 -0
- copulas/univariate/truncated_gaussian.py +66 -0
- copulas/univariate/uniform.py +27 -0
- copulas/utils.py +248 -0
- copulas/visualization.py +345 -0
- copulas-0.12.4.dev3.dist-info/METADATA +215 -0
- copulas-0.12.4.dev3.dist-info/RECORD +34 -0
- copulas-0.12.4.dev3.dist-info/WHEEL +5 -0
- copulas-0.12.4.dev3.dist-info/licenses/LICENSE +106 -0
- copulas-0.12.4.dev3.dist-info/top_level.txt +1 -0
copulas/datasets.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""Sample datasets for the Copulas library."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from scipy import stats
|
|
6
|
+
|
|
7
|
+
from copulas.utils import set_random_state, validate_random_state
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _dummy_fn(state):
|
|
11
|
+
pass
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def sample_bivariate_age_income(size=1000, seed=42):
    """Generate a two-column toy dataset of simulated ages and incomes.

    ``age`` is drawn from a Beta(2, 6) distribution shifted by 18 and scaled
    by 100. ``income`` is ``100 * log(age)`` plus Gaussian noise (mean
    ``log(age) / 100``, standard deviation 10). Rows selected by a uniform
    integer draw in ``[0, 10)`` being equal to 0 then have their income
    divided by 1000, which produces the outliers.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.DataFrame:
            DataFrame with two columns, ``age`` and ``income``.
    """
    random_state = validate_random_state(seed)
    with set_random_state(random_state, _dummy_fn):
        # The order of the three random draws below determines the output.
        ages = stats.beta.rvs(a=2.0, b=6.0, loc=18, scale=100, size=size)
        log_ages = np.log(ages)
        noise = np.random.normal(loc=log_ages / 100, scale=10, size=size)
        incomes = log_ages * 100
        incomes += noise

        outlier_mask = np.random.randint(0, 10, size=size) == 0
        incomes[outlier_mask] /= 1000

    return pd.DataFrame({'age': ages, 'income': incomes})
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def sample_trivariate_xyz(size=1000, seed=42):
    """Generate a three-column toy dataset.

    The columns of the returned DataFrame are:

    * ``x``: Beta distribution with a=0.1 and b=0.1
    * ``y``: Beta distribution with a=0.1 and b=0.5
    * ``z``: standard normal noise plus 10 times ``y``

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.DataFrame:
            DataFrame with three columns, ``x``, ``y`` and ``z``.
    """
    random_state = validate_random_state(seed)
    with set_random_state(random_state, _dummy_fn):
        # Draw order (x, y, then the noise for z) determines the output.
        x_values = stats.beta.rvs(a=0.1, b=0.1, size=size)
        y_values = stats.beta.rvs(a=0.1, b=0.5, size=size)
        z_values = np.random.normal(size=size) + y_values * 10

    return pd.DataFrame({'x': x_values, 'y': y_values, 'z': z_values})
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def sample_univariate_bernoulli(size=1000, seed=42):
    """Sample from a Bernoulli distribution with p=0.3.

    A uniform random value is drawn for every sample; the sample is ``1.0``
    when that value is below 0.3 and ``0.0`` otherwise.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series of floats (0.0 or 1.0) with the sampled values.
    """
    random_state = validate_random_state(seed)
    with set_random_state(random_state, _dummy_fn):
        uniform_draws = np.random.random(size=size)

    successes = pd.Series(uniform_draws < 0.3)
    return successes.astype(float)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def sample_univariate_bimodal(size=1000, seed=42):
    """Sample from a mixture of two unit-variance Gaussians centered at 0.0 and 10.0.

    A Bernoulli sample (see ``sample_univariate_bernoulli``) picks, row by
    row, between a standard normal draw (where the Bernoulli value is 1) and
    a normal draw with mean ``10`` (where it is 0).

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    random_state = validate_random_state(seed)
    with set_random_state(random_state, _dummy_fn):
        # NOTE(review): the selector is sampled with the same ``seed`` from inside
        # this context; how that interacts with the draws below depends on
        # ``set_random_state`` -- confirm before changing the call order.
        selector = sample_univariate_bernoulli(size, seed)
        around_zero = np.random.normal(size=size) * selector
        around_ten = np.random.normal(size=size, loc=10) * (1.0 - selector)

    return pd.Series(around_zero + around_ten)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def sample_univariate_uniform(size=1000, seed=42):
    """Sample from a uniform distribution between -1.0 and 3.0.

    The values are obtained by scaling ``np.random.random`` draws by 4.0 and
    shifting them by -1.0.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    random_state = validate_random_state(seed)
    with set_random_state(random_state, _dummy_fn):
        unit_draws = np.random.random(size=size)

    return pd.Series(4.0 * unit_draws - 1.0)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def sample_univariate_normal(size=1000, seed=42):
    """Sample from a normal distribution with mean 1 and stdev 1.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    random_state = validate_random_state(seed)
    with set_random_state(random_state, _dummy_fn):
        draws = np.random.normal(size=size, loc=1.0)

    return pd.Series(draws)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def sample_univariate_degenerate(size=1000, seed=42):
    """Sample from a degenerate distribution that only takes one random value.

    A single ``np.random.random`` draw is made and repeated ``size`` times.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values, all equal.
    """
    random_state = validate_random_state(seed)
    with set_random_state(random_state, _dummy_fn):
        constant = np.random.random()

    return pd.Series(np.full(size, constant))
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def sample_univariate_exponential(size=1000, seed=42):
    """Sample from an exponential distribution shifted to start at 3.0.

    The draws come from ``np.random.exponential`` with its default scale and
    are then offset by 3.0.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    random_state = validate_random_state(seed)
    with set_random_state(random_state, _dummy_fn):
        draws = np.random.exponential(size=size)

    return pd.Series(draws + 3.0)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def sample_univariate_beta(size=1000, seed=42):
    """Sample from a beta distribution with a=3 and b=1 and loc=4.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    random_state = validate_random_state(seed)
    with set_random_state(random_state, _dummy_fn):
        draws = stats.beta.rvs(a=3, b=1, loc=4, size=size)

    return pd.Series(draws)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def sample_univariates(size=1000, seed=42):
    """Sample every univariate toy distribution of this module into one table.

    Each column is produced by calling the matching ``sample_univariate_*``
    function with the same ``size`` and ``seed``.

    Args:
        size (int):
            Amount of samples to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.DataFrame:
            DataFrame with the columns ``bernoulli``, ``bimodal``, ``uniform``,
            ``normal``, ``degenerate``, ``exponential`` and ``beta``.
    """
    # Order matters: it fixes both the call order and the column order.
    samplers = (
        ('bernoulli', sample_univariate_bernoulli),
        ('bimodal', sample_univariate_bimodal),
        ('uniform', sample_univariate_uniform),
        ('normal', sample_univariate_normal),
        ('degenerate', sample_univariate_degenerate),
        ('exponential', sample_univariate_exponential),
        ('beta', sample_univariate_beta),
    )
    columns = {name: sampler(size, seed) for name, sampler in samplers}
    return pd.DataFrame(columns)
|
copulas/multivariate/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Multivariate copulas module."""
|
|
2
|
+
|
|
3
|
+
from copulas.multivariate.base import Multivariate
|
|
4
|
+
from copulas.multivariate.gaussian import GaussianMultivariate
|
|
5
|
+
from copulas.multivariate.tree import Tree, TreeTypes
|
|
6
|
+
from copulas.multivariate.vine import VineCopula
|
|
7
|
+
|
|
8
|
+
__all__ = ('Multivariate', 'GaussianMultivariate', 'VineCopula', 'Tree', 'TreeTypes')
|
|
copulas/multivariate/base.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""Base Multivariate class."""
|
|
2
|
+
|
|
3
|
+
import pickle
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from copulas.errors import NotFittedError
|
|
8
|
+
from copulas.utils import get_instance, validate_random_state
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Multivariate(object):
    """Abstract base class for multivariate copula models.

    ``fit``, ``probability_density``, ``cumulative_distribution``, ``sample``
    and ``to_dict`` raise ``NotImplementedError`` here and are meant to be
    provided by subclasses. ``pdf``, ``cdf`` and ``log_probability_density``
    are convenience wrappers built on top of them.
    """

    # Class-level default; ``check_fit`` reads this flag.
    fitted = False

    def __init__(self, random_state=None):
        """Store the random state after passing it through ``validate_random_state``.

        Args:
            random_state (int, np.random.RandomState, or None):
                Seed or RandomState for the random generator.
        """
        validated = validate_random_state(random_state)
        self.random_state = validated

    def fit(self, X):
        """Fit the model to a table with values from multiple random variables.

        Arguments:
            X (pandas.DataFrame):
                Values of the random variables.

        Raises:
            NotImplementedError:
                always, in this base class.
        """
        raise NotImplementedError()

    def probability_density(self, X):
        """Compute the probability density for each point in X.

        Arguments:
            X (pandas.DataFrame):
                Values for which the probability density will be computed.

        Returns:
            numpy.ndarray:
                Probability density values for points in X.

        Raises:
            NotImplementedError:
                always, in this base class. Implementations are documented
                to raise ``NotFittedError`` if the model is not fitted.
        """
        raise NotImplementedError()

    def log_probability_density(self, X):
        """Compute the log of the probability density for each point in X.

        The result is ``numpy.log`` applied to ``probability_density(X)``.

        Arguments:
            X (pandas.DataFrame):
                Values for which the log probability density will be computed.

        Returns:
            numpy.ndarray:
                Log probability density values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted (as documented for
                ``probability_density``).
        """
        density = self.probability_density(X)
        return np.log(density)

    def pdf(self, X):
        """Alias of ``probability_density``.

        Arguments:
            X (pandas.DataFrame):
                Values for which the probability density will be computed.

        Returns:
            numpy.ndarray:
                Probability density values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted (as documented for
                ``probability_density``).
        """
        density = self.probability_density(X)
        return density

    def cumulative_distribution(self, X):
        """Compute the cumulative distribution value for each point in X.

        Arguments:
            X (pandas.DataFrame):
                Values for which the cumulative distribution will be computed.

        Returns:
            numpy.ndarray:
                Cumulative distribution values for points in X.

        Raises:
            NotImplementedError:
                always, in this base class. Implementations are documented
                to raise ``NotFittedError`` if the model is not fitted.
        """
        raise NotImplementedError()

    def cdf(self, X):
        """Alias of ``cumulative_distribution``.

        Arguments:
            X (pandas.DataFrame):
                Values for which the cumulative distribution will be computed.

        Returns:
            numpy.ndarray:
                Cumulative distribution values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted (as documented for
                ``cumulative_distribution``).
        """
        cumulative = self.cumulative_distribution(X)
        return cumulative

    def set_random_state(self, random_state):
        """Replace the stored random state.

        Args:
            random_state (int, np.random.RandomState, or None):
                Seed or RandomState for the random generator.
        """
        validated = validate_random_state(random_state)
        self.random_state = validated

    def sample(self, num_rows=1):
        """Sample values from this model.

        Args:
            num_rows (int):
                Number of rows to sample. Defaults to 1.

        Returns:
            numpy.ndarray:
                Array with one row per requested sample, randomly drawn
                from this model distribution.

        Raises:
            NotImplementedError:
                always, in this base class. Implementations are documented
                to raise ``NotFittedError`` if the model is not fitted.
        """
        raise NotImplementedError()

    def to_dict(self):
        """Return a ``dict`` with the parameters needed to replicate this object.

        Returns:
            dict:
                Parameters of this distribution.

        Raises:
            NotImplementedError:
                always, in this base class.
        """
        raise NotImplementedError()

    @classmethod
    def from_dict(cls, params):
        """Create a new instance from a parameters dictionary.

        ``params['type']`` is resolved through ``get_instance`` and the
        construction is delegated to the ``from_dict`` of the result.

        Args:
            params (dict):
                Parameters of the distribution, in the same format as the one
                returned by the ``to_dict`` method.

        Returns:
            Multivariate:
                Instance of the distribution defined on the parameters.
        """
        target = get_instance(params['type'])
        return target.from_dict(params)

    @classmethod
    def load(cls, path):
        """Load a Multivariate instance from a pickle file.

        Args:
            path (str):
                Path to the pickle file where the distribution has been serialized.

        Returns:
            Multivariate:
                Loaded instance.
        """
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(path, 'rb') as handle:
            instance = pickle.load(handle)

        return instance

    def save(self, path):
        """Serialize this multivariate instance using pickle.

        Args:
            path (str):
                Path to where this distribution will be serialized.
        """
        with open(path, 'wb') as handle:
            pickle.dump(self, handle)

    def check_fit(self):
        """Ensure this model has already been fit to a random variable.

        Returns silently when ``self.fitted`` is truthy.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        if self.fitted:
            return

        raise NotFittedError('This model is not fitted.')
|