copulas 0.12.4.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- copulas/__init__.py +91 -0
- copulas/bivariate/__init__.py +175 -0
- copulas/bivariate/base.py +448 -0
- copulas/bivariate/clayton.py +163 -0
- copulas/bivariate/frank.py +170 -0
- copulas/bivariate/gumbel.py +144 -0
- copulas/bivariate/independence.py +81 -0
- copulas/bivariate/utils.py +19 -0
- copulas/datasets.py +214 -0
- copulas/errors.py +5 -0
- copulas/multivariate/__init__.py +8 -0
- copulas/multivariate/base.py +200 -0
- copulas/multivariate/gaussian.py +345 -0
- copulas/multivariate/tree.py +691 -0
- copulas/multivariate/vine.py +359 -0
- copulas/optimize/__init__.py +154 -0
- copulas/univariate/__init__.py +25 -0
- copulas/univariate/base.py +661 -0
- copulas/univariate/beta.py +48 -0
- copulas/univariate/gamma.py +38 -0
- copulas/univariate/gaussian.py +27 -0
- copulas/univariate/gaussian_kde.py +192 -0
- copulas/univariate/log_laplace.py +38 -0
- copulas/univariate/selection.py +36 -0
- copulas/univariate/student_t.py +31 -0
- copulas/univariate/truncated_gaussian.py +66 -0
- copulas/univariate/uniform.py +27 -0
- copulas/utils.py +248 -0
- copulas/visualization.py +345 -0
- copulas-0.12.4.dev3.dist-info/METADATA +215 -0
- copulas-0.12.4.dev3.dist-info/RECORD +34 -0
- copulas-0.12.4.dev3.dist-info/WHEEL +5 -0
- copulas-0.12.4.dev3.dist-info/licenses/LICENSE +106 -0
- copulas-0.12.4.dev3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
"""GaussianMultivariate module."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from scipy import stats
|
|
9
|
+
|
|
10
|
+
from copulas.multivariate.base import Multivariate
|
|
11
|
+
from copulas.univariate import GaussianUnivariate, Univariate
|
|
12
|
+
from copulas.utils import (
|
|
13
|
+
EPSILON,
|
|
14
|
+
check_valid_values,
|
|
15
|
+
get_instance,
|
|
16
|
+
get_qualified_name,
|
|
17
|
+
random_state,
|
|
18
|
+
store_args,
|
|
19
|
+
validate_random_state,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# Module-level logger, named after this module's import path.
LOGGER = logging.getLogger(__name__)
|
|
23
|
+
# Marginal distribution used when the caller does not name one: it is the default
# value of ``GaussianMultivariate.__init__``'s ``distribution`` argument and the
# fallback for columns missing from a per-column ``distribution`` dict.
DEFAULT_DISTRIBUTION = Univariate
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class GaussianMultivariate(Multivariate):
    """Class for a multivariate distribution that uses the Gaussian copula.

    Each column is modeled by its own univariate (marginal) distribution and the
    dependence between columns is captured by the correlation matrix of the data
    after it has been mapped into standard normal space.

    Args:
        distribution (str, type or dict):
            Fully qualified name of the class (or the class itself) to be used for
            modeling the marginal distributions, or a dictionary mapping column names
            to such distributions. Columns missing from the dictionary use
            ``DEFAULT_DISTRIBUTION``.
        random_state:
            Value passed through ``validate_random_state`` and stored on the instance.
    """

    # Correlation matrix in normal space (pandas.DataFrame indexed by column name);
    # populated by ``fit`` or ``from_dict``.
    correlation = None
    # Ordered list of the fitted column names.
    columns = None
    # Fitted univariate instances, in the same order as ``columns``.
    univariates = None

    @store_args
    def __init__(self, distribution=DEFAULT_DISTRIBUTION, random_state=None):
        self.random_state = validate_random_state(random_state)
        self.distribution = distribution

    def __repr__(self):
        """Produce printable representation of the object."""
        if self.distribution == DEFAULT_DISTRIBUTION:
            distribution = ''
        elif isinstance(self.distribution, type):
            distribution = f'distribution="{self.distribution.__name__}"'
        else:
            distribution = f'distribution="{self.distribution}"'

        return f'GaussianMultivariate({distribution})'

    def _transform_to_normal(self, X):
        """Map values from the original space into standard normal space.

        Each fitted column present in ``X`` is passed through its marginal CDF,
        clipped away from 0 and 1, and then through the standard normal PPF.

        Args:
            X (pandas.DataFrame, pandas.Series or array-like):
                Values to transform. A Series is treated as a single row whose index
                holds the column names. Other array-likes are interpreted
                positionally using the fitted column names; a 1-D input is treated
                as a single row.

        Returns:
            numpy.ndarray:
                2-D array with one column per fitted column found in ``X``, ordered
                as in ``self.columns``.

        Raises:
            ValueError:
                If none of the fitted columns is present in ``X``.
        """
        if isinstance(X, pd.Series):
            X = X.to_frame().T
        elif not isinstance(X, pd.DataFrame):
            # Coerce first so plain lists/tuples (which have no ``shape``) also work.
            X = np.asarray(X)
            if X.ndim == 1:
                X = X.reshape(1, -1)

            X = pd.DataFrame(X, columns=self.columns)

        U = []
        for column_name, univariate in zip(self.columns, self.univariates):
            if column_name in X:
                column = X[column_name]
                # Clip so that the normal PPF below never returns +/- infinity.
                U.append(univariate.cdf(column.to_numpy()).clip(EPSILON, 1 - EPSILON))

        if not U:
            raise ValueError('None of the fitted columns are present in the given data.')

        return stats.norm.ppf(np.column_stack(U))

    @check_valid_values
    def fit(self, X):
        """Compute the distribution for each variable and then its correlation matrix.

        Arguments:
            X (pandas.DataFrame):
                Values of the random variables. Anything that is not a DataFrame
                is wrapped in one.
        """
        LOGGER.info('Fitting %s', self)

        # Validate the input data
        X = self._validate_input(X)
        columns, univariates = self._fit_columns(X)

        self.columns = columns
        self.univariates = univariates

        LOGGER.debug('Computing correlation.')
        self.correlation = self._get_correlation(X)
        self.fitted = True
        LOGGER.debug('GaussianMultivariate fitted successfully')

    def _validate_input(self, X):
        """Return ``X`` as a ``pandas.DataFrame``, wrapping it if necessary."""
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        return X

    def _fit_columns(self, X):
        """Fit each column to its distribution.

        Args:
            X (pandas.DataFrame):
                Data whose columns will be fitted one by one.

        Returns:
            tuple:
                * columns (list): column names, in the order found in ``X``.
                * univariates (list): fitted univariate for each column.
        """
        columns = []
        univariates = []
        for column_name, column in X.items():
            distribution = self._get_distribution_for_column(column_name)
            LOGGER.debug('Fitting column %s to %s', column_name, distribution)

            univariate = self._fit_column(column, distribution, column_name)
            columns.append(column_name)
            univariates.append(univariate)

        return columns, univariates

    def _get_distribution_for_column(self, column_name):
        """Retrieve the distribution for a given column name.

        When ``self.distribution`` is a dict, columns that are not listed in it
        fall back to ``DEFAULT_DISTRIBUTION``.
        """
        if isinstance(self.distribution, dict):
            return self.distribution.get(column_name, DEFAULT_DISTRIBUTION)

        return self.distribution

    def _fit_column(self, column, distribution, column_name):
        """Fit a single column to its distribution with exception handling.

        Any exception raised while fitting triggers the Gaussian fallback, so a
        single problematic column does not abort the whole fit.
        """
        univariate = get_instance(distribution)
        try:
            univariate.fit(column)
        except Exception as error:  # deliberate best-effort: fall back to Gaussian
            univariate = self._fit_with_fallback_distribution(
                column, distribution, column_name, error
            )

        return univariate

    def _fit_with_fallback_distribution(self, column, distribution, column_name, error):
        """Fall back to fitting a Gaussian distribution and log the error.

        Args:
            column (pandas.Series):
                Data of the column being fitted.
            distribution:
                Distribution that failed to fit; only used in the log message.
            column_name:
                Name of the column; only used in the log messages.
            error (Exception):
                Exception raised by the failed fit; logged at debug level.

        Returns:
            GaussianUnivariate:
                Gaussian univariate fitted to ``column``.
        """
        log_message = (
            f'Unable to fit to a {distribution} distribution for column {column_name}. '
            'Using a Gaussian distribution instead.'
        )
        LOGGER.info(log_message)
        # Keep the root cause available for troubleshooting without changing the
        # info-level message above.
        LOGGER.debug('Fit error for column %s: %r', column_name, error)
        univariate = GaussianUnivariate()
        univariate.fit(column)
        return univariate

    def _get_correlation(self, X):
        """Compute correlation matrix with transformed data.

        Args:
            X (pandas.DataFrame):
                Data for which the correlation needs to be computed.

        Returns:
            pandas.DataFrame:
                Correlation matrix, indexed and labelled by ``self.columns``.
        """
        result = self._transform_to_normal(X)
        correlation = pd.DataFrame(data=result).corr().to_numpy()
        # Undefined correlations (NaN) are replaced by 0.
        correlation = np.nan_to_num(correlation, nan=0.0)
        # If singular, add some noise to the diagonal
        if np.linalg.cond(correlation) > 1.0 / sys.float_info.epsilon:
            correlation = correlation + np.identity(correlation.shape[0]) * EPSILON

        return pd.DataFrame(correlation, index=self.columns, columns=self.columns)

    def probability_density(self, X):
        """Compute the probability density for each point in X.

        Arguments:
            X (pandas.DataFrame):
                Values for which the probability density will be computed.

        Returns:
            numpy.ndarray:
                Probability density values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        transformed = self._transform_to_normal(X)

        return stats.multivariate_normal.pdf(transformed, cov=self.correlation, allow_singular=True)

    def cumulative_distribution(self, X):
        """Compute the cumulative distribution value for each point in X.

        Arguments:
            X (pandas.DataFrame):
                Values for which the cumulative distribution will be computed.

        Returns:
            numpy.ndarray:
                Cumulative distribution values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        transformed = self._transform_to_normal(X)
        return stats.multivariate_normal.cdf(transformed, cov=self.correlation)

    def _get_conditional_distribution(self, conditions):
        """Compute the parameters of a conditional multivariate normal distribution.

        The parameters of the conditioned distribution are computed as specified here:
        https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Conditional_distributions

        Args:
            conditions (pandas.Series):
                Mapping of the column names and column values to condition on.
                The input values have already been transformed to their normal distribution.

        Returns:
            tuple:
                * means (numpy.array):
                    mean values to use for the conditioned multivariate normal.
                * covariance (numpy.array):
                    covariance matrix to use for the conditioned
                    multivariate normal.
                * columns (pandas.Index):
                    names of the columns that will be sampled conditionally.
        """
        # Partition the columns: ``columns2`` are conditioned on, ``columns1`` are
        # the remaining ones that will be sampled.
        columns2 = conditions.index
        columns1 = self.correlation.columns.difference(columns2)

        sigma11 = self.correlation.loc[columns1, columns1].to_numpy()
        sigma12 = self.correlation.loc[columns1, columns2].to_numpy()
        sigma21 = self.correlation.loc[columns2, columns1].to_numpy()
        sigma22 = self.correlation.loc[columns2, columns2].to_numpy()

        # All marginals are standard normal in the transformed space.
        mu1 = np.zeros(len(columns1))
        mu2 = np.zeros(len(columns2))

        sigma12sigma22inv = sigma12 @ np.linalg.inv(sigma22)

        # Work on a plain array so that the result is always a numpy array.
        condition_values = np.asarray(conditions, dtype=float)
        mu_bar = mu1 + sigma12sigma22inv @ (condition_values - mu2)
        sigma_bar = sigma11 - sigma12sigma22inv @ sigma21

        return mu_bar, sigma_bar, columns1

    def _get_normal_samples(self, num_rows, conditions):
        """Get random rows in the standard normal space.

        If no conditions are given (``None`` or empty), the values are sampled from
        a standard normal multivariate.

        If conditions are given, they are transformed to their equivalent standard
        normal values using their marginals and then the values are sampled from
        a standard normal multivariate conditioned on the given condition values.

        Args:
            num_rows (int):
                Number of rows to sample.
            conditions (dict, pandas.Series or None):
                Mapping of column names to the values to condition on, expressed
                in the original space.

        Returns:
            pandas.DataFrame:
                Samples in normal space, one column per non-conditioned column.

        Raises:
            ValueError:
                If ``conditions`` names a column the model was not fitted on.
        """
        if conditions is None or len(conditions) == 0:
            covariance = self.correlation
            columns = self.columns
            means = np.zeros(len(columns))
        else:
            conditions = pd.Series(conditions)

            known_columns = set(self.columns)
            unknown = [name for name in conditions.index if name not in known_columns]
            if unknown:
                raise ValueError(f'Unknown columns in conditions: {unknown}')

            # ``_transform_to_normal`` emits values in fitted-column order, so the
            # conditions must be put in that same order before re-attaching labels;
            # otherwise values would be paired with the wrong column names.
            ordered = [name for name in self.columns if name in conditions.index]
            conditions = conditions.loc[ordered]

            normal_conditions = self._transform_to_normal(conditions)[0]
            normal_conditions = pd.Series(normal_conditions, index=conditions.index)
            means, covariance, columns = self._get_conditional_distribution(normal_conditions)

        # NOTE(review): draws from numpy's global RNG; presumably the ``random_state``
        # decorator applied to ``sample`` controls its seed - confirm in copulas.utils.
        samples = np.random.multivariate_normal(means, covariance, size=num_rows)
        return pd.DataFrame(samples, columns=columns)

    @random_state
    def sample(self, num_rows=1, conditions=None):
        """Sample values from this model.

        Args:
            num_rows (int):
                Number of rows to sample.
            conditions (dict or pd.Series):
                Mapping of the column names and column values to condition on.
                ``None`` or an empty mapping samples unconditionally.

        Returns:
            pandas.DataFrame:
                Table with ``num_rows`` rows and one column per fitted column, with
                values randomly sampled from this model distribution. If conditions
                have been given, the corresponding columns are populated with the
                given values.

        Raises:
            NotFittedError:
                if the model is not fitted.
            ValueError:
                if ``conditions`` names a column the model was not fitted on.
        """
        self.check_fit()

        samples = self._get_normal_samples(num_rows, conditions)

        # Explicit check instead of ``if conditions``: the truth value of a
        # pandas.Series is ambiguous and raises ValueError.
        has_conditions = conditions is not None and len(conditions) > 0

        output = {}
        for column_name, univariate in zip(self.columns, self.univariates):
            if has_conditions and column_name in conditions:
                # Use the values that were given as conditions in the original space.
                output[column_name] = np.full(num_rows, conditions[column_name])
            else:
                # Map normal-space samples back through the marginal's inverse CDF.
                cdf = stats.norm.cdf(samples[column_name])
                output[column_name] = univariate.percent_point(cdf)

        return pd.DataFrame(data=output)

    def to_dict(self):
        """Return a `dict` with the parameters to replicate this object.

        Returns:
            dict:
                Parameters of this distribution: ``correlation`` (nested lists),
                ``univariates`` (list of dicts), ``columns`` (list) and ``type``.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        univariates = [univariate.to_dict() for univariate in self.univariates]

        return {
            'correlation': self.correlation.to_numpy().tolist(),
            'univariates': univariates,
            # Copy so that callers mutating the result cannot alter the model.
            'columns': list(self.columns),
            'type': get_qualified_name(self),
        }

    @classmethod
    def from_dict(cls, copula_dict):
        """Create a new instance from a parameters dictionary.

        Args:
            copula_dict (dict):
                Parameters of the distribution, in the same format as the one
                returned by the ``to_dict`` method.

        Returns:
            Multivariate:
                Instance of the distribution defined on the parameters.
        """
        instance = cls()
        instance.univariates = []
        columns = copula_dict['columns']
        instance.columns = columns

        for parameters in copula_dict['univariates']:
            instance.univariates.append(Univariate.from_dict(parameters))

        correlation = copula_dict['correlation']
        instance.correlation = pd.DataFrame(correlation, index=columns, columns=columns)
        instance.fitted = True

        return instance
|