copulas 0.10.1.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of copulas might be problematic. Click here for more details.

@@ -0,0 +1,199 @@
1
+ """Base Multivariate class."""
2
+
3
+ import pickle
4
+
5
+ import numpy as np
6
+
7
+ from copulas import NotFittedError, get_instance, validate_random_state
8
+
9
+
10
class Multivariate:
    """Common interface for multivariate copula models.

    The modelling methods defined here (``fit``, ``probability_density``,
    ``cumulative_distribution``, ``sample`` and ``to_dict``) only raise
    ``NotImplementedError``; concrete models are expected to override them.
    ``pdf``, ``cdf`` and ``log_probability_density`` are thin conveniences
    built on top of those overridable methods.
    """

    # Class-level default read by ``check_fit``.
    fitted = False

    def __init__(self, random_state=None):
        # The raw value is passed through ``validate_random_state`` before
        # being stored.
        self.random_state = validate_random_state(random_state)

    def fit(self, X):
        """Fit the model to a table of observations.

        Arguments:
            X (pandas.DataFrame):
                Values of the random variables.

        Raises:
            NotImplementedError:
                always, in this base class.
        """
        raise NotImplementedError

    def probability_density(self, X):
        """Evaluate the probability density at every point of ``X``.

        Arguments:
            X (pandas.DataFrame):
                Points at which the density is evaluated.

        Returns:
            numpy.ndarray:
                Density value for each point in ``X``.

        Raises:
            NotImplementedError:
                always, in this base class.
        """
        raise NotImplementedError

    def log_probability_density(self, X):
        """Evaluate the natural log of the density at every point of ``X``.

        Arguments:
            X (pandas.DataFrame):
                Points at which the log density is evaluated.

        Returns:
            numpy.ndarray:
                ``numpy.log`` applied to ``probability_density(X)``.
        """
        density = self.probability_density(X)
        return np.log(density)

    def pdf(self, X):
        """Short alias that forwards to ``probability_density``.

        Arguments:
            X (pandas.DataFrame):
                Points at which the density is evaluated.

        Returns:
            numpy.ndarray:
                Whatever ``probability_density(X)`` returns.
        """
        return self.probability_density(X)

    def cumulative_distribution(self, X):
        """Evaluate the cumulative distribution at every point of ``X``.

        Arguments:
            X (pandas.DataFrame):
                Points at which the cumulative distribution is evaluated.

        Returns:
            numpy.ndarray:
                Cumulative distribution value for each point in ``X``.

        Raises:
            NotImplementedError:
                always, in this base class.
        """
        raise NotImplementedError

    def cdf(self, X):
        """Short alias that forwards to ``cumulative_distribution``.

        Arguments:
            X (pandas.DataFrame):
                Points at which the cumulative distribution is evaluated.

        Returns:
            numpy.ndarray:
                Whatever ``cumulative_distribution(X)`` returns.
        """
        return self.cumulative_distribution(X)

    def set_random_state(self, random_state):
        """Replace the stored random state.

        Args:
            random_state (int, np.random.RandomState, or None):
                Seed or RandomState for the random generator. It is passed
                through ``validate_random_state`` before being stored.
        """
        self.random_state = validate_random_state(random_state)

    def sample(self, num_rows=1):
        """Draw random rows from the model.

        Args:
            num_rows (int):
                Number of rows to sample.

        Returns:
            numpy.ndarray:
                Array of shape (n_samples, *) with values randomly
                sampled from this model distribution.

        Raises:
            NotImplementedError:
                always, in this base class.
        """
        raise NotImplementedError

    def to_dict(self):
        """Export the parameters needed to rebuild this object.

        Returns:
            dict:
                Parameters of this distribution.

        Raises:
            NotImplementedError:
                always, in this base class.
        """
        raise NotImplementedError

    @classmethod
    def from_dict(cls, params):
        """Rebuild a model from a parameters dictionary.

        The value stored under ``params['type']`` is resolved with
        ``get_instance`` and the resulting object's own ``from_dict`` is
        called with the same dictionary.

        Args:
            params (dict):
                Parameters in the format produced by ``to_dict``; must
                contain a ``'type'`` entry.

        Returns:
            Multivariate:
                Instance described by ``params``.
        """
        return get_instance(params['type']).from_dict(params)

    @classmethod
    def load(cls, path):
        """Read a pickled model back from disk.

        Args:
            path (str):
                Path to the pickle file where the distribution has been
                serialized.

        Returns:
            Multivariate:
                The unpickled object.
        """
        # NOTE: unpickling can execute arbitrary code; only load files that
        # come from a trusted source.
        with open(path, 'rb') as source:
            restored = pickle.load(source)

        return restored

    def save(self, path):
        """Write this instance to disk with pickle.

        Args:
            path (str):
                Destination file path.
        """
        with open(path, 'wb') as target:
            pickle.dump(self, target)

    def check_fit(self):
        """Ensure the model has been fitted.

        Raises:
            NotFittedError:
                if ``self.fitted`` is falsy.
        """
        if self.fitted:
            return

        raise NotFittedError('This model is not fitted.')
@@ -0,0 +1,314 @@
1
+ """GaussianMultivariate module."""
2
+
3
+ import logging
4
+ import sys
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from scipy import stats
9
+
10
+ from copulas import (
11
+ EPSILON, check_valid_values, get_instance, get_qualified_name, random_state, store_args,
12
+ validate_random_state)
13
+ from copulas.multivariate.base import Multivariate
14
+ from copulas.univariate import GaussianUnivariate, Univariate
15
+
16
+ LOGGER = logging.getLogger(__name__)
17
+ DEFAULT_DISTRIBUTION = Univariate
18
+
19
+
20
class GaussianMultivariate(Multivariate):
    """Class for a multivariate distribution that uses the Gaussian copula.

    Each column is modelled by its own univariate (marginal) distribution and
    the dependence between columns is captured by a correlation matrix
    computed in standard-normal space.

    Args:
        distribution (str, type or dict):
            Marginal distribution to use for every column, or a dictionary
            mapping column names to the distribution to use for that column.
            Columns missing from the dictionary fall back to
            ``DEFAULT_DISTRIBUTION``.
        random_state (int, np.random.RandomState, or None):
            Seed or RandomState for the random generator.
    """

    # Populated by ``fit`` / ``from_dict``.
    correlation = None   # pandas.DataFrame indexed and labelled by column name
    columns = None       # list of column names, in fit order
    univariates = None   # fitted marginals, parallel to ``columns``

    @store_args
    def __init__(self, distribution=DEFAULT_DISTRIBUTION, random_state=None):
        self.random_state = validate_random_state(random_state)
        self.distribution = distribution

    def __repr__(self):
        """Produce printable representation of the object."""
        if self.distribution == DEFAULT_DISTRIBUTION:
            distribution = ''
        elif isinstance(self.distribution, type):
            distribution = f'distribution="{self.distribution.__name__}"'
        else:
            distribution = f'distribution="{self.distribution}"'

        return f'GaussianMultivariate({distribution})'

    def _transform_to_normal(self, X):
        """Map observations to standard-normal space through the marginals.

        Args:
            X (pandas.DataFrame, pandas.Series or array-like):
                A table of rows, or a single row. A Series is treated as one
                row keyed by column name. Other inputs are wrapped in a
                DataFrame using ``self.columns`` as labels.

        Returns:
            numpy.ndarray:
                One column per fitted column that is present in ``X``, in
                ``self.columns`` order (not in the order used by ``X``).
        """
        if isinstance(X, pd.Series):
            X = X.to_frame().T
        elif not isinstance(X, pd.DataFrame):
            # ``np.ndim`` also works for plain sequences, which have no
            # ``.shape`` attribute.
            if np.ndim(X) == 1:
                X = [X]

            X = pd.DataFrame(X, columns=self.columns)

        U = []
        for column_name, univariate in zip(self.columns, self.univariates):
            if column_name in X:
                column = X[column_name]
                # Clipping keeps the values handed to ``norm.ppf`` away from
                # exactly 0 and 1 (assuming EPSILON is a small positive
                # constant), where the normal quantile is infinite.
                U.append(univariate.cdf(column.to_numpy()).clip(EPSILON, 1 - EPSILON))

        return stats.norm.ppf(np.column_stack(U))

    def _get_correlation(self, X):
        """Compute correlation matrix with transformed data.

        Args:
            X (pandas.DataFrame):
                Data for which the correlation needs to be computed.

        Returns:
            pandas.DataFrame:
                Correlation matrix, indexed and labelled by ``self.columns``.
        """
        result = self._transform_to_normal(X)
        correlation = pd.DataFrame(data=result).corr().to_numpy()
        # Undefined correlations (NaN) are replaced by 0.
        correlation = np.nan_to_num(correlation, nan=0.0)
        # If singular, add some noise to the diagonal
        if np.linalg.cond(correlation) > 1.0 / sys.float_info.epsilon:
            correlation = correlation + np.identity(correlation.shape[0]) * EPSILON

        return pd.DataFrame(correlation, index=self.columns, columns=self.columns)

    @check_valid_values
    def fit(self, X):
        """Compute the distribution for each variable and then its correlation matrix.

        If a column cannot be fitted with its requested distribution, a
        ``GaussianUnivariate`` is fitted for that column instead.

        Arguments:
            X (pandas.DataFrame):
                Values of the random variables.
        """
        LOGGER.info('Fitting %s', self)

        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        columns = []
        univariates = []
        for column_name, column in X.items():
            if isinstance(self.distribution, dict):
                distribution = self.distribution.get(column_name, DEFAULT_DISTRIBUTION)
            else:
                distribution = self.distribution

            LOGGER.debug('Fitting column %s to %s', column_name, distribution)

            univariate = get_instance(distribution)
            try:
                univariate.fit(column)
            except Exception:
                # ``Exception`` rather than ``BaseException`` so that
                # KeyboardInterrupt / SystemExit still propagate instead of
                # silently triggering the Gaussian fallback.
                LOGGER.info(
                    'Unable to fit to a %s distribution for column %s. '
                    'Using a Gaussian distribution instead.',
                    distribution,
                    column_name,
                )
                univariate = GaussianUnivariate()
                univariate.fit(column)

            columns.append(column_name)
            univariates.append(univariate)

        self.columns = columns
        self.univariates = univariates

        LOGGER.debug('Computing correlation')
        self.correlation = self._get_correlation(X)
        self.fitted = True

        LOGGER.debug('GaussianMultivariate fitted successfully')

    def probability_density(self, X):
        """Compute the probability density for each point in X.

        Arguments:
            X (pandas.DataFrame):
                Values for which the probability density will be computed.

        Returns:
            numpy.ndarray:
                Probability density values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        transformed = self._transform_to_normal(X)

        return stats.multivariate_normal.pdf(
            transformed, cov=self.correlation, allow_singular=True)

    def cumulative_distribution(self, X):
        """Compute the cumulative distribution value for each point in X.

        Arguments:
            X (pandas.DataFrame):
                Values for which the cumulative distribution will be computed.

        Returns:
            numpy.ndarray:
                Cumulative distribution values for points in X.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        transformed = self._transform_to_normal(X)
        return stats.multivariate_normal.cdf(transformed, cov=self.correlation)

    def _get_conditional_distribution(self, conditions):
        """Compute the parameters of a conditional multivariate normal distribution.

        The parameters of the conditioned distribution are computed as specified here:
        https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Conditional_distributions

        Args:
            conditions (pandas.Series):
                Mapping of the column names and column values to condition on.
                The input values have already been transformed to their normal distribution.

        Returns:
            tuple:
                * means (numpy.array):
                    mean values to use for the conditioned multivariate normal.
                * covariance (numpy.array):
                    covariance matrix to use for the conditioned
                    multivariate normal.
                * columns (list):
                    names of the columns that will be sampled conditionally.
        """
        columns2 = conditions.index
        # ``Index.difference`` may return the labels sorted rather than in fit
        # order; callers address the sampled columns by name, so the order
        # only has to be consistent within this method.
        columns1 = self.correlation.columns.difference(columns2)

        sigma11 = self.correlation.loc[columns1, columns1].to_numpy()
        sigma12 = self.correlation.loc[columns1, columns2].to_numpy()
        sigma21 = self.correlation.loc[columns2, columns1].to_numpy()
        sigma22 = self.correlation.loc[columns2, columns2].to_numpy()

        mu1 = np.zeros(len(columns1))
        mu2 = np.zeros(len(columns2))

        sigma12sigma22inv = sigma12 @ np.linalg.inv(sigma22)

        mu_bar = mu1 + sigma12sigma22inv @ (conditions - mu2)
        sigma_bar = sigma11 - sigma12sigma22inv @ sigma21

        return mu_bar, sigma_bar, columns1

    def _get_normal_samples(self, num_rows, conditions):
        """Get random rows in the standard normal space.

        If no conditions are given (``None`` or empty), the values are sampled
        from a standard normal multivariate.

        If conditions are given, they are transformed to their equivalent standard
        normal values using their marginals and then the values are sampled from
        a standard normal multivariate conditioned on the given condition values.

        Args:
            num_rows (int):
                Number of rows to draw.
            conditions (dict, pandas.Series or None):
                Column name -> value to condition on, in the original space.

        Returns:
            pandas.DataFrame:
                Sampled rows, one column per non-conditioned column.

        Raises:
            ValueError:
                if ``conditions`` names a column the model was not fitted on.
        """
        if conditions is None or len(conditions) == 0:
            covariance = self.correlation
            columns = self.columns
            means = np.zeros(len(columns))
        else:
            conditions = pd.Series(conditions)
            unknown = [name for name in conditions.index if name not in self.columns]
            if unknown:
                raise ValueError(f'Unknown condition columns: {unknown}')

            normal_conditions = self._transform_to_normal(conditions)[0]
            # ``_transform_to_normal`` emits values in ``self.columns`` order,
            # not in the caller's key order, so the labels must follow that
            # same order or values end up attached to the wrong columns.
            conditioned = [name for name in self.columns if name in conditions.index]
            normal_conditions = pd.Series(normal_conditions, index=conditioned)
            means, covariance, columns = self._get_conditional_distribution(normal_conditions)

        # Uses NumPy's global generator; presumably the ``random_state``
        # decorator on ``sample`` seeds it from ``self.random_state``
        # (TODO confirm).
        samples = np.random.multivariate_normal(means, covariance, size=num_rows)
        return pd.DataFrame(samples, columns=columns)

    @random_state
    def sample(self, num_rows=1, conditions=None):
        """Sample values from this model.

        Args:
            num_rows (int):
                Number of rows to sample.
            conditions (dict or pd.Series):
                Mapping of the column names and column values to condition on.

        Returns:
            pandas.DataFrame:
                Table with ``num_rows`` rows and one column per fitted
                column, randomly sampled from this model distribution. If
                conditions have been given, the corresponding columns are
                populated with the given values.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()

        samples = self._get_normal_samples(num_rows, conditions)

        # Explicit None/length test: ``bool(series)`` is ambiguous for a
        # pandas Series, so the truthiness of ``conditions`` cannot be used.
        has_conditions = conditions is not None and len(conditions) > 0

        output = {}
        for column_name, univariate in zip(self.columns, self.univariates):
            if has_conditions and column_name in conditions:
                # Use the values that were given as conditions in the original space.
                output[column_name] = np.full(num_rows, conditions[column_name])
            else:
                cdf = stats.norm.cdf(samples[column_name])
                output[column_name] = univariate.percent_point(cdf)

        return pd.DataFrame(data=output)

    def to_dict(self):
        """Return a `dict` with the parameters to replicate this object.

        Returns:
            dict:
                Parameters of this distribution.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()
        univariates = [univariate.to_dict() for univariate in self.univariates]

        return {
            'correlation': self.correlation.to_numpy().tolist(),
            'univariates': univariates,
            'columns': self.columns,
            'type': get_qualified_name(self),
        }

    @classmethod
    def from_dict(cls, copula_dict):
        """Create a new instance from a parameters dictionary.

        Args:
            copula_dict (dict):
                Parameters of the distribution, in the same format as the one
                returned by the ``to_dict`` method.

        Returns:
            Multivariate:
                Instance of the distribution defined on the parameters,
                already marked as fitted.
        """
        instance = cls()
        columns = copula_dict['columns']
        instance.columns = columns
        instance.univariates = [
            Univariate.from_dict(parameters) for parameters in copula_dict['univariates']
        ]

        correlation = copula_dict['correlation']
        instance.correlation = pd.DataFrame(correlation, index=columns, columns=columns)
        instance.fitted = True

        return instance