copulas 0.10.1.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of copulas might be problematic. Click here for more details.

copulas/__init__.py ADDED
@@ -0,0 +1,332 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """Top-level package for Copulas."""
4
+
5
# Package metadata exposed as module-level attributes.
__author__ = 'DataCebo, Inc.'
__email__ = 'info@sdv.dev'
__version__ = '0.10.1.dev0'
8
+
9
+ import contextlib
10
+ import importlib
11
+ import sys
12
+ import warnings
13
+ from copy import deepcopy
14
+ from operator import attrgetter
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ from pkg_resources import iter_entry_points
19
+
20
# Machine epsilon of ``numpy.float32``.
EPSILON = np.finfo(np.float32).eps
21
+
22
+
23
class NotFittedError(Exception):
    """Custom exception; by its name, meant for use on models that are not fitted yet."""
25
+
26
+
27
@contextlib.contextmanager
def set_random_state(random_state, set_model_random_state):
    """Run a block of code with NumPy's global random state swapped out.

    On entry the global NumPy random state is replaced by the state of
    ``random_state``. On exit (even if the body raised) the state reached by
    the global generator is copied into a new ``RandomState`` which is handed
    to ``set_model_random_state``, and the global state found on entry is
    put back.

    Args:
        random_state (np.random.RandomState):
            Generator whose state is installed globally; its ``get_state``
            method is called.
        set_model_random_state (function):
            Callback receiving the ``RandomState`` that holds the state
            reached at the end of the block.
    """
    saved_global_state = np.random.get_state()
    np.random.set_state(random_state.get_state())

    try:
        yield
    finally:
        # Capture where the global generator ended up before restoring it.
        advanced = np.random.RandomState()
        advanced.set_state(np.random.get_state())
        set_model_random_state(advanced)
        np.random.set_state(saved_global_state)
48
+
49
+
50
def random_state(function):
    """Set the random state before calling the function.

    The returned wrapper is meant for methods: it reads ``self.random_state``
    and, when it is not ``None``, runs the method inside ``set_random_state``
    using ``self.set_random_state`` as the callback that receives the updated
    state. When ``self.random_state`` is ``None`` the method is called
    directly.

    Args:
        function (Callable):
            The function to wrap around.

    Returns:
        Callable:
            Wrapper that keeps the name and docstring of ``function``.
    """
    # Local import so this block does not depend on the module import list.
    import functools

    @functools.wraps(function)
    def wrapper(self, *args, **kwargs):
        if self.random_state is None:
            return function(self, *args, **kwargs)

        with set_random_state(self.random_state, self.set_random_state):
            return function(self, *args, **kwargs)

    return wrapper
67
+
68
+
69
def validate_random_state(random_state):
    """Validate random state argument.

    Args:
        random_state (int, numpy.integer, numpy.random.RandomState, or None):
            Seed or RandomState for the random generator.

    Returns:
        numpy.random.RandomState or None:
            ``None`` when ``None`` is given, the same object when a
            ``RandomState`` is given, or a new ``RandomState`` seeded with
            the given integer.

    Raises:
        TypeError:
            If ``random_state`` is none of the supported types.
    """
    if random_state is None:
        return None

    if isinstance(random_state, np.random.RandomState):
        return random_state

    # NumPy integer scalars (e.g. values taken from an array) are valid seeds too.
    if isinstance(random_state, (int, np.integer)):
        return np.random.RandomState(seed=random_state)

    raise TypeError(
        f'`random_state` {random_state} expected to be an int '
        'or `np.random.RandomState` object.')
90
+
91
+
92
def get_instance(obj, **kwargs):
    """Build a new instance from a dotted name, a class or an existing instance.

    Args:
        obj (str, type, instance):
            * ``str``: dotted path ``package.module.Name``; the module is
              imported and ``Name`` is called with ``kwargs``.
            * ``type``: called with ``kwargs``.
            * anything else: its class is called with ``kwargs`` when any are
              given, otherwise with the ``__args__`` / ``__kwargs__``
              attributes of ``obj`` (empty when absent).
        **kwargs:
            Keyword arguments for the constructor.

    Returns:
        The newly created instance.
    """
    if isinstance(obj, str):
        module_name, attribute = obj.rsplit('.', 1)
        factory = getattr(importlib.import_module(module_name), attribute)
        return factory(**kwargs)

    if isinstance(obj, type):
        return obj(**kwargs)

    cls = obj.__class__
    if kwargs:
        return cls(**kwargs)

    # No overrides given: replay the arguments recorded on the instance, if any.
    stored_args = getattr(obj, '__args__', ())
    stored_kwargs = getattr(obj, '__kwargs__', {})
    return cls(*stored_args, **stored_kwargs)
113
+
114
+
115
def store_args(__init__):
    """Decorate an ``__init__`` so the instance remembers its constructor arguments.

    Args:
        __init__(callable): ``__init__`` function whose arguments are recorded.

    Returns:
        callable: Decorated ``__init__`` that sets ``__args__`` and
        ``__kwargs__`` on the instance after the original one has run.
    """

    def new__init__(self, *args, **kwargs):
        # Snapshot first, so mutations made by the wrapped __init__ are not recorded.
        snapshot_args = deepcopy(args)
        snapshot_kwargs = deepcopy(kwargs)

        __init__(self, *args, **kwargs)

        self.__args__, self.__kwargs__ = snapshot_args, snapshot_kwargs

    return new__init__
133
+
134
+
135
+ def get_qualified_name(_object):
136
+ """Return the Fully Qualified Name from an instance or class."""
137
+ module = _object.__module__
138
+ if hasattr(_object, '__name__'):
139
+ _class = _object.__name__
140
+
141
+ else:
142
+ _class = _object.__class__.__name__
143
+
144
+ return module + '.' + _class
145
+
146
+
147
def vectorize(function):
    """Let a method written for scalars accept numpy arrays as well.

    The decorated method behaves as follows depending on its ``X`` argument:

    **Not a numpy array**

        ``function`` is called once with ``X`` unchanged and its result is
        returned as is.

    **1-d array**

        ``function`` is expected to have the signature::

            function(self, X, *args, **kwargs)

        where ``X`` is a scalar. It is called once per element, so an input
        of shape (n,) produces an output of shape (n,).

    **2-d array**

        ``function`` is expected to have the signature::

            function(self, X0, ..., Xj, *args, **kwargs)

        where each ``Xi`` is a scalar. Every row is unpacked into one call, so
        an input of shape (n, j) produces an output of shape (n,).

    For array inputs the result is a ``numpy.ndarray`` of dtype ``float64``.

    Args:
        function(callable): Function that only accepts and returns scalars.

    Returns:
        callable: Decorated function that can accept and return :attr:`numpy.array`.

    Raises:
        ValueError: When the decorated function receives an array that is
            neither 1-d nor 2-d.
    """

    def decorated(self, X, *args, **kwargs):
        if not isinstance(X, np.ndarray):
            return function(self, X, *args, **kwargs)

        # Treat a 1-d array as a column so each element becomes a one-item row.
        rows = X.reshape([-1, 1]) if X.ndim == 1 else X
        if rows.ndim != 2:
            raise ValueError('Arrays of dimensionality higher than 2 are not supported.')

        results = (function(self, *row, *args, **kwargs) for row in rows)
        return np.fromiter(results, np.dtype('float64'))

    decorated.__doc__ = function.__doc__
    return decorated
204
+
205
+
206
def scalarize(function):
    """Let a method written for 1-d arrays accept a single scalar as well.

    A non-array ``X`` is wrapped in a one-element array before the call and
    the first element of the result is returned. Arrays are passed through
    untouched.

    Args:
        function(callable): Function that accepts and returns vectors.

    Returns:
        callable: Decorated function that accepts and returns scalars.
    """

    def decorated(self, X, *args, **kwargs):
        if isinstance(X, np.ndarray):
            return function(self, X, *args, **kwargs)

        return function(self, np.array([X]), *args, **kwargs)[0]

    decorated.__doc__ = function.__doc__
    return decorated
230
+
231
+
232
def check_valid_values(function):
    """Raise an exception if the given values are not supported.

    The decorated method validates its ``X`` argument and then calls
    ``function`` with the original, unconverted ``X``. The wrapper keeps the
    name and docstring of ``function``.

    Args:
        function(callable): Method whose unique argument is a numpy.array-like object.

    Returns:
        callable: Decorated function

    Raises:
        ValueError: If there are missing or invalid values or if the dataset is empty.
    """
    # Local import so this block does not depend on the module import list.
    import functools

    @functools.wraps(function)
    def decorated(self, X, *args, **kwargs):
        # DataFrames are checked through their numpy view; other inputs as given.
        W = X.to_numpy() if isinstance(X, pd.DataFrame) else X

        if not len(W):
            raise ValueError('Your dataset is empty.')

        is_numeric = (
            np.issubdtype(W.dtype, np.floating) or np.issubdtype(W.dtype, np.integer)
        )
        if not is_numeric:
            raise ValueError('There are non-numerical values in your data.')

        if np.isnan(W).any().any():
            raise ValueError('There are nan values in your data.')

        return function(self, X, *args, **kwargs)

    return decorated
265
+
266
+
267
+ def _get_addon_target(addon_path_name):
268
+ """Find the target object for the add-on.
269
+
270
+ Args:
271
+ addon_path_name (str):
272
+ The add-on's name. The add-on's name should be the full path of valid Python
273
+ identifiers (i.e. importable.module:object.attr).
274
+
275
+ Returns:
276
+ tuple:
277
+ * object:
278
+ The base module or object the add-on should be added to.
279
+ * str:
280
+ The name the add-on should be added to under the module or object.
281
+ """
282
+ module_path, _, object_path = addon_path_name.partition(':')
283
+ module_path = module_path.split('.')
284
+
285
+ if module_path[0] != __name__:
286
+ msg = f"expected base module to be '{__name__}', found '{module_path[0]}'"
287
+ raise AttributeError(msg)
288
+
289
+ target_base = sys.modules[__name__]
290
+ for submodule in module_path[1:-1]:
291
+ target_base = getattr(target_base, submodule)
292
+
293
+ addon_name = module_path[-1]
294
+ if object_path:
295
+ if len(module_path) > 1 and not hasattr(target_base, module_path[-1]):
296
+ msg = f"cannot add '{object_path}' to unknown submodule '{'.'.join(module_path)}'"
297
+ raise AttributeError(msg)
298
+
299
+ if len(module_path) > 1:
300
+ target_base = getattr(target_base, module_path[-1])
301
+
302
+ split_object = object_path.split('.')
303
+ addon_name = split_object[-1]
304
+
305
+ if len(split_object) > 1:
306
+ target_base = attrgetter('.'.join(split_object[:-1]))(target_base)
307
+
308
+ return target_base, addon_name
309
+
310
+
311
def _find_addons():
    """Load every entry point in the ``copulas_modules`` group and attach it.

    Each entry point is loaded and set as an attribute on the target resolved
    from its name by ``_get_addon_target``. An entry point that fails to load,
    or whose target cannot be resolved, is skipped with a warning.
    """
    for entry_point in iter_entry_points(group='copulas_modules'):
        try:
            addon = entry_point.load()
        except Exception:  # pylint: disable=broad-exception-caught
            # A broken add-on must not prevent the package from importing.
            warnings.warn(
                f'Failed to load "{entry_point.name}" from "{entry_point.module_name}".')
            continue

        try:
            target, attribute = _get_addon_target(entry_point.name)
        except AttributeError as error:
            warnings.warn(f"Failed to set '{entry_point.name}': {error}.")
        else:
            setattr(target, attribute, addon)
330
+
331
+
332
+ _find_addons()
@@ -0,0 +1,175 @@
1
+ """Bivariate copulas."""
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ from copulas import EPSILON
7
+ from copulas.bivariate.base import Bivariate, CopulaTypes
8
+ from copulas.bivariate.clayton import Clayton
9
+ from copulas.bivariate.frank import Frank
10
+ from copulas.bivariate.gumbel import Gumbel
11
+ from copulas.bivariate.utils import split_matrix
12
+
13
# Names exported by a star-import of this module.
__all__ = (
    'Bivariate',
    'Clayton',
    'CopulaTypes',
    'Frank',
    'Gumbel',
)
20
+
21
+
22
# Number of evenly spaced evaluation points used by ``_compute_empirical``.
COMPUTE_EMPIRICAL_STEPS = 50
23
+
24
+
25
def _compute_empirical(X):
    """Compute the empirical lower and upper tail concentration functions.

    The functions are evaluated on ``COMPUTE_EMPIRICAL_STEPS`` evenly spaced
    points ``z`` between ``EPSILON`` and ``1 - EPSILON``. For each point the
    fraction of rows with both columns ``<= z`` (lower tail) and the fraction
    with both columns ``>= z`` (upper tail) are computed. Points whose
    fraction is zero are left out of the corresponding pair of lists.

    Args:
        X(numpy.array): Shape (n,2); Datapoints to compute the empirical(frequentist) copula.

    Return:
        tuple(list): ``(z_left, L, z_right, R)`` where ``z_left`` / ``z_right``
        hold the evaluation points kept for each tail, ``L`` holds
        ``fraction / z ** 2`` and ``R`` holds ``fraction / (1 - z) ** 2``.

    """
    z_left = []
    z_right = []
    L = []
    R = []

    U, V = split_matrix(X)
    N = len(U)
    base = np.linspace(EPSILON, 1.0 - EPSILON, COMPUTE_EMPIRICAL_STEPS)
    # See https://github.com/sdv-dev/Copulas/issues/45

    for z in base:
        # Count matching rows in numpy rather than with a Python-level sum.
        left = np.count_nonzero(np.logical_and(U <= z, V <= z)) / N
        right = np.count_nonzero(np.logical_and(U >= z, V >= z)) / N

        if left > 0:
            z_left.append(z)
            L.append(left / z ** 2)

        if right > 0:
            z_right.append(z)
            # Use the current point directly; indexing ``z_right`` by the loop
            # counter is only valid while no earlier point was skipped.
            R.append(right / (1 - z) ** 2)

    return z_left, L, z_right, R
59
+
60
+
61
+ def _compute_tail(c, z):
62
+ r"""Compute upper concentration function for tail.
63
+
64
+ The upper tail concentration function is defined by:
65
+
66
+ .. math:: R(z) = \frac{[1 − 2z + C(z, z)]}{(1 − z)^{2}}
67
+
68
+ Args:
69
+ c(Iterable): Values of :math:`C(z,z)`.
70
+ z(Iterable): Values for the empirical copula.
71
+
72
+ Returns:
73
+ numpy.ndarray
74
+
75
+ """
76
+ return (1.0 - 2 * np.asarray(z) + c) / (np.power(1.0 - np.asarray(z), 2))
77
+
78
+
79
def _compute_candidates(copulas, left_tail, right_tail):
    """Evaluate the lower and upper tail functions of each candidate copula.

    Every copula is evaluated on the diagonal points ``(z, z)``. The lower
    tail value is ``C(z, z) / z ** 2`` over ``left_tail`` and the upper tail
    value is ``_compute_tail(C(z, z), z)`` over ``right_tail``.

    Args:
        copulas(list[Bivariate]): Fitted instances of bivariate copulas.
        left_tail(list): Points ``z`` for the lower tail.
        right_tail(list): Points ``z`` for the upper tail.

    Returns:
        tuple[list]: Two lists with one array per copula: the lower tail
        values and the upper tail values.

    """
    lower_points = np.column_stack((left_tail, left_tail))
    upper_points = np.column_stack((right_tail, right_tail))
    left_squared = np.power(left_tail, 2)

    left, right = [], []
    for candidate in copulas:
        lower_cdf = candidate.cumulative_distribution(lower_points)
        left.append(lower_cdf / left_squared)

        upper_cdf = candidate.cumulative_distribution(upper_points)
        right.append(_compute_tail(upper_cdf, right_tail))

    return left, right
103
+
104
+
105
def select_copula(X):
    r"""Select the bivariate copula whose tail behaviour is closest to the data.

    Given the candidate copulas, the procedure for selecting the one that
    best fits a dataset of pairs :math:`\{(u_j, v_j )\}, j=1,2,...n` is:

    1. Fit a ``Frank`` copula. If its ``tau`` is not positive, return it
       immediately.

    2. Build ``Clayton`` and ``Gumbel`` candidates that reuse the same
       ``tau``; a candidate whose parameter computation raises ``ValueError``
       is dropped.

    3. Compute the empirical lower (:math:`L`) and upper (:math:`R`) tail
       functions of the data and the same functions for every candidate.

    4. For the lower tail, the upper tail and both tails concatenated,
       compute the squared L2 distance between each candidate and the
       empirical values, and rank the candidates so that the smallest
       distance receives the highest rank.

    5. Sum the three ranks and return the candidate with the highest total.

    Args:
        X(np.ndarray): Matrix of shape (n,2).

    Returns:
        copula: Best copula that fits for it.

    """

    def squared_distances(empirical, candidates):
        return [np.sum((empirical - candidate) ** 2) for candidate in candidates]

    def rank_scores(distances):
        # Descending rank: the smallest distance gets the largest score.
        return pd.Series(distances).rank(ascending=False)

    frank = Frank()
    frank.fit(X)

    if frank.tau <= 0:
        return frank

    copula_candidates = [frank]

    for copula_class in [Clayton, Gumbel]:
        try:
            candidate = copula_class()
            candidate.tau = frank.tau
            candidate._compute_theta()
        except ValueError:
            # This family cannot represent the estimated tau; leave it out.
            continue

        copula_candidates.append(candidate)

    left_tail, empirical_left, right_tail, empirical_right = _compute_empirical(X)
    candidates_left, candidates_right = _compute_candidates(
        copula_candidates, left_tail, right_tail)

    empirical_both = np.concatenate((empirical_left, empirical_right))
    candidates_both = [
        np.concatenate((lower, upper))
        for lower, upper in zip(candidates_left, candidates_right)
    ]

    score_left = rank_scores(squared_distances(empirical_left, candidates_left))
    score_right = rank_scores(squared_distances(empirical_right, candidates_right))
    score_both = rank_scores(squared_distances(empirical_both, candidates_both))

    score = score_left + score_right + score_both

    return copula_candidates[np.argmax(score.to_numpy())]