copulas-0.10.1-py3-none-any.whl → copulas-0.12.1-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of copulas might be problematic. Click here for more details.

copulas/__init__.py CHANGED
@@ -1,267 +1,15 @@
1
- # -*- coding: utf-8 -*-
2
-
3
1
  """Top-level package for Copulas."""
4
2
 
5
3
  __author__ = 'DataCebo, Inc.'
6
4
  __email__ = 'info@sdv.dev'
7
- __version__ = '0.10.1'
5
+ __version__ = '0.12.1'
8
6
 
9
- import contextlib
10
- import importlib
11
7
  import sys
12
8
  import warnings
13
9
  from copy import deepcopy
10
+ from importlib.metadata import entry_points
14
11
  from operator import attrgetter
15
-
16
- import numpy as np
17
- import pandas as pd
18
- from pkg_resources import iter_entry_points
19
-
20
- EPSILON = np.finfo(np.float32).eps
21
-
22
-
23
- class NotFittedError(Exception):
24
- """NotFittedError class."""
25
-
26
-
27
- @contextlib.contextmanager
28
- def set_random_state(random_state, set_model_random_state):
29
- """Context manager for managing the random state.
30
-
31
- Args:
32
- random_state (int or np.random.RandomState):
33
- The random seed or RandomState.
34
- set_model_random_state (function):
35
- Function to set the random state on the model.
36
- """
37
- original_state = np.random.get_state()
38
-
39
- np.random.set_state(random_state.get_state())
40
-
41
- try:
42
- yield
43
- finally:
44
- current_random_state = np.random.RandomState()
45
- current_random_state.set_state(np.random.get_state())
46
- set_model_random_state(current_random_state)
47
- np.random.set_state(original_state)
48
-
49
-
50
- def random_state(function):
51
- """Set the random state before calling the function.
52
-
53
- Args:
54
- function (Callable):
55
- The function to wrap around.
56
- """
57
-
58
- def wrapper(self, *args, **kwargs):
59
- if self.random_state is None:
60
- return function(self, *args, **kwargs)
61
-
62
- else:
63
- with set_random_state(self.random_state, self.set_random_state):
64
- return function(self, *args, **kwargs)
65
-
66
- return wrapper
67
-
68
-
69
- def validate_random_state(random_state):
70
- """Validate random state argument.
71
-
72
- Args:
73
- random_state (int, numpy.random.RandomState, tuple, or None):
74
- Seed or RandomState for the random generator.
75
-
76
- Output:
77
- numpy.random.RandomState
78
- """
79
- if random_state is None:
80
- return None
81
-
82
- if isinstance(random_state, int):
83
- return np.random.RandomState(seed=random_state)
84
- elif isinstance(random_state, np.random.RandomState):
85
- return random_state
86
- else:
87
- raise TypeError(
88
- f'`random_state` {random_state} expected to be an int '
89
- 'or `np.random.RandomState` object.')
90
-
91
-
92
- def get_instance(obj, **kwargs):
93
- """Create new instance of the ``obj`` argument.
94
-
95
- Args:
96
- obj (str, type, instance):
97
- """
98
- instance = None
99
- if isinstance(obj, str):
100
- package, name = obj.rsplit('.', 1)
101
- instance = getattr(importlib.import_module(package), name)(**kwargs)
102
- elif isinstance(obj, type):
103
- instance = obj(**kwargs)
104
- else:
105
- if kwargs:
106
- instance = obj.__class__(**kwargs)
107
- else:
108
- args = getattr(obj, '__args__', ())
109
- kwargs = getattr(obj, '__kwargs__', {})
110
- instance = obj.__class__(*args, **kwargs)
111
-
112
- return instance
113
-
114
-
115
- def store_args(__init__):
116
- """Save ``*args`` and ``**kwargs`` used in the ``__init__`` of a copula.
117
-
118
- Args:
119
- __init__(callable): ``__init__`` function to store their arguments.
120
-
121
- Returns:
122
- callable: Decorated ``__init__`` function.
123
- """
124
-
125
- def new__init__(self, *args, **kwargs):
126
- args_copy = deepcopy(args)
127
- kwargs_copy = deepcopy(kwargs)
128
- __init__(self, *args, **kwargs)
129
- self.__args__ = args_copy
130
- self.__kwargs__ = kwargs_copy
131
-
132
- return new__init__
133
-
134
-
135
- def get_qualified_name(_object):
136
- """Return the Fully Qualified Name from an instance or class."""
137
- module = _object.__module__
138
- if hasattr(_object, '__name__'):
139
- _class = _object.__name__
140
-
141
- else:
142
- _class = _object.__class__.__name__
143
-
144
- return module + '.' + _class
145
-
146
-
147
- def vectorize(function):
148
- """Allow a method that only accepts scalars to accept vectors too.
149
-
150
- This decorator has two different behaviors depending on the dimensionality of the
151
- array passed as an argument:
152
-
153
- **1-d array**
154
-
155
- It will work under the assumption that the `function` argument is a callable
156
- with signature::
157
-
158
- function(self, X, *args, **kwargs)
159
-
160
- where X is an scalar magnitude.
161
-
162
- In this case the arguments of the input array will be given one at a time, and
163
- both the input and output of the decorated function will have shape (n,).
164
-
165
- **2-d array**
166
-
167
- It will work under the assumption that the `function` argument is a callable with signature::
168
-
169
- function(self, X0, ..., Xj, *args, **kwargs)
170
-
171
- where `Xi` are scalar magnitudes.
172
-
173
- It will pass the contents of each row unpacked on each call. The input is espected to have
174
- shape (n, j), the output a shape of (n,)
175
-
176
- It will return a function that is guaranteed to return a `numpy.array`.
177
-
178
- Args:
179
- function(callable): Function that only accept and return scalars.
180
-
181
- Returns:
182
- callable: Decorated function that can accept and return :attr:`numpy.array`.
183
-
184
- """
185
-
186
- def decorated(self, X, *args, **kwargs):
187
- if not isinstance(X, np.ndarray):
188
- return function(self, X, *args, **kwargs)
189
-
190
- if len(X.shape) == 1:
191
- X = X.reshape([-1, 1])
192
-
193
- if len(X.shape) == 2:
194
- return np.fromiter(
195
- (function(self, *x, *args, **kwargs) for x in X),
196
- np.dtype('float64')
197
- )
198
-
199
- else:
200
- raise ValueError('Arrays of dimensionality higher than 2 are not supported.')
201
-
202
- decorated.__doc__ = function.__doc__
203
- return decorated
204
-
205
-
206
- def scalarize(function):
207
- """Allow methods that only accepts 1-d vectors to work with scalars.
208
-
209
- Args:
210
- function(callable): Function that accepts and returns vectors.
211
-
212
- Returns:
213
- callable: Decorated function that accepts and returns scalars.
214
- """
215
-
216
- def decorated(self, X, *args, **kwargs):
217
- scalar = not isinstance(X, np.ndarray)
218
-
219
- if scalar:
220
- X = np.array([X])
221
-
222
- result = function(self, X, *args, **kwargs)
223
- if scalar:
224
- result = result[0]
225
-
226
- return result
227
-
228
- decorated.__doc__ = function.__doc__
229
- return decorated
230
-
231
-
232
- def check_valid_values(function):
233
- """Raise an exception if the given values are not supported.
234
-
235
- Args:
236
- function(callable): Method whose unique argument is a numpy.array-like object.
237
-
238
- Returns:
239
- callable: Decorated function
240
-
241
- Raises:
242
- ValueError: If there are missing or invalid values or if the dataset is empty.
243
- """
244
-
245
- def decorated(self, X, *args, **kwargs):
246
-
247
- if isinstance(X, pd.DataFrame):
248
- W = X.to_numpy()
249
-
250
- else:
251
- W = X
252
-
253
- if not len(W):
254
- raise ValueError('Your dataset is empty.')
255
-
256
- if not (np.issubdtype(W.dtype, np.floating) or np.issubdtype(W.dtype, np.integer)):
257
- raise ValueError('There are non-numerical values in your data.')
258
-
259
- if np.isnan(W).any().any():
260
- raise ValueError('There are nan values in your data.')
261
-
262
- return function(self, X, *args, **kwargs)
263
-
264
- return decorated
12
+ from types import ModuleType
265
13
 
266
14
 
267
15
  def _get_addon_target(addon_path_name):
@@ -311,11 +59,17 @@ def _get_addon_target(addon_path_name):
311
59
  def _find_addons():
312
60
  """Find and load all copulas add-ons."""
313
61
  group = 'copulas_modules'
314
- for entry_point in iter_entry_points(group=group):
62
+ try:
63
+ eps = entry_points(group=group)
64
+ except TypeError:
65
+ # Load-time selection requires Python >= 3.10 or importlib_metadata >= 3.6
66
+ eps = entry_points().get(group, [])
67
+
68
+ for entry_point in eps:
315
69
  try:
316
70
  addon = entry_point.load()
317
- except Exception: # pylint: disable=broad-exception-caught
318
- msg = f'Failed to load "{entry_point.name}" from "{entry_point.module_name}".'
71
+ except Exception as e: # pylint: disable=broad-exception-caught
72
+ msg = f'Failed to load "{entry_point.name}" from "{entry_point.value}" with error:\n{e}'
319
73
  warnings.warn(msg)
320
74
  continue
321
75
 
@@ -326,6 +80,11 @@ def _find_addons():
326
80
  warnings.warn(msg)
327
81
  continue
328
82
 
83
+ if isinstance(addon, ModuleType):
84
+ addon_module_name = f'{addon_target.__name__}.{addon_name}'
85
+ if addon_module_name not in sys.modules:
86
+ sys.modules[addon_module_name] = addon
87
+
329
88
  setattr(addon_target, addon_name, addon)
330
89
 
331
90
 
copulas/bivariate/__init__.py CHANGED
@@ -3,7 +3,7 @@
3
3
  import numpy as np
4
4
  import pandas as pd
5
5
 
6
- from copulas import EPSILON
6
+ from copulas.utils import EPSILON
7
7
  from copulas.bivariate.base import Bivariate, CopulaTypes
8
8
  from copulas.bivariate.clayton import Clayton
9
9
  from copulas.bivariate.frank import Frank
@@ -47,7 +47,6 @@ def _compute_empirical(X):
47
47
  right = sum(np.logical_and(U >= base[k], V >= base[k])) / N
48
48
 
49
49
  if left > 0:
50
-
51
50
  z_left.append(base[k])
52
51
  L.append(left / base[k] ** 2)
53
52
 
@@ -151,7 +150,8 @@ def select_copula(X):
151
150
 
152
151
  left_tail, empirical_left_aut, right_tail, empirical_right_aut = _compute_empirical(X)
153
152
  candidate_left_auts, candidate_right_auts = _compute_candidates(
154
- copula_candidates, left_tail, right_tail)
153
+ copula_candidates, left_tail, right_tail
154
+ )
155
155
 
156
156
  empirical_aut = np.concatenate((empirical_left_aut, empirical_right_aut))
157
157
  candidate_auts = [
copulas/bivariate/base.py CHANGED
@@ -8,8 +8,9 @@ import numpy as np
8
8
  from scipy import stats
9
9
  from scipy.optimize import brentq
10
10
 
11
- from copulas import EPSILON, NotFittedError, random_state, validate_random_state
12
11
  from copulas.bivariate.utils import split_matrix
12
+ from copulas.errors import NotFittedError
13
+ from copulas.utils import EPSILON, random_state, validate_random_state
13
14
 
14
15
 
15
16
  class CopulaTypes(Enum):
@@ -96,7 +97,7 @@ class Bivariate(object):
96
97
  return super(Bivariate, cls).__new__(cls)
97
98
 
98
99
  if not isinstance(copula_type, CopulaTypes):
99
- if (isinstance(copula_type, str) and copula_type.upper() in CopulaTypes.__members__):
100
+ if isinstance(copula_type, str) and copula_type.upper() in CopulaTypes.__members__:
100
101
  copula_type = CopulaTypes[copula_type.upper()]
101
102
  else:
102
103
  raise ValueError(f'Invalid copula type {copula_type}')
@@ -192,11 +193,7 @@ class Bivariate(object):
192
193
  dict: Parameters of the copula.
193
194
 
194
195
  """
195
- return {
196
- 'copula_type': self.copula_type.name,
197
- 'theta': self.theta,
198
- 'tau': self.tau
199
- }
196
+ return {'copula_type': self.copula_type.name, 'theta': self.theta, 'tau': self.tau}
200
197
 
201
198
  @classmethod
202
199
  def from_dict(cls, copula_dict):
@@ -297,6 +294,7 @@ class Bivariate(object):
297
294
  self.check_fit()
298
295
  result = []
299
296
  for _y, _v in zip(y, V):
297
+
300
298
  def f(u):
301
299
  return self.partial_derivative_scalar(u, _v) - _y
302
300
 
@@ -330,7 +328,7 @@ class Bivariate(object):
330
328
  np.ndarray
331
329
 
332
330
  """
333
- delta = (-2 * (X[:, 1] > 0.5) + 1)
331
+ delta = -2 * (X[:, 1] > 0.5) + 1
334
332
  delta = 0.0001 * delta
335
333
  X_prime = X.copy()
336
334
  X_prime[:, 1] += delta
@@ -411,10 +409,11 @@ class Bivariate(object):
411
409
 
412
410
  """
413
411
  from copulas.bivariate import select_copula # noqa
412
+
414
413
  warnings.warn(
415
414
  '`Bivariate.select_copula` has been deprecated and will be removed in a later '
416
415
  'release. Please use `copulas.bivariate.select_copula` instead',
417
- DeprecationWarning
416
+ DeprecationWarning,
418
417
  )
419
418
  return select_copula(X)
420
419
 
copulas/bivariate/clayton.py CHANGED
@@ -84,9 +84,10 @@ class Clayton(Bivariate):
84
84
  cdfs = [
85
85
  np.power(
86
86
  np.power(U[i], -self.theta) + np.power(V[i], -self.theta) - 1,
87
- -1.0 / self.theta
87
+ -1.0 / self.theta,
88
88
  )
89
- if (U[i] > 0 and V[i] > 0) else 0
89
+ if (U[i] > 0 and V[i] > 0)
90
+ else 0
90
91
  for i in range(len(U))
91
92
  ]
92
93
 
copulas/bivariate/frank.py CHANGED
@@ -6,9 +6,9 @@ import numpy as np
6
6
  import scipy.integrate as integrate
7
7
  from scipy.optimize import least_squares
8
8
 
9
- from copulas import EPSILON
10
9
  from copulas.bivariate.base import Bivariate, CopulaTypes
11
10
  from copulas.bivariate.utils import split_matrix
11
+ from copulas.utils import EPSILON
12
12
 
13
13
  MIN_FLOAT_LOG = np.log(sys.float_info.min)
14
14
  MAX_FLOAT_LOG = np.log(sys.float_info.max)
@@ -162,6 +162,7 @@ class Frank(Bivariate):
162
162
 
163
163
  def _tau_to_theta(self, alpha):
164
164
  """Relationship between tau and theta as a solvable equation."""
165
+
165
166
  def debye(t):
166
167
  return t / (np.exp(t) - 1)
167
168
 
copulas/datasets.py CHANGED
@@ -4,7 +4,7 @@ import numpy as np
4
4
  import pandas as pd
5
5
  from scipy import stats
6
6
 
7
- from copulas import set_random_state, validate_random_state
7
+ from copulas.utils import set_random_state, validate_random_state
8
8
 
9
9
 
10
10
  def _dummy_fn(state):
@@ -33,10 +33,7 @@ def sample_bivariate_age_income(size=1000, seed=42):
33
33
  income += np.random.normal(loc=np.log(age) / 100, scale=10, size=size)
34
34
  income[np.random.randint(0, 10, size=size) == 0] /= 1000
35
35
 
36
- return pd.DataFrame({
37
- 'age': age,
38
- 'income': income
39
- })
36
+ return pd.DataFrame({'age': age, 'income': income})
40
37
 
41
38
 
42
39
  def sample_trivariate_xyz(size=1000, seed=42):
@@ -61,11 +58,7 @@ def sample_trivariate_xyz(size=1000, seed=42):
61
58
  with set_random_state(validate_random_state(seed), _dummy_fn):
62
59
  x = stats.beta.rvs(a=0.1, b=0.1, size=size)
63
60
  y = stats.beta.rvs(a=0.1, b=0.5, size=size)
64
- return pd.DataFrame({
65
- 'x': x,
66
- 'y': y,
67
- 'z': np.random.normal(size=size) + y * 10
68
- })
61
+ return pd.DataFrame({'x': x, 'y': y, 'z': np.random.normal(size=size) + y * 10})
69
62
 
70
63
 
71
64
  def sample_univariate_bernoulli(size=1000, seed=42):
copulas/errors.py ADDED
@@ -0,0 +1,5 @@
1
+ """Copulas Exceptions."""
2
+
3
+
4
+ class NotFittedError(Exception):
5
+ """NotFittedError class."""
copulas/multivariate/__init__.py CHANGED
@@ -5,10 +5,4 @@ from copulas.multivariate.gaussian import GaussianMultivariate
5
5
  from copulas.multivariate.tree import Tree, TreeTypes
6
6
  from copulas.multivariate.vine import VineCopula
7
7
 
8
- __all__ = (
9
- 'Multivariate',
10
- 'GaussianMultivariate',
11
- 'VineCopula',
12
- 'Tree',
13
- 'TreeTypes'
14
- )
8
+ __all__ = ('Multivariate', 'GaussianMultivariate', 'VineCopula', 'Tree', 'TreeTypes')
copulas/multivariate/base.py CHANGED
@@ -4,7 +4,8 @@ import pickle
4
4
 
5
5
  import numpy as np
6
6
 
7
- from copulas import NotFittedError, get_instance, validate_random_state
7
+ from copulas.errors import NotFittedError
8
+ from copulas.utils import get_instance, validate_random_state
8
9
 
9
10
 
10
11
  class Multivariate(object):
copulas/multivariate/gaussian.py CHANGED
@@ -7,11 +7,17 @@ import numpy as np
7
7
  import pandas as pd
8
8
  from scipy import stats
9
9
 
10
- from copulas import (
11
- EPSILON, check_valid_values, get_instance, get_qualified_name, random_state, store_args,
12
- validate_random_state)
13
10
  from copulas.multivariate.base import Multivariate
14
11
  from copulas.univariate import GaussianUnivariate, Univariate
12
+ from copulas.utils import (
13
+ EPSILON,
14
+ check_valid_values,
15
+ get_instance,
16
+ get_qualified_name,
17
+ random_state,
18
+ store_args,
19
+ validate_random_state,
20
+ )
15
21
 
16
22
  LOGGER = logging.getLogger(__name__)
17
23
  DEFAULT_DISTRIBUTION = Univariate
@@ -64,26 +70,6 @@ class GaussianMultivariate(Multivariate):
64
70
 
65
71
  return stats.norm.ppf(np.column_stack(U))
66
72
 
67
- def _get_correlation(self, X):
68
- """Compute correlation matrix with transformed data.
69
-
70
- Args:
71
- X (numpy.ndarray):
72
- Data for which the correlation needs to be computed.
73
-
74
- Returns:
75
- numpy.ndarray:
76
- computed correlation matrix.
77
- """
78
- result = self._transform_to_normal(X)
79
- correlation = pd.DataFrame(data=result).corr().to_numpy()
80
- correlation = np.nan_to_num(correlation, nan=0.0)
81
- # If singular, add some noise to the diagonal
82
- if np.linalg.cond(correlation) > 1.0 / sys.float_info.epsilon:
83
- correlation = correlation + np.identity(correlation.shape[0]) * EPSILON
84
-
85
- return pd.DataFrame(correlation, index=self.columns, columns=self.columns)
86
-
87
73
  @check_valid_values
88
74
  def fit(self, X):
89
75
  """Compute the distribution for each variable and then its correlation matrix.
@@ -94,42 +80,88 @@ class GaussianMultivariate(Multivariate):
94
80
  """
95
81
  LOGGER.info('Fitting %s', self)
96
82
 
83
+ # Validate the input data
84
+ X = self._validate_input(X)
85
+ columns, univariates = self._fit_columns(X)
86
+
87
+ self.columns = columns
88
+ self.univariates = univariates
89
+
90
+ LOGGER.debug('Computing correlation.')
91
+ self.correlation = self._get_correlation(X)
92
+ self.fitted = True
93
+ LOGGER.debug('GaussianMultivariate fitted successfully')
94
+
95
+ def _validate_input(self, X):
96
+ """Validate the input data."""
97
97
  if not isinstance(X, pd.DataFrame):
98
98
  X = pd.DataFrame(X)
99
99
 
100
+ return X
101
+
102
+ def _fit_columns(self, X):
103
+ """Fit each column to its distribution."""
100
104
  columns = []
101
105
  univariates = []
102
106
  for column_name, column in X.items():
103
- if isinstance(self.distribution, dict):
104
- distribution = self.distribution.get(column_name, DEFAULT_DISTRIBUTION)
105
- else:
106
- distribution = self.distribution
107
-
107
+ distribution = self._get_distribution_for_column(column_name)
108
108
  LOGGER.debug('Fitting column %s to %s', column_name, distribution)
109
109
 
110
- univariate = get_instance(distribution)
111
- try:
112
- univariate.fit(column)
113
- except BaseException:
114
- log_message = (
115
- f'Unable to fit to a {distribution} distribution for column {column_name}. '
116
- 'Using a Gaussian distribution instead.'
117
- )
118
- LOGGER.info(log_message)
119
- univariate = GaussianUnivariate()
120
- univariate.fit(column)
121
-
110
+ univariate = self._fit_column(column, distribution, column_name)
122
111
  columns.append(column_name)
123
112
  univariates.append(univariate)
124
113
 
125
- self.columns = columns
126
- self.univariates = univariates
114
+ return columns, univariates
115
+
116
+ def _get_distribution_for_column(self, column_name):
117
+ """Retrieve the distribution for a given column name."""
118
+ if isinstance(self.distribution, dict):
119
+ return self.distribution.get(column_name, DEFAULT_DISTRIBUTION)
120
+
121
+ return self.distribution
122
+
123
+ def _fit_column(self, column, distribution, column_name):
124
+ """Fit a single column to its distribution with exception handling."""
125
+ univariate = get_instance(distribution)
126
+ try:
127
+ univariate.fit(column)
128
+ except Exception as error:
129
+ univariate = self._fit_with_fallback_distribution(
130
+ column, distribution, column_name, error
131
+ )
132
+
133
+ return univariate
134
+
135
+ def _fit_with_fallback_distribution(self, column, distribution, column_name, error):
136
+ """Fall back to fitting a Gaussian distribution and log the error."""
137
+ log_message = (
138
+ f'Unable to fit to a {distribution} distribution for column {column_name}. '
139
+ 'Using a Gaussian distribution instead.'
140
+ )
141
+ LOGGER.info(log_message)
142
+ univariate = GaussianUnivariate()
143
+ univariate.fit(column)
144
+ return univariate
127
145
 
128
- LOGGER.debug('Computing correlation')
129
- self.correlation = self._get_correlation(X)
130
- self.fitted = True
146
+ def _get_correlation(self, X):
147
+ """Compute correlation matrix with transformed data.
131
148
 
132
- LOGGER.debug('GaussianMultivariate fitted successfully')
149
+ Args:
150
+ X (numpy.ndarray):
151
+ Data for which the correlation needs to be computed.
152
+
153
+ Returns:
154
+ numpy.ndarray:
155
+ computed correlation matrix.
156
+ """
157
+ result = self._transform_to_normal(X)
158
+ correlation = pd.DataFrame(data=result).corr().to_numpy()
159
+ correlation = np.nan_to_num(correlation, nan=0.0)
160
+ # If singular, add some noise to the diagonal
161
+ if np.linalg.cond(correlation) > 1.0 / sys.float_info.epsilon:
162
+ correlation = correlation + np.identity(correlation.shape[0]) * EPSILON
163
+
164
+ return pd.DataFrame(correlation, index=self.columns, columns=self.columns)
133
165
 
134
166
  def probability_density(self, X):
135
167
  """Compute the probability density for each point in X.
@@ -149,8 +181,7 @@ class GaussianMultivariate(Multivariate):
149
181
  self.check_fit()
150
182
  transformed = self._transform_to_normal(X)
151
183
 
152
- return stats.multivariate_normal.pdf(
153
- transformed, cov=self.correlation, allow_singular=True)
184
+ return stats.multivariate_normal.pdf(transformed, cov=self.correlation, allow_singular=True)
154
185
 
155
186
  def cumulative_distribution(self, X):
156
187
  """Compute the cumulative distribution value for each point in X.