PyPI - copulas - Versions diffs - 0.10.1.dev0__py3-none-any.whl → 0.12.1.dev0__py3-none-any.whl - Mend

copulas 0.10.1.dev0-py3-none-any.whl → 0.12.1.dev0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of copulas might be problematic. Click here for more details.

Files changed (30)

copulas/__init__.py +17 -258
copulas/bivariate/__init__.py +3 -3
copulas/bivariate/base.py +8 -9
copulas/bivariate/clayton.py +3 -2
copulas/bivariate/frank.py +2 -1
copulas/datasets.py +3 -10
copulas/errors.py +5 -0
copulas/multivariate/__init__.py +1 -7
copulas/multivariate/base.py +2 -1
copulas/multivariate/gaussian.py +79 -48
copulas/multivariate/tree.py +12 -14
copulas/multivariate/vine.py +14 -9
copulas/optimize/__init__.py +4 -3
copulas/univariate/__init__.py +1 -1
copulas/univariate/base.py +16 -5
copulas/univariate/beta.py +1 -6
copulas/univariate/gaussian.py +2 -8
copulas/univariate/gaussian_kde.py +6 -7
copulas/univariate/selection.py +1 -1
copulas/univariate/student_t.py +1 -5
copulas/univariate/truncated_gaussian.py +9 -17
copulas/univariate/uniform.py +2 -8
copulas/utils.py +248 -0
copulas/visualization.py +15 -20
{copulas-0.10.1.dev0.dist-info → copulas-0.12.1.dev0.dist-info}/METADATA +58 -66
copulas-0.12.1.dev0.dist-info/RECORD +34 -0
{copulas-0.10.1.dev0.dist-info → copulas-0.12.1.dev0.dist-info}/WHEEL +1 -1
copulas-0.10.1.dev0.dist-info/RECORD +0 -32
{copulas-0.10.1.dev0.dist-info → copulas-0.12.1.dev0.dist-info}/LICENSE +0 -0
{copulas-0.10.1.dev0.dist-info → copulas-0.12.1.dev0.dist-info}/top_level.txt +0 -0

copulas/__init__.py CHANGED Viewed

@@ -1,267 +1,15 @@
-# -*- coding: utf-8 -*-
 """Top-level package for Copulas."""
 __author__ = 'DataCebo, Inc.'
 __email__ = 'info@sdv.dev'
-__version__ = '0.10.1.dev0'
+__version__ = '0.12.1.dev0'
-import contextlib
-import importlib
 import sys
 import warnings
 from copy import deepcopy
+from importlib.metadata import entry_points
 from operator import attrgetter
-import numpy as np
-import pandas as pd
-from pkg_resources import iter_entry_points
-EPSILON = np.finfo(np.float32).eps
-class NotFittedError(Exception):
-    """NotFittedError class."""
-@contextlib.contextmanager
-def set_random_state(random_state, set_model_random_state):
-    """Context manager for managing the random state.
-    Args:
-        random_state (int or np.random.RandomState):
-            The random seed or RandomState.
-        set_model_random_state (function):
-            Function to set the random state on the model.
-    """
-    original_state = np.random.get_state()
-    np.random.set_state(random_state.get_state())
-    try:
-        yield
-    finally:
-        current_random_state = np.random.RandomState()
-        current_random_state.set_state(np.random.get_state())
-        set_model_random_state(current_random_state)
-        np.random.set_state(original_state)
-def random_state(function):
-    """Set the random state before calling the function.
-    Args:
-        function (Callable):
-            The function to wrap around.
-    """
-    def wrapper(self, *args, **kwargs):
-        if self.random_state is None:
-            return function(self, *args, **kwargs)
-        else:
-            with set_random_state(self.random_state, self.set_random_state):
-                return function(self, *args, **kwargs)
-    return wrapper
-def validate_random_state(random_state):
-    """Validate random state argument.
-    Args:
-        random_state (int, numpy.random.RandomState, tuple, or None):
-            Seed or RandomState for the random generator.
-    Output:
-        numpy.random.RandomState
-    """
-    if random_state is None:
-        return None
-    if isinstance(random_state, int):
-        return np.random.RandomState(seed=random_state)
-    elif isinstance(random_state, np.random.RandomState):
-        return random_state
-    else:
-        raise TypeError(
-            f'`random_state` {random_state} expected to be an int '
-            'or `np.random.RandomState` object.')
-def get_instance(obj, **kwargs):
-    """Create new instance of the ``obj`` argument.
-    Args:
-        obj (str, type, instance):
-    """
-    instance = None
-    if isinstance(obj, str):
-        package, name = obj.rsplit('.', 1)
-        instance = getattr(importlib.import_module(package), name)(**kwargs)
-    elif isinstance(obj, type):
-        instance = obj(**kwargs)
-    else:
-        if kwargs:
-            instance = obj.__class__(**kwargs)
-        else:
-            args = getattr(obj, '__args__', ())
-            kwargs = getattr(obj, '__kwargs__', {})
-            instance = obj.__class__(*args, **kwargs)
-    return instance
-def store_args(__init__):
-    """Save ``*args`` and ``**kwargs`` used in the ``__init__`` of a copula.
-    Args:
-        __init__(callable): ``__init__`` function to store their arguments.
-    Returns:
-        callable: Decorated ``__init__`` function.
-    """
-    def new__init__(self, *args, **kwargs):
-        args_copy = deepcopy(args)
-        kwargs_copy = deepcopy(kwargs)
-        __init__(self, *args, **kwargs)
-        self.__args__ = args_copy
-        self.__kwargs__ = kwargs_copy
-    return new__init__
-def get_qualified_name(_object):
-    """Return the Fully Qualified Name from an instance or class."""
-    module = _object.__module__
-    if hasattr(_object, '__name__'):
-        _class = _object.__name__
-    else:
-        _class = _object.__class__.__name__
-    return module + '.' + _class
-def vectorize(function):
-    """Allow a method that only accepts scalars to accept vectors too.
-    This decorator has two different behaviors depending on the dimensionality of the
-    array passed as an argument:
-    **1-d array**
-    It will work under the assumption that the `function` argument is a callable
-    with signature::
-        function(self, X, *args, **kwargs)
-    where X is an scalar magnitude.
-    In this case the arguments of the input array will be given one at a time, and
-    both the input and output of the decorated function will have shape (n,).
-    **2-d array**
-    It will work under the assumption that the `function` argument is a callable with signature::
-        function(self, X0, ..., Xj, *args, **kwargs)
-    where `Xi` are scalar magnitudes.
-    It will pass the contents of each row unpacked on each call. The input is espected to have
-    shape (n, j), the output a shape of (n,)
-    It will return a function that is guaranteed to return a `numpy.array`.
-    Args:
-        function(callable): Function that only accept and return scalars.
-    Returns:
-        callable: Decorated function that can accept and return :attr:`numpy.array`.
-    """
-    def decorated(self, X, *args, **kwargs):
-        if not isinstance(X, np.ndarray):
-            return function(self, X, *args, **kwargs)
-        if len(X.shape) == 1:
-            X = X.reshape([-1, 1])
-        if len(X.shape) == 2:
-            return np.fromiter(
-                (function(self, *x, *args, **kwargs) for x in X),
-                np.dtype('float64')
-            )
-        else:
-            raise ValueError('Arrays of dimensionality higher than 2 are not supported.')
-    decorated.__doc__ = function.__doc__
-    return decorated
-def scalarize(function):
-    """Allow methods that only accepts 1-d vectors to work with scalars.
-    Args:
-        function(callable): Function that accepts and returns vectors.
-    Returns:
-        callable: Decorated function that accepts and returns scalars.
-    """
-    def decorated(self, X, *args, **kwargs):
-        scalar = not isinstance(X, np.ndarray)
-        if scalar:
-            X = np.array([X])
-        result = function(self, X, *args, **kwargs)
-        if scalar:
-            result = result[0]
-        return result
-    decorated.__doc__ = function.__doc__
-    return decorated
-def check_valid_values(function):
-    """Raise an exception if the given values are not supported.
-    Args:
-        function(callable): Method whose unique argument is a numpy.array-like object.
-    Returns:
-        callable: Decorated function
-    Raises:
-        ValueError: If there are missing or invalid values or if the dataset is empty.
-    """
-    def decorated(self, X, *args, **kwargs):
-        if isinstance(X, pd.DataFrame):
-            W = X.to_numpy()
-        else:
-            W = X
-        if not len(W):
-            raise ValueError('Your dataset is empty.')
-        if not (np.issubdtype(W.dtype, np.floating) or np.issubdtype(W.dtype, np.integer)):
-            raise ValueError('There are non-numerical values in your data.')
-        if np.isnan(W).any().any():
-            raise ValueError('There are nan values in your data.')
-        return function(self, X, *args, **kwargs)
-    return decorated
+from types import ModuleType
 def _get_addon_target(addon_path_name):
@@ -311,11 +59,17 @@ def _get_addon_target(addon_path_name):
 def _find_addons():
     """Find and load all copulas add-ons."""
     group = 'copulas_modules'
-    for entry_point in iter_entry_points(group=group):
+    try:
+        eps = entry_points(group=group)
+    except TypeError:
+        # Load-time selection requires Python >= 3.10 or importlib_metadata >= 3.6
+        eps = entry_points().get(group, [])
+    for entry_point in eps:
         try:
             addon = entry_point.load()
-        except Exception:  # pylint: disable=broad-exception-caught
-            msg = f'Failed to load "{entry_point.name}" from "{entry_point.module_name}".'
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            msg = f'Failed to load "{entry_point.name}" from "{entry_point.value}" with error:\n{e}'
             warnings.warn(msg)
             continue
@@ -326,6 +80,11 @@ def _find_addons():
             warnings.warn(msg)
             continue
+        if isinstance(addon, ModuleType):
+            addon_module_name = f'{addon_target.__name__}.{addon_name}'
+            if addon_module_name not in sys.modules:
+                sys.modules[addon_module_name] = addon
         setattr(addon_target, addon_name, addon)

copulas/bivariate/__init__.py CHANGED Viewed

@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd
-from copulas import EPSILON
+from copulas.utils import EPSILON
 from copulas.bivariate.base import Bivariate, CopulaTypes
 from copulas.bivariate.clayton import Clayton
 from copulas.bivariate.frank import Frank
@@ -47,7 +47,6 @@ def _compute_empirical(X):
         right = sum(np.logical_and(U >= base[k], V >= base[k])) / N
         if left > 0:
             z_left.append(base[k])
             L.append(left / base[k] ** 2)
@@ -151,7 +150,8 @@ def select_copula(X):
     left_tail, empirical_left_aut, right_tail, empirical_right_aut = _compute_empirical(X)
     candidate_left_auts, candidate_right_auts = _compute_candidates(
-        copula_candidates, left_tail, right_tail)
+        copula_candidates, left_tail, right_tail
+    )
     empirical_aut = np.concatenate((empirical_left_aut, empirical_right_aut))
     candidate_auts = [

copulas/bivariate/base.py CHANGED Viewed

@@ -8,8 +8,9 @@ import numpy as np
 from scipy import stats
 from scipy.optimize import brentq
-from copulas import EPSILON, NotFittedError, random_state, validate_random_state
 from copulas.bivariate.utils import split_matrix
+from copulas.errors import NotFittedError
+from copulas.utils import EPSILON, random_state, validate_random_state
 class CopulaTypes(Enum):
@@ -96,7 +97,7 @@ class Bivariate(object):
             return super(Bivariate, cls).__new__(cls)
         if not isinstance(copula_type, CopulaTypes):
-            if (isinstance(copula_type, str) and copula_type.upper() in CopulaTypes.__members__):
+            if isinstance(copula_type, str) and copula_type.upper() in CopulaTypes.__members__:
                 copula_type = CopulaTypes[copula_type.upper()]
             else:
                 raise ValueError(f'Invalid copula type {copula_type}')
@@ -192,11 +193,7 @@ class Bivariate(object):
             dict: Parameters of the copula.
         """
-        return {
-            'copula_type': self.copula_type.name,
-            'theta': self.theta,
-            'tau': self.tau
-        }
+        return {'copula_type': self.copula_type.name, 'theta': self.theta, 'tau': self.tau}
     @classmethod
     def from_dict(cls, copula_dict):
@@ -297,6 +294,7 @@ class Bivariate(object):
         self.check_fit()
         result = []
         for _y, _v in zip(y, V):
             def f(u):
                 return self.partial_derivative_scalar(u, _v) - _y
@@ -330,7 +328,7 @@ class Bivariate(object):
             np.ndarray
         """
-        delta = (-2 * (X[:, 1] > 0.5) + 1)
+        delta = -2 * (X[:, 1] > 0.5) + 1
         delta = 0.0001 * delta
         X_prime = X.copy()
         X_prime[:, 1] += delta
@@ -411,10 +409,11 @@ class Bivariate(object):
         """
         from copulas.bivariate import select_copula  # noqa
         warnings.warn(
             '`Bivariate.select_copula` has been deprecated and will be removed in a later '
             'release. Please use `copulas.bivariate.select_copula` instead',
-            DeprecationWarning
+            DeprecationWarning,
         )
         return select_copula(X)

copulas/bivariate/clayton.py CHANGED Viewed

@@ -84,9 +84,10 @@ class Clayton(Bivariate):
             cdfs = [
                 np.power(
                     np.power(U[i], -self.theta) + np.power(V[i], -self.theta) - 1,
-                    -1.0 / self.theta
+                    -1.0 / self.theta,
                 )
-                if (U[i] > 0 and V[i] > 0) else 0
+                if (U[i] > 0 and V[i] > 0)
+                else 0
                 for i in range(len(U))
             ]

copulas/bivariate/frank.py CHANGED Viewed

@@ -6,9 +6,9 @@ import numpy as np
 import scipy.integrate as integrate
 from scipy.optimize import least_squares
-from copulas import EPSILON
 from copulas.bivariate.base import Bivariate, CopulaTypes
 from copulas.bivariate.utils import split_matrix
+from copulas.utils import EPSILON
 MIN_FLOAT_LOG = np.log(sys.float_info.min)
 MAX_FLOAT_LOG = np.log(sys.float_info.max)
@@ -162,6 +162,7 @@ class Frank(Bivariate):
     def _tau_to_theta(self, alpha):
         """Relationship between tau and theta as a solvable equation."""
         def debye(t):
             return t / (np.exp(t) - 1)

copulas/datasets.py CHANGED Viewed

@@ -4,7 +4,7 @@ import numpy as np
 import pandas as pd
 from scipy import stats
-from copulas import set_random_state, validate_random_state
+from copulas.utils import set_random_state, validate_random_state
 def _dummy_fn(state):
@@ -33,10 +33,7 @@ def sample_bivariate_age_income(size=1000, seed=42):
         income += np.random.normal(loc=np.log(age) / 100, scale=10, size=size)
         income[np.random.randint(0, 10, size=size) == 0] /= 1000
-    return pd.DataFrame({
-        'age': age,
-        'income': income
-    })
+    return pd.DataFrame({'age': age, 'income': income})
 def sample_trivariate_xyz(size=1000, seed=42):
@@ -61,11 +58,7 @@ def sample_trivariate_xyz(size=1000, seed=42):
     with set_random_state(validate_random_state(seed), _dummy_fn):
         x = stats.beta.rvs(a=0.1, b=0.1, size=size)
         y = stats.beta.rvs(a=0.1, b=0.5, size=size)
-        return pd.DataFrame({
-            'x': x,
-            'y': y,
-            'z': np.random.normal(size=size) + y * 10
-        })
+        return pd.DataFrame({'x': x, 'y': y, 'z': np.random.normal(size=size) + y * 10})
 def sample_univariate_bernoulli(size=1000, seed=42):

copulas/errors.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Copulas Exceptions."""
+class NotFittedError(Exception):
+    """NotFittedError class."""

copulas/multivariate/__init__.py CHANGED Viewed

@@ -5,10 +5,4 @@ from copulas.multivariate.gaussian import GaussianMultivariate
 from copulas.multivariate.tree import Tree, TreeTypes
 from copulas.multivariate.vine import VineCopula
-__all__ = (
-    'Multivariate',
-    'GaussianMultivariate',
-    'VineCopula',
-    'Tree',
-    'TreeTypes'
-)
+__all__ = ('Multivariate', 'GaussianMultivariate', 'VineCopula', 'Tree', 'TreeTypes')

copulas/multivariate/base.py CHANGED Viewed

@@ -4,7 +4,8 @@ import pickle
 import numpy as np
-from copulas import NotFittedError, get_instance, validate_random_state
+from copulas.errors import NotFittedError
+from copulas.utils import get_instance, validate_random_state
 class Multivariate(object):

copulas/multivariate/gaussian.py CHANGED Viewed

@@ -7,11 +7,17 @@ import numpy as np
 import pandas as pd
 from scipy import stats
-from copulas import (
-    EPSILON, check_valid_values, get_instance, get_qualified_name, random_state, store_args,
-    validate_random_state)
 from copulas.multivariate.base import Multivariate
 from copulas.univariate import GaussianUnivariate, Univariate
+from copulas.utils import (
+    EPSILON,
+    check_valid_values,
+    get_instance,
+    get_qualified_name,
+    random_state,
+    store_args,
+    validate_random_state,
+)
 LOGGER = logging.getLogger(__name__)
 DEFAULT_DISTRIBUTION = Univariate
@@ -64,26 +70,6 @@ class GaussianMultivariate(Multivariate):
         return stats.norm.ppf(np.column_stack(U))
-    def _get_correlation(self, X):
-        """Compute correlation matrix with transformed data.
-        Args:
-            X (numpy.ndarray):
-                Data for which the correlation needs to be computed.
-        Returns:
-            numpy.ndarray:
-                computed correlation matrix.
-        """
-        result = self._transform_to_normal(X)
-        correlation = pd.DataFrame(data=result).corr().to_numpy()
-        correlation = np.nan_to_num(correlation, nan=0.0)
-        # If singular, add some noise to the diagonal
-        if np.linalg.cond(correlation) > 1.0 / sys.float_info.epsilon:
-            correlation = correlation + np.identity(correlation.shape[0]) * EPSILON
-        return pd.DataFrame(correlation, index=self.columns, columns=self.columns)
     @check_valid_values
     def fit(self, X):
         """Compute the distribution for each variable and then its correlation matrix.
@@ -94,42 +80,88 @@ class GaussianMultivariate(Multivariate):
         """
         LOGGER.info('Fitting %s', self)
+        # Validate the input data
+        X = self._validate_input(X)
+        columns, univariates = self._fit_columns(X)
+        self.columns = columns
+        self.univariates = univariates
+        LOGGER.debug('Computing correlation.')
+        self.correlation = self._get_correlation(X)
+        self.fitted = True
+        LOGGER.debug('GaussianMultivariate fitted successfully')
+    def _validate_input(self, X):
+        """Validate the input data."""
         if not isinstance(X, pd.DataFrame):
             X = pd.DataFrame(X)
+        return X
+    def _fit_columns(self, X):
+        """Fit each column to its distribution."""
         columns = []
         univariates = []
         for column_name, column in X.items():
-            if isinstance(self.distribution, dict):
-                distribution = self.distribution.get(column_name, DEFAULT_DISTRIBUTION)
-            else:
-                distribution = self.distribution
+            distribution = self._get_distribution_for_column(column_name)
             LOGGER.debug('Fitting column %s to %s', column_name, distribution)
-            univariate = get_instance(distribution)
-            try:
-                univariate.fit(column)
-            except BaseException:
-                log_message = (
-                    f'Unable to fit to a {distribution} distribution for column {column_name}. '
-                    'Using a Gaussian distribution instead.'
-                )
-                LOGGER.info(log_message)
-                univariate = GaussianUnivariate()
-                univariate.fit(column)
+            univariate = self._fit_column(column, distribution, column_name)
             columns.append(column_name)
             univariates.append(univariate)
-        self.columns = columns
-        self.univariates = univariates
+        return columns, univariates
+    def _get_distribution_for_column(self, column_name):
+        """Retrieve the distribution for a given column name."""
+        if isinstance(self.distribution, dict):
+            return self.distribution.get(column_name, DEFAULT_DISTRIBUTION)
+        return self.distribution
+    def _fit_column(self, column, distribution, column_name):
+        """Fit a single column to its distribution with exception handling."""
+        univariate = get_instance(distribution)
+        try:
+            univariate.fit(column)
+        except Exception as error:
+            univariate = self._fit_with_fallback_distribution(
+                column, distribution, column_name, error
+            )
+        return univariate
+    def _fit_with_fallback_distribution(self, column, distribution, column_name, error):
+        """Fall back to fitting a Gaussian distribution and log the error."""
+        log_message = (
+            f'Unable to fit to a {distribution} distribution for column {column_name}. '
+            'Using a Gaussian distribution instead.'
+        )
+        LOGGER.info(log_message)
+        univariate = GaussianUnivariate()
+        univariate.fit(column)
+        return univariate
-        LOGGER.debug('Computing correlation')
-        self.correlation = self._get_correlation(X)
-        self.fitted = True
+    def _get_correlation(self, X):
+        """Compute correlation matrix with transformed data.
-        LOGGER.debug('GaussianMultivariate fitted successfully')
+        Args:
+            X (numpy.ndarray):
+                Data for which the correlation needs to be computed.
+        Returns:
+            numpy.ndarray:
+                computed correlation matrix.
+        """
+        result = self._transform_to_normal(X)
+        correlation = pd.DataFrame(data=result).corr().to_numpy()
+        correlation = np.nan_to_num(correlation, nan=0.0)
+        # If singular, add some noise to the diagonal
+        if np.linalg.cond(correlation) > 1.0 / sys.float_info.epsilon:
+            correlation = correlation + np.identity(correlation.shape[0]) * EPSILON
+        return pd.DataFrame(correlation, index=self.columns, columns=self.columns)
     def probability_density(self, X):
         """Compute the probability density for each point in X.
@@ -149,8 +181,7 @@ class GaussianMultivariate(Multivariate):
         self.check_fit()
         transformed = self._transform_to_normal(X)
-        return stats.multivariate_normal.pdf(
-            transformed, cov=self.correlation, allow_singular=True)
+        return stats.multivariate_normal.pdf(transformed, cov=self.correlation, allow_singular=True)
     def cumulative_distribution(self, X):
         """Compute the cumulative distribution value for each point in X.

copulas 0.10.1.dev0__py3-none-any.whl → 0.12.1.dev0__py3-none-any.whl

Potentially problematic release.

copulas 0.10.1.dev0-py3-none-any.whl → 0.12.1.dev0-py3-none-any.whl