copulas 0.10.1.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,169 @@
+ """Frank module."""
+
+ import sys
+
+ import numpy as np
+ import scipy.integrate as integrate
+ from scipy.optimize import least_squares
+
+ from copulas import EPSILON
+ from copulas.bivariate.base import Bivariate, CopulaTypes
+ from copulas.bivariate.utils import split_matrix
+
+ MIN_FLOAT_LOG = np.log(sys.float_info.min)
+ MAX_FLOAT_LOG = np.log(sys.float_info.max)
+
+
+ class Frank(Bivariate):
+     """Class for Frank copula model."""
+
+     copula_type = CopulaTypes.FRANK
+     theta_interval = [-float('inf'), float('inf')]
+     invalid_thetas = [0]
+
+     def generator(self, t):
+         """Return the generator function."""
+         a = (np.exp(-self.theta * t) - 1) / (np.exp(-self.theta) - 1)
+         return -np.log(a)
+
+     def _g(self, z):
+         r"""Assist in solving the Frank copula.
+
+         This function encapsulates :math:`g(z) = e^{-\theta z} - 1`, which is used in the
+         Frank copula.
+
+         Args:
+             z: np.ndarray
+
+         Returns:
+             np.ndarray
+
+         """
+         return np.exp(-self.theta * z) - 1
+
+     def probability_density(self, X):
+         r"""Compute probability density function for given copula family.
+
+         The probability density function (PDF) for the Frank family of copulas
+         corresponds to the formula:
+
+         .. math:: c(U,V) = \frac{\partial^2 C(u,v)}{\partial v \partial u} =
+             \frac{-\theta g(1)(1 + g(u + v))}{(g(u) g(v) + g(1)) ^ 2}
+
+         where the function :math:`g` is defined by:
+
+         .. math:: g(x) = e^{-\theta x} - 1
+
+         Args:
+             X: `np.ndarray`
+
+         Returns:
+             np.array: probability density
+
+         """
+         self.check_fit()
+
+         U, V = split_matrix(X)
+
+         if self.theta == 0:
+             return U * V
+
+         else:
+             num = (-self.theta * self._g(1)) * (1 + self._g(U + V))
+             aux = self._g(U) * self._g(V) + self._g(1)
+             den = np.power(aux, 2)
+             return num / den
+
+     def cumulative_distribution(self, X):
+         r"""Compute the cumulative distribution function for the Frank copula.
+
+         The cumulative distribution function (CDF) for the Frank family of copulas
+         corresponds to the formula:
+
+         .. math:: C(u,v) = -\frac{\ln({\frac{1 + g(u) g(v)}{g(1)}})}{\theta}
+
+         Args:
+             X: `np.ndarray`
+
+         Returns:
+             np.array: cumulative distribution
+
+         """
+         self.check_fit()
+
+         U, V = split_matrix(X)
+
+         num = (np.exp(-self.theta * U) - 1) * (np.exp(-self.theta * V) - 1)
+         den = np.exp(-self.theta) - 1
+
+         return -1.0 / self.theta * np.log(1 + num / den)
+
+     def percent_point(self, y, V):
+         """Compute the inverse of conditional cumulative distribution :math:`C(u|v)^{-1}`.
+
+         Args:
+             y: `np.ndarray` value of :math:`C(u|v)`.
+             V: `np.ndarray` given value of v.
+         """
+         self.check_fit()
+
+         if self.theta == 0:
+             return V
+
+         else:
+             return super().percent_point(y, V)
+
+     def partial_derivative(self, X):
+         r"""Compute partial derivative of cumulative distribution.
+
+         The partial derivative of the copula (CDF) is the conditional CDF.
+
+         .. math:: F(v|u) = \frac{\partial}{\partial u}C(u,v) =
+             \frac{g(u)g(v) + g(v)}{g(u)g(v) + g(1)}
+
+         Args:
+             X (np.ndarray)
+
+         Returns:
+             np.ndarray
+
+         """
+         self.check_fit()
+
+         U, V = split_matrix(X)
+
+         if self.theta == 0:
+             return V
+
+         else:
+             num = self._g(U) * self._g(V) + self._g(U)
+             den = self._g(U) * self._g(V) + self._g(1)
+             return num / den
+
+     def compute_theta(self):
+         r"""Compute theta parameter using Kendall's tau.
+
+         For the Frank copula, the relationship between tau and theta is defined by:
+
+         .. math:: \tau = 1 - \frac{4}{\theta} + \frac{4}{\theta^2}\int_0^\theta \!
+             \frac{t}{e^t -1} \mathrm{d}t.
+
+         In order to solve it, we can simplify it as
+
+         .. math:: 0 = 1 + \frac{4}{\theta}(D_1(\theta) - 1) - \tau
+
+         where :math:`D_1` is the first-order Debye function, defined as:
+
+         .. math:: D_1(x) = \frac{1}{x}\int_0^x\frac{t}{e^t -1} \mathrm{d}t.
+
+         """
+         result = least_squares(self._tau_to_theta, 1, bounds=(MIN_FLOAT_LOG, MAX_FLOAT_LOG))
+         return result.x[0]
+
+     def _tau_to_theta(self, alpha):
+         """Relationship between tau and theta as a solvable equation."""
+         def debye(t):
+             return t / (np.exp(t) - 1)
+
+         debye_value = integrate.quad(debye, EPSILON, alpha)[0] / alpha
+         return 4 * (debye_value - 1) / alpha + 1 - self.tau
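
For reference, the tau-to-theta relation used by compute_theta and _tau_to_theta above can be reproduced outside the class. The following is a standalone sketch, not part of the package: it uses scipy.optimize.brentq instead of the module's least_squares call, an arbitrary example value tau = 0.5, and illustrative helper names:

    import numpy as np
    import scipy.integrate as integrate
    from scipy.optimize import brentq


    def debye_1(theta):
        """First-order Debye function D1(theta) = (1 / theta) * int_0^theta t / (e^t - 1) dt."""
        return integrate.quad(lambda t: t / np.expm1(t), 1e-9, theta)[0] / theta


    def frank_tau_residual(theta, tau):
        """Residual of the equation 0 = 1 + (4 / theta) * (D1(theta) - 1) - tau."""
        return 4 * (debye_1(theta) - 1) / theta + 1 - tau


    tau = 0.5  # example Kendall's tau
    theta = brentq(frank_tau_residual, 1e-6, 100, args=(tau,))
    print(theta)  # approximately 5.74 for tau = 0.5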
@@ -0,0 +1,144 @@
+ """Gumbel module."""
+
+ import numpy as np
+
+ from copulas.bivariate.base import Bivariate, CopulaTypes
+ from copulas.bivariate.utils import split_matrix
+
+
+ class Gumbel(Bivariate):
+     """Class for Gumbel copula model."""
+
+     copula_type = CopulaTypes.GUMBEL
+     theta_interval = [1, float('inf')]
+     invalid_thetas = []
+
+     def generator(self, t):
+         """Return the generator function."""
+         return np.power(-np.log(t), self.theta)
+
+     def probability_density(self, X):
+         r"""Compute probability density function for given copula family.
+
+         The probability density function (PDF) for the Gumbel family of copulas
+         corresponds to the formula:
+
+         .. math::
+
+             \begin{align}
+                 c(U,V)
+                     &= \frac{\partial^2 C(u,v)}{\partial v \partial u}
+                     &= \frac{C(u,v)}{uv} \frac{((-\ln u)^{\theta} # noqa: JS101
+                        + (-\ln v)^{\theta})^{\frac{2} # noqa: JS101
+                        {\theta} - 2 }}{(\ln u \ln v)^{1 - \theta}} # noqa: JS101
+                        ( 1 + (\theta-1) \big((-\ln u)^\theta
+                        + (-\ln v)^\theta\big)^{-1/\theta})
+             \end{align}
+
+         Args:
+             X (numpy.ndarray)
+
+         Returns:
+             numpy.ndarray
+
+         """
+         self.check_fit()
+
+         U, V = split_matrix(X)
+
+         if self.theta == 1:
+             return U * V
+
+         else:
+             a = np.power(U * V, -1)
+             tmp = np.power(-np.log(U), self.theta) + np.power(-np.log(V), self.theta)
+             b = np.power(tmp, -2 + 2.0 / self.theta)
+             c = np.power(np.log(U) * np.log(V), self.theta - 1)
+             d = 1 + (self.theta - 1) * np.power(tmp, -1.0 / self.theta)
+             return self.cumulative_distribution(X) * a * b * c * d
+
+     def cumulative_distribution(self, X):
+         r"""Compute the cumulative distribution function for the Gumbel copula.
+
+         The cumulative distribution function (CDF) for the Gumbel family of copulas
+         corresponds to the formula:
+
+         .. math:: C(u,v) = e^{-((-\ln u)^{\theta} + (-\ln v)^{\theta})^{\frac{1}{\theta}}}
+
+         Args:
+             X (np.ndarray)
+
+         Returns:
+             np.ndarray: cumulative probability for the given datapoints, cdf(X).
+
+         """
+         self.check_fit()
+
+         U, V = split_matrix(X)
+
+         if self.theta == 1:
+             return U * V
+
+         else:
+             h = np.power(-np.log(U), self.theta) + np.power(-np.log(V), self.theta)
+             h = -np.power(h, 1.0 / self.theta)
+             cdfs = np.exp(h)
+             return cdfs
+
+     def percent_point(self, y, V):
+         """Compute the inverse of conditional cumulative distribution :math:`C(u|v)^{-1}`.
+
+         Args:
+             y (np.ndarray): value of :math:`C(u|v)`.
+             V (np.ndarray): given value of v.
+
+         """
+         self.check_fit()
+
+         if self.theta == 1:
+             return y
+
+         else:
+             return super().percent_point(y, V)
+
+     def partial_derivative(self, X):
+         r"""Compute partial derivative of cumulative distribution.
+
+         The partial derivative of the copula (CDF) is the conditional CDF.
+
+         .. math:: F(v|u) = \frac{\partial C(u,v)}{\partial u} =
+             C(u,v)\frac{((-\ln u)^{\theta} + (-\ln v)^{\theta})^{\frac{1}{\theta} - 1}}
+             {\theta(- \ln u)^{1 -\theta}}
+
+         Args:
+             X (np.ndarray)
+
+         Returns:
+             numpy.ndarray
+
+         """
+         self.check_fit()
+
+         U, V = split_matrix(X)
+
+         if self.theta == 1:
+             return V
+
+         else:
+             t1 = np.power(-np.log(U), self.theta)
+             t2 = np.power(-np.log(V), self.theta)
+             p1 = self.cumulative_distribution(X)
+             p2 = np.power(t1 + t2, -1 + 1.0 / self.theta)
+             p3 = np.power(-np.log(V), self.theta - 1)
+             return p1 * p2 * p3 / V
+
+     def compute_theta(self):
+         r"""Compute theta parameter using Kendall's tau.
+
+         For the Gumbel copula, :math:`\tau` is defined as
+         :math:`\tau = \frac{\theta - 1}{\theta}`, which we solve as
+         :math:`\theta = \frac{1}{1 - \tau}`.
+         """
+         if self.tau == 1:
+             raise ValueError("Tau value can't be 1")
+
+         return 1 / (1 - self.tau)
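
Since compute_theta for the Gumbel copula is a closed-form inversion of Kendall's tau, it can be reproduced directly from data. The sketch below is standalone and illustrative only (the toy data, seed and variable names are not part of the package):

    import numpy as np
    from scipy.stats import kendalltau

    rng = np.random.default_rng(0)
    u = rng.uniform(size=500)
    v = np.clip(u + rng.normal(scale=0.1, size=500), 0, 1)  # positively dependent toy data

    tau, _ = kendalltau(u, v)   # sample Kendall's tau
    theta = 1 / (1 - tau)       # Gumbel parameter, valid for tau in [0, 1)
    print(tau, theta)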
@@ -0,0 +1,81 @@
+ """Independence module."""
+
+ import numpy as np
+
+ from copulas.bivariate.base import Bivariate, CopulaTypes
+ from copulas.bivariate.utils import split_matrix
+
+
+ class Independence(Bivariate):
+     """This class represents the copula for two independent variables."""
+
+     copula_type = CopulaTypes.INDEPENDENCE
+
+     def fit(self, X):
+         """Fit the copula to the given data.
+
+         Args:
+             X (numpy.array): Probabilities in a matrix shaped (n, 2)
+
+         Returns:
+             None
+
+         """
+
+     def generator(self, t):
+         """Compute the generator function for the copula.
+
+         The generator function is a function f(t), such that an Archimedean copula can be
+         defined as
+
+         C(u1, ..., uN) = f(f^-1(u1), ..., f^-1(uN)).
+
+         Args:
+             t (numpy.array)
+
+         Returns:
+             np.array
+
+         """
+         return np.log(t)
+
+     def probability_density(self, X):
+         """Compute the probability density for the independence copula."""
+         return np.all((0.0 <= X) & (X <= 1.0), axis=1).astype(float)
+
+     def cumulative_distribution(self, X):
+         """Compute the cumulative distribution of the independence bivariate copula, which is the product of its inputs.
+
+         Args:
+             X (numpy.array): Matrix of shape (n, 2), whose values are in [0, 1]
+
+         Returns:
+             numpy.array: Cumulative distribution values of the given input.
+
+         """
+         U, V = split_matrix(X)
+         return U * V
+
+     def partial_derivative(self, X):
+         """Compute the conditional probability of one event conditioned on the other.
+
+         In the case of the independence copula, since C(u,v) = u*v, we have that
+         F(u|v) = dC/du = v.
+
+         Args:
+             X (numpy.array)
+
+         """
+         _, V = split_matrix(X)
+         return V
+
+     def percent_point(self, y, V):
+         """Compute the inverse of conditional cumulative distribution :math:`F(u|v)^{-1}`.
+
+         Args:
+             y: `np.ndarray` value of :math:`F(u|v)`.
+             V: `np.ndarray` given value of v.
+
+         """
+         self.check_fit()
+         return y
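
As a quick sanity check of the independence copula above: the CDF is the product of its inputs, the PDF is an indicator on the unit square, and the conditional CDF ignores the first column. A plain-numpy sketch mirroring those methods (the sample array is illustrative only):

    import numpy as np

    X = np.array([[0.2, 0.7],
                  [0.5, 0.5],
                  [0.9, 0.1]])
    U, V = X[:, 0], X[:, 1]  # same split as split_matrix(X)

    print(U * V)                                                  # cumulative_distribution: C(u, v) = u * v
    print(np.all((0.0 <= X) & (X <= 1.0), axis=1).astype(float))  # probability_density: 1.0 inside the unit square
    print(V)                                                      # partial_derivative: the conditional CDF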
@@ -0,0 +1,19 @@
+ """Utilities for bivariate copulas."""
+
+ import numpy as np
+
+
+ def split_matrix(X):
+     """Split an (n, 2) numpy.array into two vectors.
+
+     Args:
+         X (numpy.array): Matrix of shape (n, 2)
+
+     Returns:
+         tuple[numpy.array]: Both of shape (n,)
+
+     """
+     if len(X):
+         return X[:, 0], X[:, 1]
+
+     return np.array([]), np.array([])
copulas/datasets.py ADDED
@@ -0,0 +1,221 @@
+ """Sample datasets for the Copulas library."""
+
+ import numpy as np
+ import pandas as pd
+ from scipy import stats
+
+ from copulas import set_random_state, validate_random_state
+
+
+ def _dummy_fn(state):
+     pass
+
+
+ def sample_bivariate_age_income(size=1000, seed=42):
+     """Sample from a bivariate toy dataset.
+
+     This dataset contains two columns, corresponding to simulated age and
+     income, which are positively correlated and contain outliers.
+
+     Args:
+         size (int):
+             Number of samples to generate. Defaults to 1000.
+         seed (int):
+             Random seed to use. Defaults to 42.
+
+     Returns:
+         pandas.DataFrame:
+             DataFrame with two columns, ``age`` and ``income``.
+     """
+     with set_random_state(validate_random_state(seed), _dummy_fn):
+         age = stats.beta.rvs(a=2.0, b=6.0, loc=18, scale=100, size=size)
+         income = np.log(age) * 100
+         income += np.random.normal(loc=np.log(age) / 100, scale=10, size=size)
+         income[np.random.randint(0, 10, size=size) == 0] /= 1000
+
+     return pd.DataFrame({
+         'age': age,
+         'income': income
+     })
+
+
+ def sample_trivariate_xyz(size=1000, seed=42):
+     """Sample from a three-dimensional toy dataset.
+
+     The output is a DataFrame containing three columns:
+
+     * ``x``: Beta distribution with a=0.1 and b=0.1
+     * ``y``: Beta distribution with a=0.1 and b=0.5
+     * ``z``: Normal distribution + 10 times ``y``
+
+     Args:
+         size (int):
+             Number of samples to generate. Defaults to 1000.
+         seed (int):
+             Random seed to use. Defaults to 42.
+
+     Returns:
+         pandas.DataFrame:
+             DataFrame with three columns, ``x``, ``y`` and ``z``.
+     """
+     with set_random_state(validate_random_state(seed), _dummy_fn):
+         x = stats.beta.rvs(a=0.1, b=0.1, size=size)
+         y = stats.beta.rvs(a=0.1, b=0.5, size=size)
+         return pd.DataFrame({
+             'x': x,
+             'y': y,
+             'z': np.random.normal(size=size) + y * 10
+         })
+
+
+ def sample_univariate_bernoulli(size=1000, seed=42):
+     """Sample from a Bernoulli distribution with p=0.3.
+
+     The distribution is built by sampling a uniform random value and then setting
+     1 or 0 depending on whether the value is below or above 0.3.
+
+     Args:
+         size (int):
+             Number of samples to generate. Defaults to 1000.
+         seed (int):
+             Random seed to use. Defaults to 42.
+
+     Returns:
+         pandas.Series:
+             Series with the sampled values.
+     """
+     with set_random_state(validate_random_state(seed), _dummy_fn):
+         return pd.Series(np.random.random(size=size) < 0.3).astype(float)
+
+
+ def sample_univariate_bimodal(size=1000, seed=42):
+     """Sample from a bimodal distribution which mixes two Gaussians at 0.0 and 10.0 with stdev=1.
+
+     The distribution is built by sampling a standard normal and a normal with mean ``10``
+     and then selecting one or the other based on a Bernoulli distribution.
+
+     Args:
+         size (int):
+             Number of samples to generate. Defaults to 1000.
+         seed (int):
+             Random seed to use. Defaults to 42.
+
+     Returns:
+         pandas.Series:
+             Series with the sampled values.
+     """
+     with set_random_state(validate_random_state(seed), _dummy_fn):
+         bernoulli = sample_univariate_bernoulli(size, seed)
+         mode1 = np.random.normal(size=size) * bernoulli
+         mode2 = np.random.normal(size=size, loc=10) * (1.0 - bernoulli)
+
+     return pd.Series(mode1 + mode2)
+
+
+ def sample_univariate_uniform(size=1000, seed=42):
+     """Sample from a uniform distribution in [-1.0, 3.0].
+
+     Args:
+         size (int):
+             Number of samples to generate. Defaults to 1000.
+         seed (int):
+             Random seed to use. Defaults to 42.
+
+     Returns:
+         pandas.Series:
+             Series with the sampled values.
+     """
+     with set_random_state(validate_random_state(seed), _dummy_fn):
+         return pd.Series(4.0 * np.random.random(size=size) - 1.0)
+
+
+ def sample_univariate_normal(size=1000, seed=42):
+     """Sample from a normal distribution with mean 1 and stdev 1.
+
+     Args:
+         size (int):
+             Number of samples to generate. Defaults to 1000.
+         seed (int):
+             Random seed to use. Defaults to 42.
+
+     Returns:
+         pandas.Series:
+             Series with the sampled values.
+     """
+     with set_random_state(validate_random_state(seed), _dummy_fn):
+         return pd.Series(np.random.normal(size=size, loc=1.0))
+
+
+ def sample_univariate_degenerate(size=1000, seed=42):
+     """Sample from a degenerate distribution that only takes one random value.
+
+     Args:
+         size (int):
+             Number of samples to generate. Defaults to 1000.
+         seed (int):
+             Random seed to use. Defaults to 42.
+
+     Returns:
+         pandas.Series:
+             Series with the sampled values.
+     """
+     with set_random_state(validate_random_state(seed), _dummy_fn):
+         return pd.Series(np.full(size, np.random.random()))
+
+
+ def sample_univariate_exponential(size=1000, seed=42):
+     """Sample from an exponential distribution with rate 1.0, shifted to start at 3.0.
+
+     Args:
+         size (int):
+             Number of samples to generate. Defaults to 1000.
+         seed (int):
+             Random seed to use. Defaults to 42.
+
+     Returns:
+         pandas.Series:
+             Series with the sampled values.
+     """
+     with set_random_state(validate_random_state(seed), _dummy_fn):
+         return pd.Series(np.random.exponential(size=size) + 3.0)
+
+
+ def sample_univariate_beta(size=1000, seed=42):
+     """Sample from a beta distribution with a=3, b=1 and loc=4.
+
+     Args:
+         size (int):
+             Number of samples to generate. Defaults to 1000.
+         seed (int):
+             Random seed to use. Defaults to 42.
+
+     Returns:
+         pandas.Series:
+             Series with the sampled values.
+     """
+     with set_random_state(validate_random_state(seed), _dummy_fn):
+         return pd.Series(stats.beta.rvs(a=3, b=1, loc=4, size=size))
+
+
+ def sample_univariates(size=1000, seed=42):
+     """Sample from a list of univariate distributions.
+
+     Args:
+         size (int):
+             Number of samples to generate. Defaults to 1000.
+         seed (int):
+             Random seed to use. Defaults to 42.
+
+     Returns:
+         pandas.DataFrame:
+             DataFrame with the sampled distributions.
+     """
+     return pd.DataFrame({
+         'bernoulli': sample_univariate_bernoulli(size, seed),
+         'bimodal': sample_univariate_bimodal(size, seed),
+         'uniform': sample_univariate_uniform(size, seed),
+         'normal': sample_univariate_normal(size, seed),
+         'degenerate': sample_univariate_degenerate(size, seed),
+         'exponential': sample_univariate_exponential(size, seed),
+         'beta': sample_univariate_beta(size, seed),
+     })
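
The sampling helpers above are primarily meant as toy inputs for the copula models. A minimal usage sketch (the function names come from the file above; the size and seed values are arbitrary examples):

    from copulas.datasets import sample_bivariate_age_income, sample_univariates

    age_income = sample_bivariate_age_income()  # DataFrame with 'age' and 'income' columns
    toy = sample_univariates(size=500, seed=0)  # one column per univariate toy distribution

    print(age_income.describe())
    print(toy.columns.tolist())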
@@ -0,0 +1,14 @@
+ """Multivariate copulas module."""
+
+ from copulas.multivariate.base import Multivariate
+ from copulas.multivariate.gaussian import GaussianMultivariate
+ from copulas.multivariate.tree import Tree, TreeTypes
+ from copulas.multivariate.vine import VineCopula
+
+ __all__ = (
+     'Multivariate',
+     'GaussianMultivariate',
+     'VineCopula',
+     'Tree',
+     'TreeTypes'
+ )
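
The exported multivariate models are typically used together with the datasets module above. A minimal end-to-end sketch, assuming the usual fit/sample interface of GaussianMultivariate in the copulas library (that class itself is not part of this diff):

    from copulas.datasets import sample_trivariate_xyz
    from copulas.multivariate import GaussianMultivariate

    data = sample_trivariate_xyz()      # toy data with columns 'x', 'y', 'z'
    model = GaussianMultivariate()
    model.fit(data)                     # fits univariate marginals plus a Gaussian copula
    synthetic = model.sample(100)       # 100 synthetic rows with the same columns
    print(synthetic.head())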