copulas 0.10.1.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of copulas might be problematic. Click here for more details.

@@ -0,0 +1,356 @@
1
+ """VineCopula module."""
2
+
3
+ import logging
4
+ import sys
5
+ import warnings
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+ from copulas import (
11
+ EPSILON, check_valid_values, get_qualified_name, random_state, store_args,
12
+ validate_random_state)
13
+ from copulas.bivariate.base import Bivariate, CopulaTypes
14
+ from copulas.multivariate.base import Multivariate
15
+ from copulas.multivariate.tree import Tree, get_tree
16
+ from copulas.univariate.gaussian_kde import GaussianKDE
17
+
18
+ LOGGER = logging.getLogger(__name__)
19
+
20
+
21
class VineCopula(Multivariate):
    r"""Vine copula model.

    A :math:`vine` is a graphical representation of one factorization of the n-variate probability
    distribution in terms of :math:`n(n - 1)/2` bivariate copulas by means of the chain rule.

    It consists of a sequence of levels and as many levels as variables. Each level consists of
    a tree (no isolated nodes and no loops) satisfying that if it has :math:`n` nodes there must
    be :math:`n - 1` edges.

    Each node in tree :math:`T_1` is a variable and edges are couplings of variables constructed
    with bivariate copulas.

    Each node in tree :math:`T_{k+1}` is a coupling in :math:`T_{k}`, expressed by the copula
    of the variables; while edges are couplings between two vertices that must have one variable
    in common, becoming a conditioning variable in the bivariate copula. Thus, every level has
    one node less than the former. Once all the trees are drawn, the factorization is the product
    of all the nodes.

    Args:
        vine_type (str):
            type of the vine copula, could be 'center','direct','regular'
        random_state (int or np.random.RandomState):
            Random seed or RandomState to use.


    Attributes:
        model (copulas.univariate.Univariate):
            Distribution to compute univariates.
        u_matrix (numpy.array):
            Univariates.
        n_sample (int):
            Number of samples.
        n_var (int):
            Number of variables.
        columns (pandas.Series):
            Names of the variables.
        tau_mat (numpy.array):
            Kendall correlation parameters for data.
        truncated (int):
            Max level used to build the vine.
        depth (int):
            Vine depth.
        trees (list[Tree]):
            List of trees used by this vine.
        unis (list):
            Fitted ``model`` instances, one per column, in column order.
        ppfs (list[callable]):
            percent point functions from the univariates used by this vine.
    """

    @store_args
    def __init__(self, vine_type, random_state=None):
        # ``sys.version_info`` is a 5-tuple, so this is true from 3.8.0 onwards,
        # which is exactly what the warning text announces.
        if sys.version_info >= (3, 8):
            warnings.warn(
                'Vines have not been fully tested on Python >= 3.8 and might '
                'produce wrong results.'
            )

        self.random_state = validate_random_state(random_state)
        self.vine_type = vine_type
        self.u_matrix = None

        # Class (not instance) used to model every marginal in ``fit``.
        self.model = GaussianKDE

    @classmethod
    def _deserialize_trees(cls, tree_list):
        """Rebuild the list of trees from their dict representations.

        Every tree after the first one is created with a reference to the tree
        deserialized just before it.

        Args:
            tree_list (list[dict]):
                Serialized trees, in level order. Must contain at least one item.

        Returns:
            list[Tree]:
                Deserialized trees, in the same order.
        """
        previous = Tree.from_dict(tree_list[0])
        trees = [previous]

        for tree_dict in tree_list[1:]:
            tree = Tree.from_dict(tree_dict, previous)
            trees.append(tree)
            previous = tree

        return trees

    def to_dict(self):
        """Return a `dict` with the parameters to replicate this Vine.

        When the vine has not been fitted only ``type``, ``vine_type`` and
        ``fitted`` are included.

        Returns:
            dict:
                Parameters of this Vine.
        """
        result = {
            'type': get_qualified_name(self),
            'vine_type': self.vine_type,
            'fitted': self.fitted
        }

        if not self.fitted:
            return result

        result.update({
            'n_sample': self.n_sample,
            'n_var': self.n_var,
            'depth': self.depth,
            'truncated': self.truncated,
            'trees': [tree.to_dict() for tree in self.trees],
            'tau_mat': self.tau_mat.tolist(),
            'u_matrix': self.u_matrix.tolist(),
            'unis': [distribution.to_dict() for distribution in self.unis],
            # Stored exactly as kept on the instance (``X.columns`` after ``fit``).
            'columns': self.columns
        })
        return result

    @classmethod
    def from_dict(cls, vine_dict):
        """Create a new instance from a parameters dictionary.

        The ``random_state`` of the original instance is not part of the
        dictionary, so the new instance is created with the default one.

        Args:
            vine_dict (dict):
                Parameters of the Vine, in the same format as the one
                returned by the ``to_dict`` method.

        Returns:
            Vine:
                Instance of the Vine defined on the parameters.
        """
        instance = cls(vine_dict['vine_type'])
        fitted = vine_dict['fitted']
        if fitted:
            instance.fitted = fitted
            instance.n_sample = vine_dict['n_sample']
            instance.n_var = vine_dict['n_var']
            instance.truncated = vine_dict['truncated']
            instance.depth = vine_dict['depth']
            instance.trees = cls._deserialize_trees(vine_dict['trees'])
            instance.unis = [GaussianKDE.from_dict(uni) for uni in vine_dict['unis']]
            instance.ppfs = [uni.percent_point for uni in instance.unis]
            instance.columns = vine_dict['columns']
            instance.tau_mat = np.array(vine_dict['tau_mat'])
            instance.u_matrix = np.array(vine_dict['u_matrix'])

        return instance

    @check_valid_values
    def fit(self, X, truncated=3):
        """Fit a vine model to the data.

        1. Transform all the variables by means of their marginals.
           In other words, compute

           .. math:: u_i = F_i(x_i), i = 1, ..., n

           and compose the matrix :math:`u = u_1, ..., u_n,` where :math:`u_i` are their columns.

        2. Build the trees of the vine from that matrix (see ``train_vine``).

        Args:
            X (pandas.DataFrame):
                Data to be fitted to. Its ``columns`` and ``corr`` are used, so a
                plain ``numpy.ndarray`` is not enough.
            truncated (int):
                Max level to build the vine.
        """
        LOGGER.info('Fitting VineCopula("%s")', self.vine_type)
        self.n_sample, self.n_var = X.shape
        self.columns = X.columns
        self.tau_mat = X.corr(method='kendall').to_numpy()
        self.u_matrix = np.empty([self.n_sample, self.n_var])

        self.truncated = truncated
        self.depth = self.n_var - 1
        self.trees = []

        self.unis, self.ppfs = [], []
        for i, col in enumerate(X):
            uni = self.model()
            uni.fit(X[col])
            self.u_matrix[:, i] = uni.cumulative_distribution(X[col])
            self.unis.append(uni)
            self.ppfs.append(uni.percent_point)

        self.train_vine(self.vine_type)
        self.fitted = True

    def train_vine(self, tree_type):
        r"""Build the vine.

        1. For the construction of the first tree :math:`T_1`, assign one node to each variable
           and then couple them by maximizing the measure of association considered.
           Different vines impose different constraints on this construction. When those are
           applied different trees are achieved at this level.

        2. Select the copula that best fits to the pair of variables coupled by each edge in
           :math:`T_1`.

        3. Let :math:`C_{ij}(u_i , u_j )` be the copula for a given edge :math:`(u_i, u_j)`
           in :math:`T_1`. Then for every edge in :math:`T_1`, compute either

           .. math:: {v^1}_{j|i} = \frac{\partial C_{ij}(u_i, u_j)}{\partial u_j}

           or similarly :math:`{v^1}_{i|j}`, which are conditional cdfs. When finished with
           all the edges, construct the new matrix with :math:`v^1` that has one less column
           than u.

        4. Set k = 2.

        5. Assign one node of :math:`T_k` to each edge of :math:`T_{k-1}`. The structure of
           :math:`T_{k-1}` imposes a set of constraints on which edges of :math:`T_k` are
           realizable. Hence the next step is to get a linked list of the accessible nodes for
           every node in :math:`T_k`.

        6. As in step 1, nodes of :math:`T_k` are coupled maximizing the measure of association
           considered and satisfying the constraints imposed by the kind of vine employed plus
           the set of constraints imposed by tree :math:`T_{k-1}`.

        7. Select the copula that best fits each edge created in :math:`T_k`.

        8. Recompute matrix :math:`v^k` as in step 3, but taking :math:`T_k` and
           :math:`v^{k-1}` instead of :math:`T_1` and u.

        9. Set :math:`k = k + 1` and repeat from (5) until all the trees are constructed.

        Args:
            tree_type (str or TreeTypes):
                Type of trees to use.
        """
        LOGGER.debug('start building tree : 0')
        # 1
        tree_1 = get_tree(tree_type)
        tree_1.fit(0, self.n_var, self.tau_mat, self.u_matrix)
        self.trees.append(tree_1)
        LOGGER.debug('finish building tree : 0')

        # At most ``n_var - 1`` trees exist; ``truncated`` caps that number.
        for k in range(1, min(self.n_var - 1, self.truncated)):
            # get constraints from previous tree
            previous_tree = self.trees[k - 1]
            previous_tree._get_constraints()
            tau = previous_tree.get_tau_matrix()
            LOGGER.debug('start building tree: %s', k)
            tree_k = get_tree(tree_type)
            tree_k.fit(k, self.n_var - k, tau, previous_tree)
            self.trees.append(tree_k)
            LOGGER.debug('finish building tree: %s', k)

    def get_likelihood(self, uni_matrix):
        """Compute likelihood of the vine.

        Each tree receives the matrix returned by the previous tree, the first
        one receiving ``uni_matrix``.

        Args:
            uni_matrix (numpy.array):
                Matrix passed to the first tree.

        Returns:
            Sum of the values returned by every tree.
        """
        num_tree = len(self.trees)
        values = np.empty([1, num_tree])

        for i, tree in enumerate(self.trees):
            value, uni_matrix = tree.get_likelihood(uni_matrix)
            values[0, i] = value

        return np.sum(values)

    def _sample_row(self):
        """Generate a single sampled row from vine model.

        The first tree is walked depth-first from a randomly chosen node. The
        first node is sampled from its marginal alone; every later node starts
        from its own uniform draw, which is pushed through the inverse of every
        matching bivariate copula found in the available trees (highest level
        first) before being mapped back with the node's marginal ppf.

        Randomness comes from the global ``np.random`` state, which ``sample``
        configures through the ``random_state`` decorator.

        Returns:
            numpy.ndarray
        """
        unis = np.random.uniform(0, 1, self.n_var)
        # randomly select a node to start with
        first_ind = np.random.randint(0, self.n_var)
        adj = self.trees[0].get_adjacent_matrix()
        visited = []  # visit order, most recent node last
        explore = [first_ind]  # used as a stack, top at the end

        sampled = np.zeros(self.n_var)
        itr = 0
        while explore:
            current = explore.pop()
            adj_is_one = adj[current, :] == 1
            neighbors = np.where(adj_is_one)[0].tolist()
            if itr == 0:
                new_x = self.ppfs[current](unis[current])

            else:
                # Always start from this node's own uniform draw. Previously ``tmp``
                # was only initialised when level ``itr - 1`` had a matching edge, so
                # whenever that level was skipped (truncation) or had no match, the
                # value left over from the previous node was reused, or the name was
                # unbound on the first such node.
                tmp = unis[current]
                last_visited = visited[-1]
                visit_set = set(visited)
                visit_set.add(current)  # noqa: PD005

                for i in range(itr - 1, -1, -1):
                    # Levels beyond the truncation were never built.
                    if i >= self.truncated:
                        continue

                    current_ind = -1
                    current_tree = self.trees[i].edges
                    # get index of edge to retrieve
                    for edge in current_tree:
                        if i == 0:
                            if (edge.L == current and edge.R == last_visited) or \
                                    (edge.R == current and edge.L == last_visited):
                                current_ind = edge.index
                                break
                        elif edge.L == current or edge.R == current:
                            condition = set(edge.D)
                            condition.add(edge.L)  # noqa: PD005
                            condition.add(edge.R)  # noqa: PD005

                            if condition.issubset(visit_set):
                                current_ind = edge.index

                            # NOTE(review): the search stops at the first edge touching
                            # ``current`` even when its variables are not all visited
                            # yet -- confirm this is the intended nesting of ``break``.
                            break

                    if current_ind != -1:
                        # the node is not independent conditional on visited node
                        copula_type = current_tree[current_ind].name
                        copula = Bivariate(copula_type=CopulaTypes(copula_type))
                        copula.theta = current_tree[current_ind].theta

                        U = np.array([unis[last_visited]])
                        tmp = copula.percent_point(np.array([tmp]), U)[0]

                        # Keep the value away from the edges of the unit interval.
                        tmp = min(max(tmp, EPSILON), 0.99)

                new_x = self.ppfs[current](np.array([tmp]))

            sampled[current] = new_x

            for s in neighbors:
                if s not in visited:
                    explore.append(s)

            itr += 1
            visited.append(current)

        return sampled

    @random_state
    def sample(self, num_rows):
        """Sample new rows.

        Args:
            num_rows (int):
                Number of rows to sample

        Returns:
            pandas.DataFrame:
                sampled rows.
        """
        sampled_values = [self._sample_row() for _ in range(num_rows)]

        return pd.DataFrame(sampled_values, columns=self.columns)
@@ -0,0 +1,153 @@
1
+ """Copulas optimization functions."""
2
+
3
+ import numpy as np
4
+
5
+
6
def bisect(f, xmin, xmax, tol=1e-8, maxiter=50):
    """Bisection method for finding roots.

    This method implements a simple vectorized routine for identifying
    the root (of a monotonically increasing function) given a bracketing
    interval.

    The bounds passed in are never modified: the bracket is tracked in new
    arrays, so the caller's ``xmin`` and ``xmax`` keep their values. Scalar
    bounds are accepted as well.

    Arguments:
        f (Callable):
            A function which takes as input a vector x and returns a
            vector with the same number of dimensions.
        xmin (np.ndarray):
            The minimum value for x such that f(x) <= 0.
        xmax (np.ndarray):
            The maximum value for x such that f(x) >= 0.
        tol (float):
            Stop as soon as the widest bracket is narrower than this.
        maxiter (int):
            Maximum number of halvings to perform.

    Returns:
        numpy.ndarray:
            The value of x such that f(x) is close to 0.

    Raises:
        AssertionError:
            If the interval does not bracket a root for every element.
    """
    lower = np.asarray(xmin)
    upper = np.asarray(xmax)

    assert np.all(f(lower) <= 0.0)
    assert np.all(f(upper) >= 0.0)

    for _ in range(maxiter):
        guess = (lower + upper) / 2.0
        fguess = f(guess)

        # ``np.where`` builds new arrays, leaving the caller's bounds untouched.
        # When f(guess) == 0 both ends move onto the guess.
        lower = np.where(fguess <= 0, guess, lower)
        upper = np.where(fguess >= 0, guess, upper)

        if np.max(upper - lower) < tol:
            break

    return (lower + upper) / 2.0
38
+
39
+
40
def chandrupatla(f, xmin, xmax, eps_m=None, eps_a=None, maxiter=50):
    """Chandrupatla's algorithm.

    This is adapted from [1] which implements Chandrupatla's algorithm [2]
    which starts from a bracketing interval and, conditionally, swaps between
    bisection and inverse quadratic interpolation.

    Each element's estimate is frozen at the iteration in which that element
    first meets the stopping criterion, so elements that converge early are
    not disturbed while the remaining ones keep iterating.

    [1] https://github.com/scipy/scipy/issues/7242#issuecomment-290548427
    [2] https://books.google.com/books?id=cC-8BAAAQBAJ&pg=PA95

    Arguments:
        f (Callable):
            A function which takes as input a vector x and returns a
            vector with the same number of dimensions.
        xmin (np.ndarray):
            The minimum value for x such that f(x) <= 0.
        xmax (np.ndarray):
            The maximum value for x such that f(x) >= 0.
        eps_m (float):
            Relative tolerance; defaults to the machine epsilon.
        eps_a (float):
            Absolute tolerance; defaults to twice the machine epsilon.
        maxiter (int):
            Maximum number of iterations. With a value of zero or less the
            bracket end with the smallest ``|f|`` is returned.

    Returns:
        numpy.ndarray:
            The value of x such that f(x) is close to 0.

    Raises:
        AssertionError:
            If f(xmin) and f(xmax) have different shapes, or if the interval
            does not bracket a root for every element.
    """
    # Initialization
    a = xmax
    b = xmin
    fa = f(a)
    fb = f(b)

    # Make sure we know the size of the result
    shape = np.shape(fa)
    assert shape == np.shape(fb)

    fc = fa
    c = a

    # Make sure we are bracketing a root in each case
    assert (np.sign(fa) * np.sign(fb) <= 0).all()

    # jms: some guesses for default values of the eps_m and eps_a settings
    # based on machine precision... not sure exactly what to do here
    eps = np.finfo(float).eps
    if eps_m is None:
        eps_m = eps
    if eps_a is None:
        eps_a = 2 * eps

    t = 0.5
    # Elements that have already met the stopping criterion.
    done = np.zeros(shape, dtype=bool)
    # Best estimate so far; defined up front so ``maxiter <= 0`` has a result.
    root = np.choose(np.abs(fa) < np.abs(fb), [b, a])

    remaining = maxiter
    while remaining > 0:
        remaining -= 1

        # use t to linearly interpolate between a and b,
        # and evaluate this function as our newest estimate xt
        xt = np.clip(a + t * (b - a), xmin, xmax)
        ft = f(xt)

        # update our history of the last few points so that
        # - a is the newest estimate (we're going to update it from xt)
        # - c and b get the preceding two estimates
        # - a and b maintain opposite signs for f(a) and f(b)
        samesign = np.sign(ft) == np.sign(fa)
        c = np.choose(samesign, [b, a])
        b = np.choose(samesign, [a, b])
        fc = np.choose(samesign, [fb, fa])
        fb = np.choose(samesign, [fa, fb])
        a = xt
        fa = ft

        # set xm so that f(xm) is the minimum magnitude of f(a) and f(b)
        fa_is_smaller = np.abs(fa) < np.abs(fb)
        xm = np.choose(fa_is_smaller, [b, a])
        fm = np.choose(fa_is_smaller, [fb, fa])

        # Only elements still running take the new estimate; converged ones keep
        # the value they had when they finished.
        root = np.choose(done, [xm, root])

        # A collapsed bracket (b == c) yields tlim == inf, which is a legitimate
        # stop signal, so the division warning is silenced on purpose.
        with np.errstate(divide='ignore', invalid='ignore'):
            tol = 2 * eps_m * np.abs(xm) + eps_a
            tlim = tol / np.abs(b - c)

        done = np.logical_or(done, np.logical_or(fm == 0, tlim > 0.5))
        if np.all(done):
            break

        with np.errstate(divide='ignore', invalid='ignore'):
            # Figure out values xi and phi
            # to determine which method we should use next
            xi = (a - b) / (c - b)
            phi = (fa - fb) / (fc - fb)
            iqi = np.logical_and(phi**2 < xi, (1 - phi)**2 < 1 - xi)

            if not shape:
                # scalar case
                if iqi:
                    # inverse quadratic interpolation
                    eq1 = fa / (fb - fa) * fc / (fb - fc)
                    eq2 = (c - a) / (b - a) * fa / (fc - fa) * fb / (fc - fb)
                    t = eq1 + eq2
                else:
                    # bisection
                    t = 0.5
            else:
                # array case: interpolate only where it is allowed and still needed
                iqi = np.logical_and(iqi, np.logical_not(done))
                t = np.full(shape, 0.5)
                a2, b2, c2, fa2, fb2, fc2 = a[iqi], b[iqi], c[iqi], fa[iqi], fb[iqi], fc[iqi]
                t[iqi] = fa2 / (fb2 - fa2) * fc2 / (fb2 - fc2) + (c2 - a2) / \
                    (b2 - a2) * fa2 / (fc2 - fa2) * fb2 / (fc2 - fb2)

            # limit to the range (tlim, 1-tlim)
            t = np.minimum(1 - tlim, np.maximum(tlim, t))

        if shape:
            # Converged elements just idle on plain bisection steps, so f is never
            # evaluated at the +-inf / NaN points an infinite tlim would produce.
            t = np.where(done, 0.5, t)

    # done!
    return root
@@ -0,0 +1,25 @@
1
+ """Univariate copulas module."""
2
+
3
+ from copulas.univariate.base import BoundedType, ParametricType, Univariate
4
+ from copulas.univariate.beta import BetaUnivariate
5
+ from copulas.univariate.gamma import GammaUnivariate
6
+ from copulas.univariate.gaussian import GaussianUnivariate
7
+ from copulas.univariate.gaussian_kde import GaussianKDE
8
+ from copulas.univariate.log_laplace import LogLaplace
9
+ from copulas.univariate.student_t import StudentTUnivariate
10
+ from copulas.univariate.truncated_gaussian import TruncatedGaussian
11
+ from copulas.univariate.uniform import UniformUnivariate
12
+
13
# Public names of the ``copulas.univariate`` package, i.e. what a star-import
# of the package exposes. The order is kept as originally declared.
__all__ = (
    'BetaUnivariate',
    'GammaUnivariate',
    'GaussianKDE',
    'GaussianUnivariate',
    'TruncatedGaussian',
    'StudentTUnivariate',
    'Univariate',
    'ParametricType',
    'BoundedType',
    'UniformUnivariate',
    'LogLaplace',
)