likelihood 2.2.0.dev1__cp311-cp311-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
likelihood/graph/nn.py ADDED
@@ -0,0 +1,329 @@
+ import logging
+ import os
+
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+ logging.getLogger("tensorflow").setLevel(logging.ERROR)
+
+ from multiprocessing import Pool, cpu_count
+ from typing import List, Tuple
+
+ import numpy as np
+ import pandas as pd
+ import tensorflow as tf
+ from IPython.display import clear_output
+ from sklearn.metrics import f1_score
+
+ tf.get_logger().setLevel("ERROR")
+
+ from likelihood import rust_py_integration
+ from likelihood.tools import LoRALayer
+
+
22
+ def compare_similarity_np(arr1: np.ndarray, arr2: np.ndarray, threshold: float = 0.05) -> int:
+     """Vectorized similarity comparison between two numeric/categorical arrays.
+
+     Returns the number of positions at which the arrays are considered
+     similar: numeric entries are similar when both are zero or when their
+     max/min ratio is at most ``1 + threshold``; all other entries must
+     compare equal.
+     """
+     arr1 = np.asarray(arr1)
+     arr2 = np.asarray(arr2)
+
+     is_numeric = np.vectorize(
+         lambda a, b: isinstance(a, (int, float)) and isinstance(b, (int, float))
+     )(arr1, arr2)
+
+     similarity = np.zeros_like(arr1, dtype=bool)
+
+     if np.any(is_numeric):
+         a_num = arr1[is_numeric].astype(float)
+         b_num = arr2[is_numeric].astype(float)
+
+         both_zero = (a_num == 0) & (b_num == 0)
+         nonzero = ~both_zero & (a_num != 0) & (b_num != 0)
+         ratio = np.zeros_like(a_num)
+         ratio[nonzero] = np.maximum(a_num[nonzero], b_num[nonzero]) / np.minimum(
+             a_num[nonzero], b_num[nonzero]
+         )
+         numeric_similar = both_zero | ((1 - threshold <= ratio) & (ratio <= 1 + threshold))
+
+         similarity[is_numeric] = numeric_similar
+
+     similarity[~is_numeric] = arr1[~is_numeric] == arr2[~is_numeric]
+
+     return np.count_nonzero(similarity)
+
+
52
+ def compare_pair(pair, data, similarity, threshold):
+     """Compare one (i, j) row pair; returns (i, j, 1) if at least `similarity` features match."""
+     i, j = pair
+     sim = compare_similarity_np(data[i], data[j], threshold=threshold)
+     return (i, j, 1 if sim >= similarity else 0)
+
+
58
+ def cal_adjacency_matrix(
+     df: pd.DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
+ ) -> Tuple[dict, np.ndarray]:
+     """
+     Calculates the adjacency matrix for a given DataFrame using parallel processing.
+
+     Parameters
+     ----------
+     df : `pd.DataFrame`
+         The input DataFrame containing the features.
+     exclude_subset : `List[str]`, `optional`
+         A list of features to exclude from the calculation of the adjacency matrix.
+     sparse : `bool`, `optional`
+         Whether to return a sparse matrix or a dense matrix.
+     **kwargs : `dict`
+         Additional keyword arguments passed to the `compare_similarity_np` function.
+
+     Returns
+     -------
+     adj_dict : `dict`
+         A dictionary mapping each row index to its feature values.
+     adjacency_matrix : `np.ndarray`
+         The adjacency matrix (a `tf.sparse.SparseTensor` when `sparse=True`).
+
+     Keyword Arguments
+     -----------------
+     similarity : `int`
+         The minimum number of features that must be the same in both arrays to be
+         considered similar. Default is the number of columns minus one.
+     threshold : `float`
+         The threshold value used in the `compare_similarity_np` function. Default is 0.05.
+     """
+     if len(exclude_subset) > 0:
+         columns = [col for col in df.columns if col not in exclude_subset]
+         df_ = df[columns].copy()
+     else:
+         df_ = df.copy()
+
+     assert len(df_) > 0, "DataFrame must contain at least one row."
+
+     similarity = kwargs.get("similarity", len(df_.columns) - 1)
+     threshold = kwargs.get("threshold", 0.05)
+     assert similarity <= df_.shape[1], "similarity cannot exceed the number of columns."
+
+     data = df_.to_numpy()
+     n = len(data)
+
+     adj_dict = {i: data[i].tolist() for i in range(n)}
+
+     def pair_generator():
+         # Upper-triangular pairs (including the diagonal); symmetry is restored below.
+         for i in range(n):
+             for j in range(i, n):
+                 yield (i, j)
+
+     with Pool(cpu_count()) as pool:
+         results = pool.starmap(
+             compare_pair, ((pair, data, similarity, threshold) for pair in pair_generator())
+         )
+
+     adjacency_matrix = np.zeros((n, n), dtype=np.uint8)
+     for i, j, val in results:
+         if val:
+             adjacency_matrix[i, j] = 1
+             adjacency_matrix[j, i] = 1
+
+     if sparse:
+         num_nodes = adjacency_matrix.shape[0]
+
+         indices = np.argwhere(adjacency_matrix != 0)
+         indices = tf.constant(indices, dtype=tf.int64)
+         values = tf.constant(adjacency_matrix[indices[:, 0], indices[:, 1]], dtype=tf.float32)
+         adjacency_matrix = tf.sparse.SparseTensor(
+             indices=indices, values=values, dense_shape=(num_nodes, num_nodes)
+         )
+
+     return adj_dict, adjacency_matrix
+
+
135
+ class Data:
+     """Bundles node features `x`, labels `y`, and the adjacency matrix for training."""
+
+     def __init__(
+         self,
+         df: pd.DataFrame,
+         target: str | None = None,
+         exclude_subset: List[str] = [],
+         **kwargs,
+     ):
+         sparse = kwargs.get("sparse", True)
+         threshold = kwargs.get("threshold", 0.05)
+         _, adjacency = cal_adjacency_matrix(
+             df, exclude_subset=exclude_subset, sparse=sparse, threshold=threshold
+         )
+         if target is not None:
+             X = df.drop(columns=[target] + exclude_subset)
+         else:
+             X = df.drop(columns=exclude_subset)
+         self.columns = X.columns
+         X = X.to_numpy()
+         self.x = np.asarray(X).astype(np.float32)
+         self.adjacency = adjacency
+         if target is not None:
+             self.y = np.asarray(df[target].values).astype(np.int32)
+
+
160
+ @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNNLayer")
+ class VanillaGNNLayer(tf.keras.layers.Layer):
+     def __init__(self, dim_in, dim_out, rank=None, kernel_initializer="glorot_uniform", **kwargs):
+         super(VanillaGNNLayer, self).__init__(**kwargs)
+         self.dim_in = dim_in
+         self.dim_out = dim_out
+         self.rank = rank
+         self.kernel_initializer = kernel_initializer
+         self.linear = None
+
+     def build(self, input_shape):
+         # A LoRA-factorized projection when `rank` is set, a plain bias-free
+         # Dense layer otherwise.
+         if self.rank:
+             self.linear = LoRALayer(self.dim_out, rank=self.rank)
+         else:
+             self.linear = tf.keras.layers.Dense(
+                 self.dim_out, use_bias=False, kernel_initializer=self.kernel_initializer
+             )
+         super(VanillaGNNLayer, self).build(input_shape)
+
+     def call(self, x, adjacency):
+         # Linear projection followed by sparse neighborhood aggregation: A @ (X W).
+         x = self.linear(x)
+         x = tf.sparse.sparse_dense_matmul(adjacency, x)
+         return x
+
+     def get_config(self):
+         # `dim_in` must be serialized as well, since `__init__` requires it
+         # when the layer is re-created from its config.
+         config = super(VanillaGNNLayer, self).get_config()
+         config.update(
+             {
+                 "dim_in": self.dim_in,
+                 "dim_out": self.dim_out,
+                 "rank": self.rank,
+                 "kernel_initializer": (
+                     None
+                     if self.rank
+                     else tf.keras.initializers.serialize(
+                         tf.keras.initializers.get(self.kernel_initializer)
+                     )
+                 ),
+             }
+         )
+         return config
+
+
199
+ @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNN")
+ class VanillaGNN(tf.keras.Model):
+     def __init__(self, dim_in, dim_h, dim_out, rank=2, **kwargs):
+         super(VanillaGNN, self).__init__(**kwargs)
+         self.dim_in = dim_in
+         self.dim_h = dim_h
+         self.dim_out = dim_out
+         self.rank = rank
+
+         self.gnn1 = VanillaGNNLayer(self.dim_in, self.dim_h, self.rank)
+         self.gnn2 = VanillaGNNLayer(self.dim_h, self.dim_h, self.rank)
+         self.gnn3 = VanillaGNNLayer(self.dim_h, self.dim_out, None)
+
+     def call(self, x, adjacency):
+         h = self.gnn1(x, adjacency)
+         h = tf.nn.tanh(h)
+         h = self.gnn2(h, adjacency)
+         h = self.gnn3(h, adjacency)
+         # The model outputs class probabilities; losses below must therefore
+         # not treat its output as logits.
+         return tf.nn.softmax(h, axis=1)
+
+     def f1_macro(self, y_true, y_pred):
+         return f1_score(y_true, y_pred, average="macro")
+
+     def compute_f1_score(self, probs, labels):
+         predictions = tf.argmax(probs, axis=1, output_type=tf.int32)
+         true_labels = tf.cast(labels, tf.int32)
+         return self.f1_macro(true_labels.numpy(), predictions.numpy())
+
+     def evaluate(self, x, adjacency, y):
+         y = tf.cast(y, tf.int32)
+         out = self(x, adjacency)
+         # `out` already holds probabilities, so use the probability-based
+         # cross-entropy rather than a logits-based loss.
+         loss = tf.keras.losses.sparse_categorical_crossentropy(y, out)
+         loss = tf.reduce_mean(loss)
+         f1 = round(self.compute_f1_score(out, y), 4)
+         return loss.numpy(), f1
+
+     def test(self, data):
+         out = self(data.x, data.adjacency)
+         test_f1 = self.compute_f1_score(out, data.y)
+         return round(test_f1, 4)
+
+     def predict(self, data):
+         out = self(data.x, data.adjacency)
+         return tf.argmax(out, axis=1, output_type=tf.int32).numpy()
+
+     def get_config(self):
+         config = {
+             "dim_in": self.dim_in,
+             "dim_h": self.dim_h,
+             "dim_out": self.dim_out,
+             "rank": self.rank,
+         }
+         base_config = super(VanillaGNN, self).get_config()
+         return dict(list(base_config.items()) + list(config.items()))
+
+     @classmethod
+     def from_config(cls, config):
+         return cls(
+             dim_in=config["dim_in"],
+             dim_h=config["dim_h"],
+             dim_out=config["dim_out"],
+             rank=config["rank"],
+         )
+
+     @tf.function
+     def train_step(self, batch_x, batch_adjacency, batch_y, optimizer):
+         with tf.GradientTape() as tape:
+             out = self(batch_x, batch_adjacency)
+             loss = tf.keras.losses.sparse_categorical_crossentropy(batch_y, out)
+             loss = tf.reduce_mean(loss)
+         gradients = tape.gradient(loss, self.trainable_variables)
+         optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+         return loss
+
+     def fit(self, data, epochs, batch_size, test_size=0.2, optimizer="adam"):
+         optimizers = {
+             "sgd": tf.keras.optimizers.SGD(),
+             "adam": tf.keras.optimizers.Adam(),
+             "adamw": tf.keras.optimizers.AdamW(),
+             "adadelta": tf.keras.optimizers.Adadelta(),
+             "rmsprop": tf.keras.optimizers.RMSprop(),
+         }
+         optimizer = optimizers[optimizer]
+         train_losses = []
+         train_f1_scores = []
+         val_losses = []
+         val_f1_scores = []
+
+         # Contiguous train/validation split over the node ordering.
+         num_nodes = len(data.x)
+         split_index = int((1 - test_size) * num_nodes)
+
+         X_train, X_test = data.x[:split_index], data.x[split_index:]
+         y_train, y_test = data.y[:split_index], data.y[split_index:]
+
+         adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [split_index, split_index])
+         adjacency_test = tf.sparse.slice(
+             data.adjacency,
+             [split_index, split_index],
+             [num_nodes - split_index, num_nodes - split_index],
+         )
+
+         batch_starts = np.arange(0, len(X_train), batch_size)
+         for epoch in range(epochs):
+             np.random.shuffle(batch_starts)
+             for start in batch_starts:
+                 end = start + batch_size
+                 batch_x = X_train[start:end, :]
+                 # tf.sparse.slice clips the window at the matrix boundary, so
+                 # a smaller final batch is handled correctly.
+                 batch_adjacency = tf.sparse.slice(
+                     adjacency_train, [start, start], [batch_size, batch_size]
+                 )
+                 batch_y = y_train[start:end]
+                 self.train_step(batch_x, batch_adjacency, batch_y, optimizer)
+
+             train_loss, train_f1 = self.evaluate(X_train, adjacency_train, y_train)
+             train_losses.append(train_loss)
+             train_f1_scores.append(train_f1)
+
+             if epoch % 5 == 0:
+                 clear_output(wait=True)
+                 val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
+                 val_losses.append(val_loss)
+                 val_f1_scores.append(val_f1)
+                 print(
+                     f"Epoch {epoch:>3} | Train Loss: {train_loss:.4f} | Train F1: {train_f1:.4f} | Val Loss: {val_loss:.4f} | Val F1: {val_f1:.4f}"
+                 )
+
+         return train_losses, train_f1_scores, val_losses, val_f1_scores
+
+
+ if __name__ == "__main__":
+     print("Examples will be running below")
likelihood/main.py ADDED
@@ -0,0 +1,273 @@
+ from typing import Callable, List, Tuple
+
+ import corner
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+
+ def lnprior(theta: np.ndarray, conditions: List[Tuple[float, float]]) -> float:
+     """Computes the prior probability.
+
+     Parameters
+     ----------
+     theta : `np.ndarray`
+         An array containing the parameters of the model.
+     conditions : `list`
+         A list containing $2n$ values, i.e. a (min, max) pair for each of
+         the $n$ parameters, given either as a flat list or as a list of
+         pairs.
+
+     Returns
+     -------
+     lp : `float`
+         The a priori probability: `0.0` inside the bounds and `np.inf`
+         outside (the likelihood in this module is minimized, so `np.inf`
+         acts as an infinite penalty).
+     """
+     cond = np.asarray(conditions)
+     if cond.size != 2 * len(theta):
+         raise ValueError("Length of conditions must be twice the length of theta.")
+
+     cond = cond.reshape((len(theta), 2))
+     within_bounds = np.logical_and(cond[:, 0] < theta, theta < cond[:, 1])
+     if not np.all(within_bounds):
+         return np.inf
+
+     return 0.0
+
+
35
+ def fun_like(
+     x: np.ndarray,
+     y: np.ndarray,
+     model: Callable,
+     theta: np.ndarray,
+     conditions: List[Tuple[float, float]] | None = None,
+     var2: float = 1.0,
+ ) -> float:
+     """Computes the likelihood.
+
+     Parameters
+     ----------
+     x : `np.ndarray`
+         An $(m, n)$ dimensional array for (cols, rows).
+     y : `np.ndarray`
+         An $n$ dimensional array that will be compared with the model's output.
+     model : `Callable`
+         A Python function defined by the user. This function should receive
+         two arguments $(x, theta)$.
+     theta : `np.ndarray`
+         The array containing the model's parameters.
+     conditions : `list`, optional
+         A list containing $2n$-conditions for the (min, max) range of the
+         $n$ parameters. Defaults to None.
+     var2 : `float`, optional
+         The variance of the Gaussian noise model used in the chi-squared
+         term. By default it is set to `1.0`.
+
+     Returns
+     -------
+     lhood : `float`
+         The computed (negative log-)likelihood; smaller values are better.
+     """
+     lp = 0.0 if conditions is None else lnprior(theta, conditions)
+     inv_sigma2 = 1.0 / var2
+     y_hat = model(x, theta)
+
+     # Promote 1-D model output to a column vector so it broadcasts against `y`.
+     try:
+         y_hat.shape[1]
+     except IndexError:
+         y_hat = y_hat[np.newaxis, ...].T
+
+     y_sum = np.sum((y - y_hat) ** 2 * inv_sigma2 - np.log(inv_sigma2))
+     lhood = 0.5 * y_sum
+
+     if not np.isfinite(lhood):
+         return np.inf
+
+     return lhood + lp
+
+
85
+ def update_theta(theta: np.ndarray, d: float) -> np.ndarray:
+     """Updates the theta parameters.
+
+     Parameters
+     ----------
+     theta : `np.ndarray`
+         The ndarray containing the model's parameters.
+     d : `float`
+         Size of the Gaussian step for the walker.
+
+     Returns
+     -------
+     theta_new : `np.ndarray`
+         An ndarray with the updated theta values, drawn from a Gaussian
+         centered on `theta` with standard deviation `d / 2`.
+     """
+     return np.random.normal(theta, d / 2.0)
+
+
103
+ def walk(
+     x: np.ndarray,
+     y: np.ndarray,
+     model: Callable,
+     theta: np.ndarray,
+     conditions: List[Tuple[float, float]] | None = None,
+     var2: float = 0.01,
+     mov: int = 100,
+     d: float = 1.0,
+     tol: float = 1e-4,
+     mode: bool = True,
+ ):
+     """Executes the walker implementation.
+
+     Parameters
+     ----------
+     x : `np.ndarray`
+         An $(m, n)$ dimensional array for (cols, rows).
+     y : `np.ndarray`
+         An $n$ dimensional array that will be compared with the model's output.
+     model : `Callable`
+         A Python function defined by the user. This function should receive
+         two arguments $(x, theta)$.
+     theta : `np.ndarray`
+         The array containing the model's parameters.
+     conditions : `list`, optional
+         A list containing $2n$-conditions for the (min, max) range of the
+         $n$ parameters. Defaults to None.
+     var2 : `float`, optional
+         The variance of the Gaussian noise model used in the likelihood. By
+         default it is set to `0.01`.
+     mov : `int`, optional
+         Number of movements that the walker will perform. By default it is
+         set to `100`.
+     d : `float`, optional
+         Size of the Gaussian step for the walker.
+     tol : `float`, optional
+         Convergence criterion for the log-likelihood. By default it is set
+         to `1e-4`.
+     mode : `bool`, optional
+         If `True`, prints progress messages. Defaults to `True`.
+
+     Returns
+     -------
+     theta : `np.ndarray`
+         An ndarray with the updated theta values.
+     nwalk : `list`
+         Updates of theta for each movement performed by the walker.
+     y0 : `float`
+         The log-likelihood value.
+     """
+     nwalk = []
+
+     for _ in range(mov):
+         nwalk.append(theta)
+         theta_new = update_theta(theta, d)
+
+         y0 = fun_like(x, y, model, theta, conditions, var2)
+         y1 = fun_like(x, y, model, theta_new, conditions, var2)
+         if y0 <= tol or y1 <= tol:
+             if mode:
+                 print("Goal reached!")
+             return (theta_new, nwalk, y1) if y1 <= tol else (theta, nwalk, y0)
+
+         if y1 >= y0:
+             # The proposal is worse; accept it with probability exp(-y0 / y1).
+             ratio = y0 / y1
+             prob = np.exp(-ratio)
+
+             if prob > np.random.rand():
+                 theta = theta_new
+         else:
+             # The proposal improves the likelihood; accept it unconditionally.
+             theta = theta_new
+
+     if mode:
+         print("Maximum number of iterations reached!")
+         print(f"The log-likelihood is: {y0}")
+
+     return theta, nwalk, y0
+
+
183
+ def walkers(
+     nwalkers: int,
+     x: np.ndarray,
+     y: np.ndarray,
+     model: Callable,
+     theta: np.ndarray,
+     conditions: List[Tuple[float, float]] | None = None,
+     var2: float = 0.01,
+     mov: int = 100,
+     d: float = 1.0,
+     tol: float = 1e-4,
+     mode: bool = False,
+     figname: str = "fig_out.png",
+ ):
+     """Executes multiple walkers.
+
+     Parameters
+     ----------
+     nwalkers : `int`
+         The number of walkers to be executed.
+     x : `np.ndarray`
+         An $(m, n)$ dimensional array for (cols, rows).
+     y : `np.ndarray`
+         An $n$ dimensional array that will be compared with the model's output.
+     model : `Callable`
+         A Python function defined by the user. This function should receive
+         two arguments $(x, theta)$.
+     theta : `np.ndarray`
+         The array containing the model's parameters.
+     conditions : `list`, optional
+         A list containing $2n$-conditions for the (min, max) range of the
+         $n$ parameters. Defaults to None.
+     var2 : `float`, optional
+         The variance of the Gaussian noise model used in the likelihood. By
+         default it is set to `0.01`.
+     mov : `int`, optional
+         Number of movements that each walker will perform. By default it is
+         set to `100`.
+     d : `float`, optional
+         Size of the Gaussian step for the walker.
+     tol : `float`, optional
+         Convergence criterion for the log-likelihood. By default it is set
+         to `1e-4`.
+     mode : `bool`, optional
+         If `True`, each walker prints progress messages. By default it is
+         set to `False`.
+     figname : `str`, optional
+         The name of the output file for the figure. By default it is set
+         to `fig_out.png`; a falsy value skips plotting.
+
+     Returns
+     -------
+     par : `list`
+         The theta found by each of the walkers.
+     error : `list`
+         The log-likelihood for each walker.
+     """
+     error = []
+     par = []
+
+     for _ in range(nwalkers):
+         # Each walker continues from the theta returned by the previous one.
+         theta, nwalk, y0 = walk(x, y, model, theta, conditions, var2, mov, d, tol, mode)
+         par.append(theta)
+         nwalk = np.array(nwalk).reshape((len(nwalk), len(theta)))
+         error.append(y0)
+
+         if figname:
+             for k in range(nwalk.shape[1]):
+                 sub = f"$\\theta _{k}$"
+                 plt.plot(range(len(nwalk[:, k])), nwalk[:, k], "-", label=sub)
+             plt.ylabel("$\\theta$")
+             plt.xlabel("iterations")
+             plt.savefig(f"walkers_{figname}", dpi=300, transparent=True)
+
+     if figname:
+         plt.show()
+
+     if len(theta) == 2 and figname:
+         corner.hist2d(
+             nwalk[:, 0],
+             nwalk[:, 1],
+             range=None,
+             bins=18,
+             smooth=True,
+             plot_datapoints=True,
+             plot_density=True,
+         )
+         plt.ylabel("$\\theta_{1}$")
+         plt.xlabel("$\\theta_{0}$")
+         plt.savefig(f"theta_{figname}", dpi=300, transparent=True)
+
+     return par, error
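A short sketch of driving the walkers with a toy linear model (the model, bounds, noise level, and tuning values are illustrative assumptions):

    import numpy as np
    from likelihood.main import walkers

    def model(x, theta):
        # Toy model: y = theta_0 * x + theta_1
        return theta[0] * x + theta[1]

    x = np.linspace(0.0, 1.0, 50)
    # Column vector, matching the column shape fun_like promotes y_hat to.
    y = (2.0 * x + 0.5).reshape(-1, 1) + np.random.normal(0.0, 0.05, size=(50, 1))

    theta0 = np.array([1.0, 0.0])
    bounds = [0.0, 5.0, -1.0, 2.0]  # flat (min, max) pairs for the two parameters
    par, error = walkers(
        5, x, y, model, theta0, conditions=bounds, var2=0.0025, mov=300, d=0.2, figname=None
    )

Here `var2` is set to the variance of the simulated noise (0.05 squared); passing `figname=None` skips the trace and corner plots.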
@@ -0,0 +1,3 @@
+ from .environments import *
+ from .regression import *
+ from .simulation import *
@@ -0,0 +1,13 @@
+ import tensorflow as tf
+ from packaging import version
+
+ if version.parse(tf.__version__) > version.parse("2.15.0"):
+     from ._autoencoders import *
+     from ._predictor import GetInsights
+ else:
+     from .autoencoders import *
+     from .predictor import GetInsights
+
+ from .bandit import *
+ from .gan import *
+ from .rl import *