likelihood 1.5.8__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {likelihood-1.5.8 → likelihood-2.0.0}/PKG-INFO +4 -3
  2. likelihood-2.0.0/likelihood/graph/__init__.py +9 -0
  3. likelihood-2.0.0/likelihood/graph/_nn.py +421 -0
  4. likelihood-2.0.0/likelihood/models/deep/__init__.py +12 -0
  5. likelihood-2.0.0/likelihood/models/deep/_autoencoders.py +895 -0
  6. likelihood-2.0.0/likelihood/models/deep/_predictor.py +810 -0
  7. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/models/deep/autoencoders.py +2 -2
  8. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/models/deep/gan.py +4 -4
  9. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/models/deep/predictor.py +1 -0
  10. likelihood-2.0.0/likelihood/models/deep/rl.py +350 -0
  11. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/models/simulation.py +9 -4
  12. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/tools/tools.py +7 -2
  13. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood.egg-info/PKG-INFO +4 -3
  14. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood.egg-info/SOURCES.txt +4 -0
  15. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood.egg-info/requires.txt +3 -2
  16. {likelihood-1.5.8 → likelihood-2.0.0}/setup.py +1 -1
  17. likelihood-1.5.8/likelihood/graph/__init__.py +0 -1
  18. likelihood-1.5.8/likelihood/models/deep/__init__.py +0 -3
  19. {likelihood-1.5.8 → likelihood-2.0.0}/LICENSE +0 -0
  20. {likelihood-1.5.8 → likelihood-2.0.0}/README.md +0 -0
  21. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/__init__.py +0 -0
  22. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/graph/graph.py +0 -0
  23. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/graph/nn.py +0 -0
  24. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/main.py +0 -0
  25. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/models/__init__.py +0 -0
  26. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/models/hmm.py +0 -0
  27. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/models/regression.py +0 -0
  28. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/models/utils.py +0 -0
  29. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/tools/__init__.py +0 -0
  30. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/tools/cat_embed.py +0 -0
  31. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/tools/figures.py +0 -0
  32. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/tools/impute.py +0 -0
  33. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/tools/models_tools.py +0 -0
  34. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood/tools/numeric_tools.py +0 -0
  35. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood.egg-info/dependency_links.txt +0 -0
  36. {likelihood-1.5.8 → likelihood-2.0.0}/likelihood.egg-info/top_level.txt +0 -0
  37. {likelihood-1.5.8 → likelihood-2.0.0}/setup.cfg +0 -0

{likelihood-1.5.8 → likelihood-2.0.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: likelihood
- Version: 1.5.8
+ Version: 2.0.0
  Summary: A package that performs the maximum likelihood algorithm.
  Home-page: https://github.com/jzsmoreno/likelihood/
  Author: J. A. Moreno-Guerra
@@ -20,9 +20,10 @@ Requires-Dist: pydocstyle>=6.3.0
  Requires-Dist: flake8>=6.0.0
  Requires-Dist: isort>=5.12.0
  Requires-Dist: mypy>=1.4.1
- Requires-Dist: numpy<2.0.0
+ Requires-Dist: numpy<3.0.0,>=1.26.4
  Requires-Dist: pydot==2.0.0
  Requires-Dist: matplotlib
+ Requires-Dist: packaging
  Requires-Dist: graphviz
  Requires-Dist: seaborn
  Requires-Dist: pyyaml
@@ -32,7 +33,7 @@ Requires-Dist: tqdm
  Provides-Extra: full
  Requires-Dist: networkx; extra == "full"
  Requires-Dist: pyvis; extra == "full"
- Requires-Dist: tensorflow==2.15.0; extra == "full"
+ Requires-Dist: tensorflow>=2.15.0; extra == "full"
  Requires-Dist: keras-tuner; extra == "full"
  Requires-Dist: scikit-learn; extra == "full"
  Dynamic: author
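
The metadata changes above relax the NumPy pin to >=1.26.4,<3.0.0, loosen the optional TensorFlow requirement from ==2.15.0 to >=2.15.0, and add packaging as a runtime dependency. A minimal sketch for checking an existing environment against the new ranges (illustrative only, not part of the package; it assumes both distributions are installed):

from importlib.metadata import version as dist_version
from packaging.specifiers import SpecifierSet

# Requirement ranges copied from the 2.0.0 metadata above; the tensorflow
# range only applies when the "full" extra is installed.
checks = {
    "numpy": SpecifierSet(">=1.26.4,<3.0.0"),
    "tensorflow": SpecifierSet(">=2.15.0"),
}
for name, spec in checks.items():
    installed = dist_version(name)
    status = "ok" if installed in spec else f"outside {spec}"
    print(f"{name} {installed}: {status}")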

likelihood-2.0.0/likelihood/graph/__init__.py
@@ -0,0 +1,9 @@
+ import tensorflow as tf
+ from packaging import version
+
+ from .graph import *
+
+ if version.parse(tf.__version__) > version.parse("2.15.0"):
+     from ._nn import *
+ else:
+     from .nn import *
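
The new likelihood/graph/__init__.py gates its imports on the installed TensorFlow release: versions strictly newer than 2.15.0 load the reworked _nn module, while 2.15.0 and older keep the legacy nn module. A small sketch of the same comparison, useful for confirming which branch a given environment takes (the print is illustrative, not part of the package):

import tensorflow as tf
from packaging import version

# Same gate used by the new __init__ modules: strictly newer than 2.15.0
# selects the underscore-prefixed implementations (_nn, _autoencoders, _predictor).
uses_new_modules = version.parse(tf.__version__) > version.parse("2.15.0")
print(f"TensorFlow {tf.__version__} -> new modules: {uses_new_modules}")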

likelihood-2.0.0/likelihood/graph/_nn.py
@@ -0,0 +1,421 @@
+ import logging
+ import os
+
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+ logging.getLogger("tensorflow").setLevel(logging.ERROR)
+
+ import warnings
+ from multiprocessing import Pool, cpu_count
+ from typing import Any, List, Tuple
+
+ import numpy as np
+ import pandas as pd
+ import tensorflow as tf
+ from IPython.display import clear_output
+ from pandas.core.frame import DataFrame
+ from sklearn.metrics import f1_score
+
+ tf.get_logger().setLevel("ERROR")
+
+ from likelihood.tools import LoRALayer
+
+
+ def compare_similarity_np(arr1: np.ndarray, arr2: np.ndarray, threshold: float = 0.05) -> int:
+     """Vectorized similarity comparison between two numeric/categorical arrays."""
+     arr1 = np.asarray(arr1)
+     arr2 = np.asarray(arr2)
+
+     is_numeric = np.vectorize(
+         lambda a, b: isinstance(a, (int, float)) and isinstance(b, (int, float))
+     )(arr1, arr2)
+
+     similarity = np.zeros_like(arr1, dtype=bool)
+
+     if np.any(is_numeric):
+         a_num = arr1[is_numeric].astype(float)
+         b_num = arr2[is_numeric].astype(float)
+
+         both_zero = (a_num == 0) & (b_num == 0)
+         nonzero = ~both_zero & (a_num != 0) & (b_num != 0)
+         ratio = np.zeros_like(a_num)
+         ratio[nonzero] = np.maximum(a_num[nonzero], b_num[nonzero]) / np.minimum(
+             a_num[nonzero], b_num[nonzero]
+         )
+         numeric_similar = both_zero | ((1 - threshold <= ratio) & (ratio <= 1 + threshold))
+
+         similarity[is_numeric] = numeric_similar
+
+     similarity[~is_numeric] = arr1[~is_numeric] == arr2[~is_numeric]
+
+     return np.count_nonzero(similarity)
+
+
+ def compare_pair(pair, data, similarity, threshold):
+     i, j = pair
+     sim = compare_similarity_np(data[i], data[j], threshold=threshold)
+     return (i, j, 1 if sim >= similarity else 0)
+
+
+ def cal_adjacency_matrix(
+     df: pd.DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
+ ) -> Tuple[dict, np.ndarray]:
+     """
+     Calculates the adjacency matrix for a given DataFrame using parallel processing.
+
+     Parameters
+     ----------
+     df : `DataFrame`
+         The input DataFrame containing the features.
+     exclude_subset : `List[str]`, `optional`
+         A list of features to exclude from the calculation of the adjacency matrix.
+     sparse : `bool`, `optional`
+         Whether to return a sparse matrix or a dense matrix.
+     **kwargs : `dict`
+         Additional keyword arguments to pass to the `compare_similarity` function.
+
+     Returns
+     -------
+     adj_dict : `dict`
+         A dictionary containing the features.
+     adjacency_matrix : `ndarray`
+         The adjacency matrix.
+
+     Keyword Arguments:
+     ----------
+     similarity: `int`
+         The minimum number of features that must be the same in both arrays to be considered similar.
+     threshold : `float`
+         The threshold value used in the `compare_similarity` function. Default is 0.0
+     """
+     if len(exclude_subset) > 0:
+         columns = [col for col in df.columns if col not in exclude_subset]
+         df_ = df[columns].copy()
+     else:
+         df_ = df.copy()
+
+     assert len(df_) > 0
+
+     similarity = kwargs.get("similarity", len(df_.columns) - 1)
+     threshold = kwargs.get("threshold", 0.05)
+     assert similarity <= df_.shape[1]
+
+     data = df_.to_numpy()
+     n = len(data)
+
+     adj_dict = {i: data[i].tolist() for i in range(n)}
+
+     def pair_generator():
+         for i in range(n):
+             for j in range(i, n):
+                 yield (i, j)
+
+     with Pool(cpu_count()) as pool:
+         results = pool.starmap(
+             compare_pair, ((pair, data, similarity, threshold) for pair in pair_generator())
+         )
+
+     adjacency_matrix = np.zeros((n, n), dtype=np.uint8)
+     for i, j, val in results:
+         if val:
+             adjacency_matrix[i, j] = 1
+             adjacency_matrix[j, i] = 1
+
+     if sparse:
+         num_nodes = adjacency_matrix.shape[0]
+
+         indices = np.argwhere(adjacency_matrix != 0.0)
+         indices = tf.constant(indices, dtype=tf.int64)
+         values = tf.constant(adjacency_matrix[indices[:, 0], indices[:, 1]], dtype=tf.float32)
+         adjacency_matrix = tf.sparse.SparseTensor(
+             indices=indices, values=values, dense_shape=(num_nodes, num_nodes)
+         )
+
+     return adj_dict, adjacency_matrix
+
+
+ class Data:
+     def __init__(
+         self,
+         df: DataFrame,
+         target: str | None = None,
+         exclude_subset: List[str] = [],
+         **kwargs,
+     ):
+         sparse = kwargs.get("sparse", True)
+         threshold = kwargs.get("threshold", 0.05)
+         _, adjacency = cal_adjacency_matrix(
+             df, exclude_subset=exclude_subset, sparse=sparse, threshold=threshold
+         )
+         if target is not None:
+             X = df.drop(columns=[target] + exclude_subset)
+         else:
+             X = df.drop(columns=exclude_subset)
+         self.columns = X.columns
+         X = X.to_numpy()
+         self.x = np.asarray(X).astype(np.float32)
+         self.adjacency = adjacency
+         if target is not None:
+             self.y = np.asarray(df[target].values).astype(np.int32)
+
+
+ @tf.keras.utils.register_keras_serializable(package="Custom", name="VanillaGNNLayer")
+ class VanillaGNNLayer(tf.keras.layers.Layer):
+     def __init__(self, dim_in, dim_out, rank=None, kernel_initializer="glorot_uniform", **kwargs):
+         super(VanillaGNNLayer, self).__init__(**kwargs)
+         self.dim_in = dim_in
+         self.dim_out = dim_out
+         self.rank = rank
+         self.kernel_initializer = kernel_initializer
+         self.linear = None
+
+     def build(self, input_shape):
+         if self.rank:
+             self.linear = LoRALayer(self.dim_out, rank=self.rank)
+         else:
+             self.linear = tf.keras.layers.Dense(
+                 self.dim_out, use_bias=False, kernel_initializer=self.kernel_initializer
+             )
+         super(VanillaGNNLayer, self).build(input_shape)
+
+     def call(self, x, adjacency):
+         x = self.linear(x)
+         x = tf.sparse.sparse_dense_matmul(adjacency, x)
+         return x
+
+     def get_config(self):
+         config = super(VanillaGNNLayer, self).get_config()
+         config.update(
+             {
+                 "dim_in": self.dim_in,
+                 "dim_out": self.dim_out,
+                 "rank": self.rank,
+                 "kernel_initializer": (
+                     None
+                     if self.rank
+                     else tf.keras.initializers.serialize(self.linear.kernel_initializer)
+                 ),
+             }
+         )
+         return config
+
+     @classmethod
+     def from_config(cls, config):
+         if config.get("kernel_initializer") is not None:
+             config["kernel_initializer"] = tf.keras.initializers.deserialize(
+                 config["kernel_initializer"]
+             )
+         return cls(**config)
+
+
+ class VanillaGNN:
+     def __init__(self, dim_in, dim_h, dim_out, rank=2, **kwargs):
+         self.dim_in = dim_in
+         self.dim_h = dim_h
+         self.dim_out = dim_out
+         self.rank = rank
+
+         self.gnn1 = VanillaGNNLayer(self.dim_in, self.dim_h, self.rank)
+         self.gnn2 = VanillaGNNLayer(self.dim_h, self.dim_h, self.rank)
+         self.gnn3 = VanillaGNNLayer(self.dim_h, self.dim_out, None)
+
+         self.build()
+
+     def build(self):
+         x_in = tf.keras.Input(shape=(self.dim_in,), name="node_features")
+         adjacency_in = tf.keras.Input(shape=(None,), sparse=True, name="adjacency")
+
+         gnn1 = VanillaGNNLayer(self.dim_in, self.dim_h, self.rank)
+         gnn2 = VanillaGNNLayer(self.dim_h, self.dim_h, self.rank)
+         gnn3 = VanillaGNNLayer(self.dim_h, self.dim_out, rank=None)
+
+         h = gnn1(x_in, adjacency_in)
+         h = tf.keras.activations.tanh(h)
+         h = gnn2(h, adjacency_in)
+         h = gnn3(h, adjacency_in)
+         out = tf.keras.activations.softmax(h, axis=-1)
+
+         self.model = tf.keras.Model(
+             inputs=[x_in, adjacency_in], outputs=out, name="VanillaGNN_Functional"
+         )
+
+     @tf.function
+     def __call__(self, x, adjacency):
+         return self.model([x, adjacency])
+
+     def f1_macro(self, y_true, y_pred):
+         return f1_score(y_true, y_pred, average="macro")
+
+     def compute_f1_score(self, logits, labels):
+         predictions = tf.argmax(logits, axis=1, output_type=tf.int32)
+         true_labels = tf.cast(labels, tf.int32)
+         return self.f1_macro(true_labels.numpy(), predictions.numpy())
+
+     def evaluate(self, x, adjacency, y):
+         y = tf.cast(y, tf.int32)
+         out = self(x, adjacency)
+         loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=out)
+         loss = tf.reduce_mean(loss)
+         f1 = round(self.compute_f1_score(out, y), 4)
+         return loss.numpy(), f1
+
+     def test(self, data):
+         data.x = tf.convert_to_tensor(data.x) if not tf.is_tensor(data.x) else data.x
+         out = self(data.x, data.adjacency)
+         test_f1 = self.compute_f1_score(out, data.y)
+         return round(test_f1, 4)
+
+     def predict(self, data):
+         data.x = tf.convert_to_tensor(data.x) if not tf.is_tensor(data.x) else data.x
+         out = self(data.x, data.adjacency)
+         return tf.argmax(out, axis=1, output_type=tf.int32).numpy()
+
+     def save(self, filepath, **kwargs):
+         """
+         Save the complete model including all components.
+
+         Parameters
+         ----------
+         filepath : str
+             Path where to save the model.
+         """
+         import os
+
+         # Create directory if it doesn't exist
+         os.makedirs(filepath, exist_ok=True)
+
+         self.model.save(os.path.join(filepath, "main_model.keras"))
+
+         # Save configuration
+         import json
+
+         config = self.get_config()
+
+         with open(os.path.join(filepath, "config.json"), "w") as f:
+             json.dump(config, f, indent=2)
+
+     @classmethod
+     def load(cls, filepath):
+         """
+         Load a complete model from saved components.
+
+         Parameters
+         ----------
+         filepath : str
+             Path where the model was saved.
+
+         Returns
+         -------
+         VanillaGNN
+             The loaded model instance.
+         """
+         import json
+         import os
+
+         # Load configuration
+         with open(os.path.join(filepath, "config.json"), "r") as f:
+             config = json.load(f)
+
+         # Create new instance
+         instance = cls(**config)
+
+         instance.model = tf.keras.models.load_model(os.path.join(filepath, "main_model.keras"))
+
+         return instance
+
+     def get_config(self):
+         return {
+             "dim_in": self.dim_in,
+             "dim_h": self.dim_h,
+             "dim_out": self.dim_out,
+             "rank": self.rank,
+         }
+
+     @classmethod
+     def from_config(cls, config):
+         return cls(
+             dim_in=config["dim_in"],
+             dim_h=config["dim_h"],
+             dim_out=config["dim_out"],
+             rank=config["rank"],
+         )
+
+     def get_build_config(self):
+         config = {
+             "dim_in": self.dim_in,
+             "dim_h": self.dim_h,
+             "dim_out": self.dim_out,
+             "rank": self.rank,
+         }
+         return config
+
+     @classmethod
+     def build_from_config(cls, config):
+         return cls(**config)
+
+     @tf.function
+     def train_step(self, batch_x, batch_adjacency, batch_y, optimizer):
+         with tf.GradientTape() as tape:
+             out = self(batch_x, batch_adjacency)
+             loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=batch_y, logits=out)
+             loss = tf.reduce_mean(loss)
+         gradients = tape.gradient(loss, self.model.trainable_variables)
+         optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
+         return loss
+
+     def fit(self, data, epochs, batch_size, test_size=0.2, optimizer="adam"):
+         optimizers = {
+             "sgd": tf.keras.optimizers.SGD(),
+             "adam": tf.keras.optimizers.Adam(),
+             "adamw": tf.keras.optimizers.AdamW(),
+             "adadelta": tf.keras.optimizers.Adadelta(),
+             "rmsprop": tf.keras.optimizers.RMSprop(),
+         }
+         optimizer = optimizers[optimizer]
+         train_losses = []
+         train_f1_scores = []
+         val_losses = []
+         val_f1_scores = []
+
+         num_nodes = len(data.x)
+         split_index = int((1 - test_size) * num_nodes)
+
+         X_train, X_test = data.x[:split_index], data.x[split_index:]
+         y_train, y_test = data.y[:split_index], data.y[split_index:]
+
+         adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [split_index, split_index])
+         adjacency_test = tf.sparse.slice(
+             data.adjacency,
+             [split_index, split_index],
+             [num_nodes - split_index, num_nodes - split_index],
+         )
+
+         batch_starts = np.arange(0, len(X_train), batch_size)
+         for epoch in range(epochs):
+             np.random.shuffle(batch_starts)
+             for start in batch_starts:
+                 end = start + batch_size
+                 batch_x = X_train[start:end, :]
+                 batch_adjacency = tf.sparse.slice(
+                     adjacency_train, [start, start], [batch_size, batch_size]
+                 )
+                 batch_y = y_train[start:end]
+                 train_loss = self.train_step(batch_x, batch_adjacency, batch_y, optimizer)
+
+             train_loss, train_f1 = self.evaluate(X_train, adjacency_train, y_train)
+             train_losses.append(train_loss)
+             train_f1_scores.append(train_f1)
+
+             if epoch % 5 == 0:
+                 clear_output(wait=True)
+                 val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
+                 val_losses.append(val_loss)
+                 val_f1_scores.append(val_f1)
+                 print(
+                     f"Epoch {epoch:>3} | Train Loss: {train_loss:.4f} | Train F1: {train_f1:.4f} | Val Loss: {val_loss:.4f} | Val F1: {val_f1:.4f}"
+                 )
+
+         return train_losses, train_f1_scores, val_losses, val_f1_scores
+
+
+ if __name__ == "__main__":
+     print("Examples will be running below")

likelihood-2.0.0/likelihood/models/deep/__init__.py
@@ -0,0 +1,12 @@
+ import tensorflow as tf
+ from packaging import version
+
+ if version.parse(tf.__version__) > version.parse("2.15.0"):
+     from ._autoencoders import *
+     from ._predictor import GetInsights
+ else:
+     from .autoencoders import *
+     from .predictor import GetInsights
+
+ from .gan import *
+ from .rl import *
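
likelihood/models/deep/__init__.py applies the same TensorFlow gate to the autoencoder and predictor modules while always exposing gan and rl. A quick illustrative check of which predictor implementation was actually loaded (not part of the package):

from likelihood.models.deep import GetInsights

# Resolves to "likelihood.models.deep._predictor" on TF > 2.15.0,
# "likelihood.models.deep.predictor" otherwise.
print(GetInsights.__module__)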