likelihood-1.5.3-py3-none-any.whl → likelihood-1.5.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
likelihood/graph/nn.py CHANGED
@@ -5,6 +5,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 logging.getLogger("tensorflow").setLevel(logging.ERROR)
 
 import warnings
+from multiprocessing import Pool, cpu_count
 from typing import Any, List, Tuple
 
 import numpy as np
@@ -13,65 +14,79 @@ import tensorflow as tf
 from IPython.display import clear_output
 from pandas.core.frame import DataFrame
 from sklearn.metrics import f1_score
-from sklearn.model_selection import train_test_split
 
 tf.get_logger().setLevel("ERROR")
 
 from likelihood.tools import LoRALayer
 
 
-def compare_similarity(arr1: List[Any], arr2: List[Any], threshold: float = 0.05) -> int:
-    """Calculate the similarity between two arrays considering numeric values near to 1 in ratio."""
+def compare_similarity_np(arr1: np.ndarray, arr2: np.ndarray, threshold: float = 0.05) -> int:
+    """Vectorized similarity comparison between two numeric/categorical arrays."""
+    arr1 = np.asarray(arr1)
+    arr2 = np.asarray(arr2)
 
-    def is_similar(a: Any, b: Any) -> bool:
-        if isinstance(a, (int, float)) and isinstance(b, (int, float)):
-            if a == 0 and b == 0:
-                return True
-            if a == 0 or b == 0:
-                return False
-            # For numeric values, check if their ratio is within the threshold range
-            ratio = max(a, b) / min(a, b)
-            return 1 - threshold <= ratio <= 1 + threshold
-        else:
-            return a == b
+    is_numeric = np.vectorize(
+        lambda a, b: isinstance(a, (int, float)) and isinstance(b, (int, float))
+    )(arr1, arr2)
+
+    similarity = np.zeros_like(arr1, dtype=bool)
+
+    if np.any(is_numeric):
+        a_num = arr1[is_numeric].astype(float)
+        b_num = arr2[is_numeric].astype(float)
+
+        both_zero = (a_num == 0) & (b_num == 0)
+        nonzero = ~both_zero & (a_num != 0) & (b_num != 0)
+        ratio = np.zeros_like(a_num)
+        ratio[nonzero] = np.maximum(a_num[nonzero], b_num[nonzero]) / np.minimum(
+            a_num[nonzero], b_num[nonzero]
+        )
+        numeric_similar = both_zero | ((1 - threshold <= ratio) & (ratio <= 1 + threshold))
+
+        similarity[is_numeric] = numeric_similar
+
+    similarity[~is_numeric] = arr1[~is_numeric] == arr2[~is_numeric]
+
+    return np.count_nonzero(similarity)
 
-    return sum(is_similar(a, b) for a, b in zip(arr1, arr2))
+
+def compare_pair(pair, data, similarity, threshold):
+    i, j = pair
+    sim = compare_similarity_np(data[i], data[j], threshold=threshold)
+    return (i, j, 1 if sim >= similarity else 0)
 
 
 def cal_adjacency_matrix(
-    df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
+    df: pd.DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
 ) -> Tuple[dict, np.ndarray]:
-    """Calculates the adjacency matrix for a given DataFrame.
-    The adjacency matrix is a matrix that represents the similarity between each pair of features.
-    The similarity is calculated using the `compare_similarity` function.
-    The resulting matrix is a square matrix with the same number of rows and columns as the rows of the input DataFrame.
+    """
+    Calculates the adjacency matrix for a given DataFrame using parallel processing.
 
     Parameters
     ----------
     df : `DataFrame`
         The input DataFrame containing the features.
-    exclude_subset : `List[str]`, optional
+    exclude_subset : `List[str]`, `optional`
         A list of features to exclude from the calculation of the adjacency matrix.
-    sparse : `bool`, optional
+    sparse : `bool`, `optional`
         Whether to return a sparse matrix or a dense matrix.
     **kwargs : `dict`
         Additional keyword arguments to pass to the `compare_similarity` function.
 
-    Keyword Arguments:
-    ----------
-    similarity: `int`
-        The minimum number of features that must be the same in both arrays to be considered similar.
-    threshold : `float`
-        The threshold value used in the `compare_similarity` function. Default is 0.05.
-
     Returns
     -------
     adj_dict : `dict`
         A dictionary containing the features.
     adjacency_matrix : `ndarray`
         The adjacency matrix.
-    """
 
+    Keyword Arguments:
+    ----------
+    similarity: `int`
+        The minimum number of features that must be the same in both arrays to be considered similar.
+    threshold : `float`
+        The threshold value used in the `compare_similarity` function. Default is 0.05.
+    """
     if len(exclude_subset) > 0:
        columns = [col for col in df.columns if col not in exclude_subset]
        df_ = df[columns].copy()
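
The 1.5.5 rewrite replaces the per-element `is_similar` closure with masked NumPy operations over the whole pair of arrays. A minimal usage sketch (hypothetical; it assumes likelihood 1.5.5 is installed and imports the function from the module shown above):

```python
import numpy as np
from likelihood.graph.nn import compare_similarity_np

a = np.array([1.00, 2.0, "red"], dtype=object)
b = np.array([1.02, 3.0, "red"], dtype=object)

# 1.00 vs 1.02 -> ratio 1.02, inside the default 5% band -> similar
# 2.0  vs 3.0  -> ratio 1.5 -> not similar
# "red" vs "red" -> exact categorical match -> similar
print(compare_similarity_np(a, b))  # 2
```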
@@ -84,14 +99,26 @@ def cal_adjacency_matrix(
     threshold = kwargs.get("threshold", 0.05)
     assert similarity <= df_.shape[1]
 
-    adj_dict = {index: row.tolist() for index, row in df_.iterrows()}
+    data = df_.to_numpy()
+    n = len(data)
 
-    adjacency_matrix = np.zeros((len(df_), len(df_)))
+    adj_dict = {i: data[i].tolist() for i in range(n)}
 
-    for i in range(len(df_)):
-        for j in range(len(df_)):
-            if compare_similarity(adj_dict[i], adj_dict[j], threshold=threshold) >= similarity:
-                adjacency_matrix[i][j] = 1
+    def pair_generator():
+        for i in range(n):
+            for j in range(i, n):
+                yield (i, j)
+
+    with Pool(cpu_count()) as pool:
+        results = pool.starmap(
+            compare_pair, ((pair, data, similarity, threshold) for pair in pair_generator())
+        )
+
+    adjacency_matrix = np.zeros((n, n), dtype=np.uint8)
+    for i, j, val in results:
+        if val:
+            adjacency_matrix[i, j] = 1
+            adjacency_matrix[j, i] = 1
 
     if sparse:
         num_nodes = adjacency_matrix.shape[0]
@@ -103,9 +130,7 @@ def cal_adjacency_matrix(
             indices=indices, values=values, dense_shape=(num_nodes, num_nodes)
         )
 
-        return adj_dict, adjacency_matrix
-    else:
-        return adj_dict, adjacency_matrix
+    return adj_dict, adjacency_matrix
 
 
 class Data:
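
The new `cal_adjacency_matrix` enumerates only the pairs (i, j) with j >= i, farms the comparisons out to a `multiprocessing.Pool`, and mirrors each hit into both halves of the matrix; the redundant `if/else` around the return is also collapsed. A serial sketch of the upper-triangle strategy (hypothetical reference code, not the library's implementation) shows why it is equivalent to the old full double loop:

```python
import numpy as np
from likelihood.graph.nn import compare_similarity_np  # assumes likelihood 1.5.5 is installed

def adjacency_serial(data: np.ndarray, similarity: int, threshold: float = 0.05) -> np.ndarray:
    """Serial reference for the upper-triangle strategy used in 1.5.5."""
    n = len(data)
    adj = np.zeros((n, n), dtype=np.uint8)
    for i in range(n):
        for j in range(i, n):  # j >= i: roughly half the comparisons of the old n*n loop
            if compare_similarity_np(data[i], data[j], threshold=threshold) >= similarity:
                adj[i, j] = adj[j, i] = 1  # similarity is symmetric, so mirror the bit
    return adj
```

Note that `Pool.starmap` pickles each argument tuple, including the full `data` array, once per pair, so for small DataFrames the process overhead can outweigh the halved comparison count.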
@@ -260,12 +285,17 @@ class VanillaGNN(tf.keras.Model):
         val_losses = []
         val_f1_scores = []
 
-        X_train, X_test, y_train, y_test = train_test_split(
-            data.x, data.y, test_size=test_size, shuffle=False
-        )
-        adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [len(X_train), len(X_train)])
+        num_nodes = len(data.x)
+        split_index = int((1 - test_size) * num_nodes)
+
+        X_train, X_test = data.x[:split_index], data.x[split_index:]
+        y_train, y_test = data.y[:split_index], data.y[split_index:]
+
+        adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [split_index, split_index])
         adjacency_test = tf.sparse.slice(
-            data.adjacency, [len(X_train), 0], [len(X_test), len(X_test)]
+            data.adjacency,
+            [split_index, split_index],
+            [num_nodes - split_index, num_nodes - split_index],
         )
 
         batch_starts = np.arange(0, len(X_train), batch_size)
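
Two things change in `fit`: the ordered split no longer goes through sklearn's `train_test_split` (the import is dropped above), and the test slice of the adjacency matrix now starts at `[split_index, split_index]` instead of `[len(X_train), 0]`, so it actually covers the test-node rows and columns. A toy check (hypothetical, 1-D data) that the manual split reproduces the ordered split 1.5.3 relied on, up to floating-point rounding of the split index:

```python
import numpy as np
from sklearn.model_selection import train_test_split

x = np.arange(8)
test_size = 0.25

# Manual ordered split, as in 1.5.5
split_index = int((1 - test_size) * len(x))
manual_train, manual_test = x[:split_index], x[split_index:]

# sklearn ordered split, as in 1.5.3
sk_train, sk_test = train_test_split(x, test_size=test_size, shuffle=False)

print(np.array_equal(manual_train, sk_train))  # True: [0 1 2 3 4 5]
print(np.array_equal(manual_test, sk_test))    # True: [6 7]
```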
@@ -286,10 +316,6 @@ class VanillaGNN(tf.keras.Model):
 
             if epoch % 5 == 0:
                 clear_output(wait=True)
-                warnings.warn(
-                    "It is normal for validation metrics to underperform during training. Use the test method to validate after training.",
-                    UserWarning,
-                )
                 val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
                 val_losses.append(val_loss)
                 val_f1_scores.append(val_f1)
likelihood/models/simulation.py CHANGED
@@ -51,7 +51,6 @@ class SimulationEngine(FeatureSelection):
     """
 
     def __init__(self, use_scaler: bool = False, **kwargs):
-
         self.df = pd.DataFrame()
         self.n_importances = None
         self.use_scaler = use_scaler
@@ -91,7 +90,6 @@ class SimulationEngine(FeatureSelection):
 
         # Categorical column
         if quick_encoder != None:
-
             one_hot = OneHotEncoder()
             y = one_hot.decode(y)
             encoding_dic = quick_encoder.decoding_list[0]
@@ -180,7 +178,6 @@ class SimulationEngine(FeatureSelection):
         ]
 
     def _clean_data(self, df: DataFrame) -> DataFrame:
-
         df.replace([np.inf, -np.inf], np.nan, inplace=True)
         df.replace(" ", np.nan, inplace=True)
         df = check_nan_inf(df)
likelihood/tools/impute.py CHANGED
@@ -71,7 +71,6 @@ class SimpleImputer:
         self.cols_transf = X_impute.columns
         for column in X_impute.columns:
             if X_impute[column].isnull().sum() > 0:
-
                 if not X_impute[column].dtype == "object":
                     min_value = self.params[column]["min"]
                     max_value = self.params[column]["max"]
likelihood/tools/numeric_tools.py CHANGED
@@ -356,13 +356,16 @@ def find_multiples(target: int) -> tuple[int, int] | None:
     Returns
     -------
     tuple[int, int] | None
-        A tuple containing two factors of the target number.
+        If i and i + 1 both divide target, returns (i + 1, target // (i + 1)).
+        Otherwise, returns (i, target // i).
         Returns None if no factors are found.
     """
     for i in range(2, target + 1):
         if target % i == 0:
-            factor = target // i
-            return i, factor
+            if (i + 1) <= target and target % (i + 1) == 0:
+                return i + 1, target // (i + 1)
+            else:
+                return i, target // i
     return None
 
368
371
 
@@ -396,4 +399,9 @@ if __name__ == "__main__":
396
399
  df["index"] = ["A", "B", "C", "D"]
397
400
  print("New correlation coefficient test for pandas DataFrame")
398
401
  values_df = xi_corr(df)
402
+ print(find_multiples(30))
403
+ print(find_multiples(25))
404
+ print(find_multiples(49))
405
+ print(find_multiples(17))
406
+ print(find_multiples(24))
399
407
  breakpoint()
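
Tracing the new branch logic by hand gives the expected output of these smoke tests (the import path follows the package layout in RECORD):

```python
from likelihood.tools.numeric_tools import find_multiples

print(find_multiples(30))  # (3, 10): 2 divides 30 and so does 3, so the i + 1 branch wins
print(find_multiples(25))  # (5, 5): 5 divides 25 but 6 does not
print(find_multiples(49))  # (7, 7)
print(find_multiples(17))  # (17, 1): for a prime, the first divisor >= 2 is target itself
print(find_multiples(24))  # (3, 8)
```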
likelihood/tools/tools.py CHANGED
@@ -1153,7 +1153,6 @@ class FeatureSelection:
         return feature_string + "} "
 
     def _load_data(self, dataset: DataFrame):
-
         if len(self.not_features) > 0:
             self.X = dataset.drop(columns=self.not_features)
 
likelihood-1.5.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: likelihood
-Version: 1.5.3
+Version: 1.5.5
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
@@ -28,6 +28,7 @@ Requires-Dist: seaborn
 Requires-Dist: pyyaml
 Requires-Dist: pandas
 Requires-Dist: corner
+Requires-Dist: tqdm
 Provides-Extra: full
 Requires-Dist: networkx; extra == "full"
 Requires-Dist: pyvis; extra == "full"
likelihood-1.5.5.dist-info/RECORD CHANGED
@@ -2,23 +2,23 @@ likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
 likelihood/main.py,sha256=fcCkGOOWKjfvw2tLVqjuKPV8t0rVCIT9FlbYcOv4EYo,7974
 likelihood/graph/__init__.py,sha256=6TuFDfmXTwpLyHl7_KqBfdzW6zqHjGzIFvymjFPlvjI,21
 likelihood/graph/graph.py,sha256=bLrNMvIh7GOTdPTwnNss8oPZ7cbSHQScAsH_ttmVUK0,3294
-likelihood/graph/nn.py,sha256=EaMmboKriCFnkP48_HLGRAsOZSWxwUlMG0WDGZ4ey1o,11035
+likelihood/graph/nn.py,sha256=uxCxGt1suKmThmEjFope2ew93-WlgvGhgr6RVCHwzhM,11420
 likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
 likelihood/models/hmm.py,sha256=0s0gFySH1u4NjRaZDxiZ8oeTaFhFrw1x0GJxwy3dFrA,6253
 likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
-likelihood/models/simulation.py,sha256=IkYGA6-L1LvSnIlyrVWTzQQu-JnfXml5Tewt-GC05PY,8446
+likelihood/models/simulation.py,sha256=6OD2IXAnbctxtOzUJ2b9vKW7_tdGs4dQYmQQShqsioA,8443
 likelihood/models/utils.py,sha256=dvigPi_hxcs5ntfHr7Y1JvP5ULtMW3kkN0nJpS4orE8,1319
 likelihood/models/deep/__init__.py,sha256=m607FtMP2gAfPtM0mssFXMKyKOqoeYskZ_xIC6dKhr4,47
 likelihood/models/deep/autoencoders.py,sha256=0EIZwDNlZ9NCfQbhQ_KdXkkRwIjUEU-jk0l0u-J1wmA,44212
 likelihood/models/deep/gan.py,sha256=aoSaNO5LvCU62cjxA0AxvnQvE7NSFtrp1Ta4EDJchpo,10874
 likelihood/tools/__init__.py,sha256=N1IhMDzacsGQT2MIYBMBC0zTxes78vC_0gGrwkuPgmg,78
 likelihood/tools/figures.py,sha256=waF0NHIMrctCmaLhcuz5DMcXyRKynmn6aG0XITYCTLc,10940
-likelihood/tools/impute.py,sha256=BwBVFSQkG3uWsZEk1THTmqZc3YhHlDhMXgKIV3sx5Lg,9486
+likelihood/tools/impute.py,sha256=n87Tv-xLUAdPl7BQLFcLWSsXBZbXksahyCayJWMydXc,9485
 likelihood/tools/models_tools.py,sha256=c3-vac-1MYSarYDtfR6XfVC7X_WY9auS7y2_3Z973IQ,8875
-likelihood/tools/numeric_tools.py,sha256=OelCF45QO-zhanX3GmfcdYMfUZxYt353oJ8_gPEdWss,11959
-likelihood/tools/tools.py,sha256=vlQ-peK_z5-MLVnStxlBdl-NfmF6ILxZ6LhBd4K77JI,42282
-likelihood-1.5.3.dist-info/licenses/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
-likelihood-1.5.3.dist-info/METADATA,sha256=K7CXRIaJbwKyvGzwouhojx8ARZinAgEpaZdMb912c_c,2866
-likelihood-1.5.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-likelihood-1.5.3.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
-likelihood-1.5.3.dist-info/RECORD,,
+likelihood/tools/numeric_tools.py,sha256=Hwf-lbqROqPPZ9N7eVzKIDyZxFGQdP53isWxPqpG0eo,12254
+likelihood/tools/tools.py,sha256=FyldbmYNgt4gK89BKgDsya2_EIENwZZwdbBx5pfNhj4,42281
+likelihood-1.5.5.dist-info/licenses/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
+likelihood-1.5.5.dist-info/METADATA,sha256=jtu0BJ0483cmd4DAKqqn_rsSru1-LVS2Wmj998jMkoA,2886
+likelihood-1.5.5.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+likelihood-1.5.5.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
+likelihood-1.5.5.dist-info/RECORD,,
likelihood-1.5.5.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (78.1.0)
+Generator: setuptools (80.7.1)
 Root-Is-Purelib: true
 Tag: py3-none-any