likelihood-1.5.3-py3-none-any.whl → likelihood-1.5.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
likelihood/graph/nn.py CHANGED
@@ -5,6 +5,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 logging.getLogger("tensorflow").setLevel(logging.ERROR)
 
 import warnings
+from multiprocessing import Pool, cpu_count
 from typing import Any, List, Tuple
 
 import numpy as np
@@ -13,65 +14,79 @@ import tensorflow as tf
 from IPython.display import clear_output
 from pandas.core.frame import DataFrame
 from sklearn.metrics import f1_score
-from sklearn.model_selection import train_test_split
 
 tf.get_logger().setLevel("ERROR")
 
 from likelihood.tools import LoRALayer
 
 
-def compare_similarity(arr1: List[Any], arr2: List[Any], threshold: float = 0.05) -> int:
-    """Calculate the similarity between two arrays considering numeric values near to 1 in ratio."""
+def compare_similarity_np(arr1: np.ndarray, arr2: np.ndarray, threshold: float = 0.05) -> int:
+    """Vectorized similarity comparison between two numeric/categorical arrays."""
+    arr1 = np.asarray(arr1)
+    arr2 = np.asarray(arr2)
 
-    def is_similar(a: Any, b: Any) -> bool:
-        if isinstance(a, (int, float)) and isinstance(b, (int, float)):
-            if a == 0 and b == 0:
-                return True
-            if a == 0 or b == 0:
-                return False
-            # For numeric values, check if their ratio is within the threshold range
-            ratio = max(a, b) / min(a, b)
-            return 1 - threshold <= ratio <= 1 + threshold
-        else:
-            return a == b
+    is_numeric = np.vectorize(
+        lambda a, b: isinstance(a, (int, float)) and isinstance(b, (int, float))
+    )(arr1, arr2)
+
+    similarity = np.zeros_like(arr1, dtype=bool)
+
+    if np.any(is_numeric):
+        a_num = arr1[is_numeric].astype(float)
+        b_num = arr2[is_numeric].astype(float)
+
+        both_zero = (a_num == 0) & (b_num == 0)
+        nonzero = ~both_zero & (a_num != 0) & (b_num != 0)
+        ratio = np.zeros_like(a_num)
+        ratio[nonzero] = np.maximum(a_num[nonzero], b_num[nonzero]) / np.minimum(
+            a_num[nonzero], b_num[nonzero]
+        )
+        numeric_similar = both_zero | ((1 - threshold <= ratio) & (ratio <= 1 + threshold))
+
+        similarity[is_numeric] = numeric_similar
+
+    similarity[~is_numeric] = arr1[~is_numeric] == arr2[~is_numeric]
+
+    return np.count_nonzero(similarity)
 
-    return sum(is_similar(a, b) for a, b in zip(arr1, arr2))
+
+def compare_pair(pair, data, similarity, threshold):
+    i, j = pair
+    sim = compare_similarity_np(data[i], data[j], threshold=threshold)
+    return (i, j, 1 if sim >= similarity else 0)
 
 
 def cal_adjacency_matrix(
-    df: DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
+    df: pd.DataFrame, exclude_subset: List[str] = [], sparse: bool = True, **kwargs
 ) -> Tuple[dict, np.ndarray]:
-    """Calculates the adjacency matrix for a given DataFrame.
-    The adjacency matrix is a matrix that represents the similarity between each pair of features.
-    The similarity is calculated using the `compare_similarity` function.
-    The resulting matrix is a square matrix with the same number of rows and columns as the rows of the input DataFrame.
+    """
+    Calculates the adjacency matrix for a given DataFrame using parallel processing.
 
     Parameters
     ----------
     df : `DataFrame`
         The input DataFrame containing the features.
-    exclude_subset : `List[str]`, optional
+    exclude_subset : `List[str]`, `optional`
         A list of features to exclude from the calculation of the adjacency matrix.
-    sparse : `bool`, optional
+    sparse : `bool`, `optional`
         Whether to return a sparse matrix or a dense matrix.
     **kwargs : `dict`
         Additional keyword arguments to pass to the `compare_similarity` function.
 
-    Keyword Arguments:
-    ----------
-    similarity: `int`
-        The minimum number of features that must be the same in both arrays to be considered similar.
-    threshold : `float`
-        The threshold value used in the `compare_similarity` function. Default is 0.05.
-
     Returns
     -------
     adj_dict : `dict`
         A dictionary containing the features.
     adjacency_matrix : `ndarray`
         The adjacency matrix.
-    """
 
+    Keyword Arguments:
+    ----------
+    similarity: `int`
+        The minimum number of features that must be the same in both arrays to be considered similar.
+    threshold : `float`
+        The threshold value used in the `compare_similarity` function. Default is 0.05.
+    """
     if len(exclude_subset) > 0:
        columns = [col for col in df.columns if col not in exclude_subset]
        df_ = df[columns].copy()
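
The 1.5.5 rewrite replaces the per-element `is_similar` closure with masked NumPy operations over the whole pair of arrays. A minimal usage sketch (hypothetical; it assumes likelihood 1.5.5 is installed and imports the function from the module shown above):

```python
import numpy as np
from likelihood.graph.nn import compare_similarity_np

a = np.array([1.00, 2.0, "red"], dtype=object)
b = np.array([1.02, 3.0, "red"], dtype=object)

# 1.00 vs 1.02 -> ratio 1.02, inside the default 5% band -> similar
# 2.0  vs 3.0  -> ratio 1.5 -> not similar
# "red" vs "red" -> exact categorical match -> similar
print(compare_similarity_np(a, b))  # 2
```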
@@ -84,14 +99,26 @@ def cal_adjacency_matrix(
     threshold = kwargs.get("threshold", 0.05)
     assert similarity <= df_.shape[1]
 
-    adj_dict = {index: row.tolist() for index, row in df_.iterrows()}
+    data = df_.to_numpy()
+    n = len(data)
 
-    adjacency_matrix = np.zeros((len(df_), len(df_)))
+    adj_dict = {i: data[i].tolist() for i in range(n)}
 
-    for i in range(len(df_)):
-        for j in range(len(df_)):
-            if compare_similarity(adj_dict[i], adj_dict[j], threshold=threshold) >= similarity:
-                adjacency_matrix[i][j] = 1
+    def pair_generator():
+        for i in range(n):
+            for j in range(i, n):
+                yield (i, j)
+
+    with Pool(cpu_count()) as pool:
+        results = pool.starmap(
+            compare_pair, ((pair, data, similarity, threshold) for pair in pair_generator())
+        )
+
+    adjacency_matrix = np.zeros((n, n), dtype=np.uint8)
+    for i, j, val in results:
+        if val:
+            adjacency_matrix[i, j] = 1
+            adjacency_matrix[j, i] = 1
 
     if sparse:
         num_nodes = adjacency_matrix.shape[0]
@@ -103,9 +130,7 @@ def cal_adjacency_matrix(
             indices=indices, values=values, dense_shape=(num_nodes, num_nodes)
         )
 
-        return adj_dict, adjacency_matrix
-    else:
-        return adj_dict, adjacency_matrix
+    return adj_dict, adjacency_matrix
 
 
 class Data:
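
The new `cal_adjacency_matrix` enumerates only the pairs (i, j) with j >= i, farms the comparisons out to a `multiprocessing.Pool`, and mirrors each hit into both halves of the matrix; the redundant `if/else` around the return is also collapsed. A serial sketch of the upper-triangle strategy (hypothetical reference code, not the library's implementation) shows why it is equivalent to the old full double loop:

```python
import numpy as np
from likelihood.graph.nn import compare_similarity_np  # assumes likelihood 1.5.5 is installed

def adjacency_serial(data: np.ndarray, similarity: int, threshold: float = 0.05) -> np.ndarray:
    """Serial reference for the upper-triangle strategy used in 1.5.5."""
    n = len(data)
    adj = np.zeros((n, n), dtype=np.uint8)
    for i in range(n):
        for j in range(i, n):  # j >= i: roughly half the comparisons of the old n*n loop
            if compare_similarity_np(data[i], data[j], threshold=threshold) >= similarity:
                adj[i, j] = adj[j, i] = 1  # similarity is symmetric, so mirror the bit
    return adj
```

Note that `Pool.starmap` pickles each argument tuple, including the full `data` array, once per pair, so for small DataFrames the process overhead can outweigh the halved comparison count.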
@@ -260,12 +285,17 @@ class VanillaGNN(tf.keras.Model):
         val_losses = []
         val_f1_scores = []
 
-        X_train, X_test, y_train, y_test = train_test_split(
-            data.x, data.y, test_size=test_size, shuffle=False
-        )
-        adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [len(X_train), len(X_train)])
+        num_nodes = len(data.x)
+        split_index = int((1 - test_size) * num_nodes)
+
+        X_train, X_test = data.x[:split_index], data.x[split_index:]
+        y_train, y_test = data.y[:split_index], data.y[split_index:]
+
+        adjacency_train = tf.sparse.slice(data.adjacency, [0, 0], [split_index, split_index])
         adjacency_test = tf.sparse.slice(
-            data.adjacency, [len(X_train), 0], [len(X_test), len(X_test)]
+            data.adjacency,
+            [split_index, split_index],
+            [num_nodes - split_index, num_nodes - split_index],
         )
 
         batch_starts = np.arange(0, len(X_train), batch_size)
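
Two things change in `fit`: the ordered split no longer goes through sklearn's `train_test_split` (the import is dropped above), and the test slice of the adjacency matrix now starts at `[split_index, split_index]` instead of `[len(X_train), 0]`, so it actually covers the test-node rows and columns. A toy check (hypothetical, 1-D data) that the manual split reproduces the ordered split 1.5.3 relied on, up to floating-point rounding of the split index:

```python
import numpy as np
from sklearn.model_selection import train_test_split

x = np.arange(8)
test_size = 0.25

# Manual ordered split, as in 1.5.5
split_index = int((1 - test_size) * len(x))
manual_train, manual_test = x[:split_index], x[split_index:]

# sklearn ordered split, as in 1.5.3
sk_train, sk_test = train_test_split(x, test_size=test_size, shuffle=False)

print(np.array_equal(manual_train, sk_train))  # True: [0 1 2 3 4 5]
print(np.array_equal(manual_test, sk_test))    # True: [6 7]
```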
@@ -286,10 +316,6 @@ class VanillaGNN(tf.keras.Model):
 
             if epoch % 5 == 0:
                 clear_output(wait=True)
-                warnings.warn(
-                    "It is normal for validation metrics to underperform during training. Use the test method to validate after training.",
-                    UserWarning,
-                )
                 val_loss, val_f1 = self.evaluate(X_test, adjacency_test, y_test)
                 val_losses.append(val_loss)
                 val_f1_scores.append(val_f1)
likelihood/models/simulation.py CHANGED
@@ -51,7 +51,6 @@ class SimulationEngine(FeatureSelection):
     """
 
     def __init__(self, use_scaler: bool = False, **kwargs):
-
         self.df = pd.DataFrame()
         self.n_importances = None
         self.use_scaler = use_scaler
@@ -91,7 +90,6 @@ class SimulationEngine(FeatureSelection):
 
         # Categorical column
         if quick_encoder != None:
-
             one_hot = OneHotEncoder()
             y = one_hot.decode(y)
             encoding_dic = quick_encoder.decoding_list[0]
@@ -180,7 +178,6 @@ class SimulationEngine(FeatureSelection):
         ]
 
     def _clean_data(self, df: DataFrame) -> DataFrame:
-
         df.replace([np.inf, -np.inf], np.nan, inplace=True)
         df.replace(" ", np.nan, inplace=True)
         df = check_nan_inf(df)
likelihood/tools/impute.py CHANGED
@@ -71,7 +71,6 @@ class SimpleImputer:
         self.cols_transf = X_impute.columns
         for column in X_impute.columns:
             if X_impute[column].isnull().sum() > 0:
-
                 if not X_impute[column].dtype == "object":
                     min_value = self.params[column]["min"]
                     max_value = self.params[column]["max"]
likelihood/tools/numeric_tools.py CHANGED
@@ -356,13 +356,16 @@ def find_multiples(target: int) -> tuple[int, int] | None:
     Returns
     -------
     tuple[int, int] | None
-        A tuple containing two factors of the target number.
+        If i and i + 1 both divide target, returns (i + 1, target // (i + 1)).
+        Otherwise, returns (i, target // i).
         Returns None if no factors are found.
     """
     for i in range(2, target + 1):
         if target % i == 0:
-            factor = target // i
-            return i, factor
+            if (i + 1) <= target and target % (i + 1) == 0:
+                return i + 1, target // (i + 1)
+            else:
+                return i, target // i
     return None
 
368
371
 
@@ -396,4 +399,9 @@ if __name__ == "__main__":
396
399
  df["index"] = ["A", "B", "C", "D"]
397
400
  print("New correlation coefficient test for pandas DataFrame")
398
401
  values_df = xi_corr(df)
402
+ print(find_multiples(30))
403
+ print(find_multiples(25))
404
+ print(find_multiples(49))
405
+ print(find_multiples(17))
406
+ print(find_multiples(24))
399
407
  breakpoint()
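
Tracing the new branch logic by hand gives the expected output of these smoke tests (the import path follows the package layout in RECORD):

```python
from likelihood.tools.numeric_tools import find_multiples

print(find_multiples(30))  # (3, 10): 2 divides 30 and so does 3, so the i + 1 branch wins
print(find_multiples(25))  # (5, 5): 5 divides 25 but 6 does not
print(find_multiples(49))  # (7, 7)
print(find_multiples(17))  # (17, 1): for a prime, the first divisor >= 2 is target itself
print(find_multiples(24))  # (3, 8)
```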
likelihood/tools/tools.py CHANGED
@@ -1153,7 +1153,6 @@ class FeatureSelection:
         return feature_string + "} "
 
     def _load_data(self, dataset: DataFrame):
-
         if len(self.not_features) > 0:
             self.X = dataset.drop(columns=self.not_features)
 
likelihood-1.5.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: likelihood
-Version: 1.5.3
+Version: 1.5.5
 Summary: A package that performs the maximum likelihood algorithm.
 Home-page: https://github.com/jzsmoreno/likelihood/
 Author: J. A. Moreno-Guerra
@@ -28,6 +28,7 @@ Requires-Dist: seaborn
 Requires-Dist: pyyaml
 Requires-Dist: pandas
 Requires-Dist: corner
+Requires-Dist: tqdm
 Provides-Extra: full
 Requires-Dist: networkx; extra == "full"
 Requires-Dist: pyvis; extra == "full"
likelihood-1.5.5.dist-info/RECORD CHANGED
@@ -2,23 +2,23 @@ likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
 likelihood/main.py,sha256=fcCkGOOWKjfvw2tLVqjuKPV8t0rVCIT9FlbYcOv4EYo,7974
 likelihood/graph/__init__.py,sha256=6TuFDfmXTwpLyHl7_KqBfdzW6zqHjGzIFvymjFPlvjI,21
 likelihood/graph/graph.py,sha256=bLrNMvIh7GOTdPTwnNss8oPZ7cbSHQScAsH_ttmVUK0,3294
-likelihood/graph/nn.py,sha256=EaMmboKriCFnkP48_HLGRAsOZSWxwUlMG0WDGZ4ey1o,11035
+likelihood/graph/nn.py,sha256=uxCxGt1suKmThmEjFope2ew93-WlgvGhgr6RVCHwzhM,11420
 likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
 likelihood/models/hmm.py,sha256=0s0gFySH1u4NjRaZDxiZ8oeTaFhFrw1x0GJxwy3dFrA,6253
 likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
-likelihood/models/simulation.py,sha256=IkYGA6-L1LvSnIlyrVWTzQQu-JnfXml5Tewt-GC05PY,8446
+likelihood/models/simulation.py,sha256=6OD2IXAnbctxtOzUJ2b9vKW7_tdGs4dQYmQQShqsioA,8443
 likelihood/models/utils.py,sha256=dvigPi_hxcs5ntfHr7Y1JvP5ULtMW3kkN0nJpS4orE8,1319
 likelihood/models/deep/__init__.py,sha256=m607FtMP2gAfPtM0mssFXMKyKOqoeYskZ_xIC6dKhr4,47
 likelihood/models/deep/autoencoders.py,sha256=0EIZwDNlZ9NCfQbhQ_KdXkkRwIjUEU-jk0l0u-J1wmA,44212
 likelihood/models/deep/gan.py,sha256=aoSaNO5LvCU62cjxA0AxvnQvE7NSFtrp1Ta4EDJchpo,10874
 likelihood/tools/__init__.py,sha256=N1IhMDzacsGQT2MIYBMBC0zTxes78vC_0gGrwkuPgmg,78
 likelihood/tools/figures.py,sha256=waF0NHIMrctCmaLhcuz5DMcXyRKynmn6aG0XITYCTLc,10940
-likelihood/tools/impute.py,sha256=BwBVFSQkG3uWsZEk1THTmqZc3YhHlDhMXgKIV3sx5Lg,9486
+likelihood/tools/impute.py,sha256=n87Tv-xLUAdPl7BQLFcLWSsXBZbXksahyCayJWMydXc,9485
 likelihood/tools/models_tools.py,sha256=c3-vac-1MYSarYDtfR6XfVC7X_WY9auS7y2_3Z973IQ,8875
-likelihood/tools/numeric_tools.py,sha256=OelCF45QO-zhanX3GmfcdYMfUZxYt353oJ8_gPEdWss,11959
-likelihood/tools/tools.py,sha256=vlQ-peK_z5-MLVnStxlBdl-NfmF6ILxZ6LhBd4K77JI,42282
-likelihood-1.5.3.dist-info/licenses/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
-likelihood-1.5.3.dist-info/METADATA,sha256=K7CXRIaJbwKyvGzwouhojx8ARZinAgEpaZdMb912c_c,2866
-likelihood-1.5.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-likelihood-1.5.3.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
-likelihood-1.5.3.dist-info/RECORD,,
+likelihood/tools/numeric_tools.py,sha256=Hwf-lbqROqPPZ9N7eVzKIDyZxFGQdP53isWxPqpG0eo,12254
+likelihood/tools/tools.py,sha256=FyldbmYNgt4gK89BKgDsya2_EIENwZZwdbBx5pfNhj4,42281
+likelihood-1.5.5.dist-info/licenses/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
+likelihood-1.5.5.dist-info/METADATA,sha256=jtu0BJ0483cmd4DAKqqn_rsSru1-LVS2Wmj998jMkoA,2886
+likelihood-1.5.5.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+likelihood-1.5.5.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
+likelihood-1.5.5.dist-info/RECORD,,
likelihood-1.5.5.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (78.1.0)
+Generator: setuptools (80.7.1)
 Root-Is-Purelib: true
 Tag: py3-none-any