likelihood 1.5.7__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,213 @@
1
+ import logging
2
+ import os
3
+ from typing import List
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
9
+ logging.getLogger("tensorflow").setLevel(logging.ERROR)
10
+ import tensorflow as tf
11
+ from pandas.core.frame import DataFrame
12
+ from sklearn.preprocessing import LabelEncoder
13
+
14
+ tf.get_logger().setLevel("ERROR")
15
+
16
+
17
class CategoricalEmbedder:
    """Encode categorical DataFrame columns as dense embedding vectors.

    Each fitted column gets a ``LabelEncoder`` mapping categories to integer
    ids, plus a randomly initialized embedding matrix of shape
    ``(vocab_size, embedding_dim)`` stored as a ``tf.Variable``.
    """

    def __init__(self, embedding_dim: int = 32):
        # Width of every per-category embedding vector.
        self.embedding_dim = embedding_dim
        # column name -> fitted sklearn LabelEncoder
        self.label_encoders = {}
        # column name -> tf.Variable of shape (vocab_size, embedding_dim)
        self.embeddings = {}

    def fit(self, df: DataFrame, categorical_cols: List[str]) -> None:
        """
        Fit the embeddings on the given data.

        Parameters
        ----------
        df : `DataFrame`
            Pandas DataFrame containing the tabular data.
        categorical_cols : `List[str]`
            List of column names representing categorical features.

        Returns
        -------
        `None`

        Raises
        ------
        ValueError
            If any of `categorical_cols` is missing from `df`.
        """
        df_processed = df.copy()

        # Validate every requested column up front so that no partial
        # encoder/embedding state is stored when one of them is missing.
        for col in categorical_cols:
            if col not in df_processed.columns:
                raise ValueError(f"Column {col} not found in DataFrame")

        for col in categorical_cols:
            # Impute missing values with the column mode before encoding;
            # skipped when the column is entirely NaN (empty mode).
            mode_val = df_processed[col].mode()
            if not mode_val.empty:
                df_processed[col] = df_processed[col].fillna(mode_val[0])

            le = LabelEncoder()
            df_processed[col] = le.fit_transform(df_processed[col])
            self.label_encoders[col] = le

            # One random embedding row per category, values drawn from [0, 1).
            vocab_size = len(le.classes_)
            embedding_matrix = np.random.rand(vocab_size, self.embedding_dim)
            self.embeddings[col] = tf.Variable(embedding_matrix, dtype=tf.float32)

    def transform(self, df: DataFrame, categorical_cols: List[str]):
        """
        Transform the data using the fitted embeddings.

        Parameters
        ----------
        df : `DataFrame`
            Pandas DataFrame containing the tabular data.
        categorical_cols : `List[str]`
            List of column names representing categorical features.

        Returns
        -------
        Transformed Pandas DataFrame with original columns except
        `categorical_cols` replaced by their embedding representations
        (one `{col}_embed_{i}` float column per embedding dimension).

        Raises
        ------
        ValueError
            If a column in `categorical_cols` was never fitted.
        """
        df_processed = df.copy()

        for col in categorical_cols:
            if col not in self.label_encoders:
                raise ValueError(
                    f"Column {col} has not been fitted. Please call fit() on this column first."
                )
            # NOTE(review): imputes with the mode of the *transform-time*
            # data, not the mode seen during fit(); an unseen mode value
            # would make LabelEncoder.transform raise — confirm intended.
            mode_val = df_processed[col].mode()
            if not mode_val.empty:
                df_processed[col] = df_processed[col].fillna(mode_val[0])
            le = self.label_encoders[col]
            df_processed[col] = le.transform(df_processed[col])

        for col in categorical_cols:
            indices_tensor = tf.constant(df_processed[col], dtype=tf.int32)
            embedding_layer = tf.nn.embedding_lookup(
                params=self.embeddings[col], ids=indices_tensor
            )
            if len(embedding_layer.shape) == 1:
                # Defensive: promote a scalar lookup to a single-row batch.
                embedding_layer = tf.expand_dims(embedding_layer, axis=0)

            # Materialize the tensor once; assigning plain ndarray slices
            # keeps the new columns as ordinary float dtypes instead of
            # converting an eager tensor per dimension.
            embedded = embedding_layer.numpy()
            for i in range(self.embedding_dim):
                df_processed[f"{col}_embed_{i}"] = embedded[:, i]
            df_processed.drop(columns=[col], inplace=True)

        return df_processed

    def inverse_transform(self, df: pd.DataFrame, categorical_cols: List[str]):
        """
        Inverse transform the data using the fitted embeddings.

        Parameters
        ----------
        df : `DataFrame`
            Pandas DataFrame containing the tabular data with embedded representations.
        categorical_cols : `List[str]`
            List of column names representing categorical features.

        Returns
        -------
        Transformed Pandas DataFrame with original columns replaced by their
        categorical labels.

        Raises
        ------
        ValueError
            If a column in `categorical_cols` was never fitted.
        """
        df_processed = df.copy()

        for col in categorical_cols:
            if col not in self.label_encoders:
                raise ValueError(
                    f"Column {col} has not been fitted. Please call fit() on this column first."
                )

            embedding_matrix = self.embeddings[col].numpy()
            label_encoder = self.label_encoders[col]

            embedded_columns = [f"{col}_embed_{i}" for i in range(self.embedding_dim)]
            embeddings = df_processed[embedded_columns].values

            # Nearest-neighbor decode: for each row pick the vocabulary entry
            # whose embedding is closest in Euclidean distance. Broadcasting
            # builds an (n_rows, vocab_size, dim) array, reduced over dim.
            distances = np.linalg.norm(embedding_matrix - embeddings[:, np.newaxis], axis=2)
            original_indices = np.argmin(distances, axis=1)
            original_labels = label_encoder.inverse_transform(original_indices)

            df_processed[col] = original_labels
            df_processed.drop(columns=embedded_columns, inplace=True)

        return df_processed

    def save_embeddings(self, path: str) -> None:
        """
        Save the embeddings to a directory.

        Parameters
        ----------
        path : `str`
            Path to the directory where embeddings will be saved.
        """
        os.makedirs(path, exist_ok=True)
        # One .npy file per fitted column.
        for col, embedding in self.embeddings.items():
            np.save(os.path.join(path, f"{col}_embedding.npy"), embedding.numpy())

    def load_embeddings(self, path: str) -> None:
        """
        Load the embeddings from a directory.

        Parameters
        ----------
        path : `str`
            Path to the directory where embeddings are saved.

        Raises
        ------
        FileNotFoundError
            If an embedding file for a fitted column is missing.
        """
        # NOTE: label_encoders must already be populated — only columns with
        # an encoder are loaded.
        for col in self.label_encoders.keys():
            embedding_path = os.path.join(path, f"{col}_embedding.npy")
            if not os.path.exists(embedding_path):
                raise FileNotFoundError(f"Embedding file {embedding_path} not found.")
            embedding_matrix = np.load(embedding_path)
            self.embeddings[col] = tf.Variable(embedding_matrix, dtype=tf.float32)
169
+
170
+
171
+ if __name__ == "__main__":
172
+ data = {
173
+ "color": ["red", "blue", None, "green", "blue"],
174
+ "size": ["S", "M", "XL", "XS", None],
175
+ "price": [10.99, 25.50, 30.00, 8.75, 12.25],
176
+ }
177
+ df = pd.DataFrame(data)
178
+
179
+ # Initialize the embedder
180
+ embedder = CategoricalEmbedder(embedding_dim=3)
181
+
182
+ # Fit the embeddings on the data
183
+ embedder.fit(df, categorical_cols=["color", "size"])
184
+
185
+ # Transform the data using the fitted embeddings
186
+ processed_df = embedder.transform(df, categorical_cols=["color", "size"])
187
+
188
+ print("Processed DataFrame:")
189
+ print(processed_df.head())
190
+
191
+ # Save the embeddings to disk
192
+ embedder.save_embeddings("./embeddings")
193
+
194
+ # Load the embeddings from disk
195
+ new_embedder = CategoricalEmbedder(embedding_dim=3)
196
+ new_embedder.label_encoders = (
197
+ embedder.label_encoders
198
+ ) # Assuming label encodings are consistent across runs
199
+ new_embedder.load_embeddings("./embeddings")
200
+
201
+ # Transform the data using the loaded embeddings
202
+ processed_df_loaded = new_embedder.transform(df, categorical_cols=["color", "size"])
203
+ print("\nProcessed DataFrame with Loaded Embeddings:")
204
+ print(processed_df_loaded.head())
205
+
206
+ # Inverse transform the data
207
+ df_loaded = new_embedder.inverse_transform(
208
+ processed_df_loaded, categorical_cols=["color", "size"]
209
+ )
210
+ print("\nOriginal DataFrame:")
211
+ print(df.head())
212
+ print("\nProcessed DataFrame with Inverse Transform:")
213
+ print(df_loaded.head())
likelihood/tools/tools.py CHANGED
@@ -8,10 +8,15 @@ import matplotlib.pyplot as plt
8
8
  import numpy as np
9
9
  import pandas as pd
10
10
  import yaml
11
+ from packaging import version
11
12
  from pandas.core.frame import DataFrame
12
13
 
13
- # Suppress RankWarning
14
- warnings.simplefilter("ignore", np.RankWarning)
14
+ if version.parse(np.__version__) < version.parse("2.0.0"):
15
+ filter = np.RankWarning
16
+ else:
17
+ filter = np.exceptions.RankWarning
18
+
19
+ warnings.simplefilter("ignore", filter)
15
20
 
16
21
  # -------------------------------------------------------------------------
17
22
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: likelihood
3
- Version: 1.5.7
3
+ Version: 2.0.0
4
4
  Summary: A package that performs the maximum likelihood algorithm.
5
5
  Home-page: https://github.com/jzsmoreno/likelihood/
6
6
  Author: J. A. Moreno-Guerra
@@ -20,9 +20,10 @@ Requires-Dist: pydocstyle>=6.3.0
20
20
  Requires-Dist: flake8>=6.0.0
21
21
  Requires-Dist: isort>=5.12.0
22
22
  Requires-Dist: mypy>=1.4.1
23
- Requires-Dist: numpy<2.0.0
23
+ Requires-Dist: numpy<3.0.0,>=1.26.4
24
24
  Requires-Dist: pydot==2.0.0
25
25
  Requires-Dist: matplotlib
26
+ Requires-Dist: packaging
26
27
  Requires-Dist: graphviz
27
28
  Requires-Dist: seaborn
28
29
  Requires-Dist: pyyaml
@@ -32,7 +33,7 @@ Requires-Dist: tqdm
32
33
  Provides-Extra: full
33
34
  Requires-Dist: networkx; extra == "full"
34
35
  Requires-Dist: pyvis; extra == "full"
35
- Requires-Dist: tensorflow==2.15.0; extra == "full"
36
+ Requires-Dist: tensorflow>=2.15.0; extra == "full"
36
37
  Requires-Dist: keras-tuner; extra == "full"
37
38
  Requires-Dist: scikit-learn; extra == "full"
38
39
  Dynamic: author
@@ -0,0 +1,30 @@
1
+ likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
2
+ likelihood/main.py,sha256=fcCkGOOWKjfvw2tLVqjuKPV8t0rVCIT9FlbYcOv4EYo,7974
3
+ likelihood/graph/__init__.py,sha256=vUY4pKlnm3eSVTXd2d-5JDPawhqGNRIKRhaHIobsNws,188
4
+ likelihood/graph/_nn.py,sha256=Sh7dRz8QSI08Ydfw9e--uCxc4KMtHUsCz_-C-loXklQ,13883
5
+ likelihood/graph/graph.py,sha256=bLrNMvIh7GOTdPTwnNss8oPZ7cbSHQScAsH_ttmVUK0,3294
6
+ likelihood/graph/nn.py,sha256=uxCxGt1suKmThmEjFope2ew93-WlgvGhgr6RVCHwzhM,11420
7
+ likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
8
+ likelihood/models/hmm.py,sha256=0s0gFySH1u4NjRaZDxiZ8oeTaFhFrw1x0GJxwy3dFrA,6253
9
+ likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
10
+ likelihood/models/simulation.py,sha256=xsl4mJ2qFCuZR_B9LfQcLjV6OtONU1zyESX3CCUfOiw,8619
11
+ likelihood/models/utils.py,sha256=dvigPi_hxcs5ntfHr7Y1JvP5ULtMW3kkN0nJpS4orE8,1319
12
+ likelihood/models/deep/__init__.py,sha256=I55FciI0BfljYdhW2OGNqcpYV57FhPZETZX7Y1y9GVQ,303
13
+ likelihood/models/deep/_autoencoders.py,sha256=CeD79YzU7DdPd92wUNG_EtPVQOBgsgYoC4uS2JF3b6o,30939
14
+ likelihood/models/deep/_predictor.py,sha256=XI4QfVM7PS_60zYtmi-V8UzNDrASFiDMVPmV17BB8lM,27984
15
+ likelihood/models/deep/autoencoders.py,sha256=muUBH9BclOK8ViI7PijyMOBBLVox6uwuIabyJvpU5qw,30729
16
+ likelihood/models/deep/gan.py,sha256=rTnaLmIPjsKg6_0B8JZOVwPxdx59rHmqvzDitdJMCQ4,10924
17
+ likelihood/models/deep/predictor.py,sha256=q5tPaAbF7s5XIcxVr6fyHTQdZa9tlixO9vb9a9Cw0wM,27831
18
+ likelihood/models/deep/rl.py,sha256=9dhhnVTIETi9zvVeyOXYo1hl-LQJezmv0rgsUq11Qwc,11611
19
+ likelihood/tools/__init__.py,sha256=N1IhMDzacsGQT2MIYBMBC0zTxes78vC_0gGrwkuPgmg,78
20
+ likelihood/tools/cat_embed.py,sha256=SJ7o1vbrNYp21fLLcjRnWpUDcz1nVSe8TmMvsLIz5CI,7346
21
+ likelihood/tools/figures.py,sha256=waF0NHIMrctCmaLhcuz5DMcXyRKynmn6aG0XITYCTLc,10940
22
+ likelihood/tools/impute.py,sha256=n87Tv-xLUAdPl7BQLFcLWSsXBZbXksahyCayJWMydXc,9485
23
+ likelihood/tools/models_tools.py,sha256=c3-vac-1MYSarYDtfR6XfVC7X_WY9auS7y2_3Z973IQ,8875
24
+ likelihood/tools/numeric_tools.py,sha256=Hwf-lbqROqPPZ9N7eVzKIDyZxFGQdP53isWxPqpG0eo,12254
25
+ likelihood/tools/tools.py,sha256=GKZsqjyO5tGXWGSfn3jlQBTjRlmBv2byfvpu-QclUx0,42188
26
+ likelihood-2.0.0.dist-info/licenses/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
27
+ likelihood-2.0.0.dist-info/METADATA,sha256=Ziysy1MQuW77OHHd1UzMtlfeUT9wsdgCl6rxW3uLBEE,2917
28
+ likelihood-2.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
29
+ likelihood-2.0.0.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
30
+ likelihood-2.0.0.dist-info/RECORD,,
@@ -1,25 +0,0 @@
1
- likelihood/__init__.py,sha256=5C0hapdsk85XZhN_rssRAEFpkRRuKNtj6cyRbqD2_gM,994
2
- likelihood/main.py,sha256=fcCkGOOWKjfvw2tLVqjuKPV8t0rVCIT9FlbYcOv4EYo,7974
3
- likelihood/graph/__init__.py,sha256=6TuFDfmXTwpLyHl7_KqBfdzW6zqHjGzIFvymjFPlvjI,21
4
- likelihood/graph/graph.py,sha256=bLrNMvIh7GOTdPTwnNss8oPZ7cbSHQScAsH_ttmVUK0,3294
5
- likelihood/graph/nn.py,sha256=uxCxGt1suKmThmEjFope2ew93-WlgvGhgr6RVCHwzhM,11420
6
- likelihood/models/__init__.py,sha256=e6nB4w47w0Q9DrAFeP3OcUgcoHOtf7Il4mBhgf4AARg,52
7
- likelihood/models/hmm.py,sha256=0s0gFySH1u4NjRaZDxiZ8oeTaFhFrw1x0GJxwy3dFrA,6253
8
- likelihood/models/regression.py,sha256=9cakyGlJCEO6WfpoKLh3GxdXQeQp7cUvJIkQ5odT0TA,9404
9
- likelihood/models/simulation.py,sha256=6OD2IXAnbctxtOzUJ2b9vKW7_tdGs4dQYmQQShqsioA,8443
10
- likelihood/models/utils.py,sha256=dvigPi_hxcs5ntfHr7Y1JvP5ULtMW3kkN0nJpS4orE8,1319
11
- likelihood/models/deep/__init__.py,sha256=UV_VYhySvrNnB4a0VXYM4wK3KKF7ytjLFFfwvnaZWaA,82
12
- likelihood/models/deep/autoencoders.py,sha256=9-ZOKbS02tojCufg_Fbd5_Z48pSFSqZnfZZJVohNqdk,29985
13
- likelihood/models/deep/gan.py,sha256=aoSaNO5LvCU62cjxA0AxvnQvE7NSFtrp1Ta4EDJchpo,10874
14
- likelihood/models/deep/predictor.py,sha256=Z6GVm9ciz90cMcp4Q6Lvm-_8_9ZOxX1kBquReW2aGqM,27688
15
- likelihood/tools/__init__.py,sha256=N1IhMDzacsGQT2MIYBMBC0zTxes78vC_0gGrwkuPgmg,78
16
- likelihood/tools/figures.py,sha256=waF0NHIMrctCmaLhcuz5DMcXyRKynmn6aG0XITYCTLc,10940
17
- likelihood/tools/impute.py,sha256=n87Tv-xLUAdPl7BQLFcLWSsXBZbXksahyCayJWMydXc,9485
18
- likelihood/tools/models_tools.py,sha256=c3-vac-1MYSarYDtfR6XfVC7X_WY9auS7y2_3Z973IQ,8875
19
- likelihood/tools/numeric_tools.py,sha256=Hwf-lbqROqPPZ9N7eVzKIDyZxFGQdP53isWxPqpG0eo,12254
20
- likelihood/tools/tools.py,sha256=lk9BIskjUKYQ1XVwARm9jAjHuLQ4UO68aZY8oxkzk5c,42056
21
- likelihood-1.5.7.dist-info/licenses/LICENSE,sha256=XWHWt9egYEUHGPTnlcZfJKLPmysacOwdiLj_-J7Z9ew,1066
22
- likelihood-1.5.7.dist-info/METADATA,sha256=V8yQ5NJPbMyxOB7sICsp5QCkZ8MZhxkfS-4WCWMrFG0,2883
23
- likelihood-1.5.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
24
- likelihood-1.5.7.dist-info/top_level.txt,sha256=KDiBLr870YTxqLFqObTOSrTK10uw8dFsITSNLlte3PA,11
25
- likelihood-1.5.7.dist-info/RECORD,,