instmodel 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ # MANIFEST.in
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.2
2
+ Name: instmodel
3
+ Version: 0.1.0
4
+ Summary: A package that provides advanced coding tools for training and deploying AI
5
+ Home-page:
6
+ Author: Joao Ferreira
7
+ Author-email: joaoprcf@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Operating System :: OS Independent
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: tensorflow==2.17.0
12
+ Requires-Dist: numpy
13
+ Requires-Dist: pandas
14
+ Requires-Dist: scipy
15
+ Requires-Dist: scikit-learn
16
+
17
+ This package allows the easy creation of training models with a Keras backend and deploys them directly as instruction models
@@ -0,0 +1 @@
1
+ This package allows the easy creation of training models with a Keras backend and deploys them directly as instruction models
@@ -0,0 +1,4 @@
1
+ from . import embeddings
2
+ from . import instruction_model
3
+ from . import training_utils
4
+ from . import model
@@ -0,0 +1,359 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ from .instruction_model import create_instructions_model_from_transformation_list
5
+ from .model import (
6
+ InputBuffer,
7
+ Dense,
8
+ Concatenate,
9
+ ModelGraph,
10
+ )
11
+
12
+
13
def one_hot_encode(data, one_hot, k):
    """One-hot encode column *k* of *data* against the training-time dummy columns.

    Parameters:
        data: DataFrame containing a raw categorical column ``k``.
        one_hot: training-set ``pd.get_dummies`` DataFrame whose columns define
            the expected dummy-column layout (``{k}_{value}``).
        k: name of the categorical column to encode and drop.

    Returns:
        A new DataFrame with ``k`` replaced by the one-hot columns, aligned to
        ``one_hot.columns`` (missing categories filled with 0).

    Raises:
        ValueError: if ``data[k]`` contains category values that produce dummy
            columns not present in the training layout.
    """
    one_hot_columns = one_hot.columns

    # Build the raw dummies FIRST so unseen IDs are still visible. (Checking
    # after reindex() is useless: reindex forces the column set, so the old
    # subset test could never fail and unknown IDs were silently dropped.)
    raw_dummies = pd.get_dummies(data[k], prefix=k, dtype=np.float32)
    unknown = set(raw_dummies.columns) - set(one_hot_columns)
    if unknown:
        raise ValueError(
            f"Validation set contains IDs not present in training set: {unknown}"
        )

    # Align to the training layout; categories absent from this data get 0.
    val_one_hot = raw_dummies.reindex(columns=one_hot_columns, fill_value=0)

    assert val_one_hot.shape[1] == one_hot.shape[1]
    assert (val_one_hot.columns == one_hot.columns).all()

    data = data.join(val_one_hot)
    data = data.drop(k, axis=1)

    return data
30
+
31
+
32
def embbed_predict(data, one_hot, model):
    """Predict embeddings for the one-hot segment of *data* using *model*.

    Parameters:
        data: DataFrame containing (at least) the one-hot columns.
        one_hot: training-time dummy DataFrame defining the column layout.
        model: object exposing ``predict_on_batch`` (see ``embbed_predict_idx``).

    Returns:
        The model output for each row's active category.
    """
    one_hot_columns = one_hot.columns
    # BUG FIX: DataFrame.idxmax(axis=1) returns column LABELS (strings such as
    # "k_1.0"), but embbed_predict_idx indexes a numpy array with them
    # (one_hot[np.arange(n), idxs]), which requires integer POSITIONS.
    # Use a positional argmax over the selected columns instead.
    idxs = data[one_hot_columns].to_numpy().argmax(axis=1)

    return embbed_predict_idx(idxs, one_hot.shape[1], model)
37
+
38
+
39
def embbed_predict_idx(idxs, one_hot_len, model):
    """Run *model* on one-hot vectors built from category positions.

    Parameters:
        idxs: either a single int position, or a sequence of int positions.
        one_hot_len: width of each one-hot vector.
        model: object exposing ``predict_on_batch(batch) -> array``.

    Returns:
        For a single int: the model's output row for that one-hot vector.
        For a sequence: the model's batched output, one row per position.
    """
    if isinstance(idxs, int):
        # Single position: build a 1-row batch and unwrap the result.
        row = np.zeros((1, one_hot_len), dtype=np.float32)
        row[0, idxs] = 1.0
        return model.predict_on_batch(row)[0]

    # Batch of positions: set one hot bit per row via fancy indexing.
    batch = np.zeros((len(idxs), one_hot_len), dtype=np.float32)
    batch[np.arange(len(idxs)), idxs] = 1.0
    return model.predict_on_batch(batch)
49
+
50
+
51
def get_embeddings_dict(emb_models):
    """Materialize every category's embedding vector from the embedding models.

    Parameters:
        emb_models: mapping ``key -> {"model": ..., "one_hot": DataFrame}``
            where the one-hot columns are named ``{key}_{category_value}``.

    Returns:
        ``{key: {category_id (int): embedding (list[float])}}`` — one entry per
        one-hot column, with the category id parsed from the column suffix.

    Raises:
        ValueError: if a one-hot column does not carry the ``{key}_`` prefix.
    """
    result = {}

    for key in sorted(emb_models.keys()):
        entry = emb_models[key]
        model = entry["model"]
        one_hot_df = entry["one_hot"]
        prefix = key + "_"

        per_category = {}
        for position, column in enumerate(one_hot_df.columns):
            if not column.startswith(prefix):
                raise ValueError(
                    f"Column '{column}' does not start with expected prefix '{prefix}'"
                )

            # Column suffix is the raw category value (e.g. "-1.0"); round to
            # the integer id used as the dictionary key.
            category_id = round(float(column[len(prefix):]))

            per_category[category_id] = embbed_predict_idx(
                position, one_hot_df.shape[1], model
            ).tolist()

        result[key] = per_category

    return result
79
+
80
+
81
class EmbeddingWrapper:
    """Pipeline that one-hot encodes mapped categorical columns and wires small
    embedding sub-models over them.

    ``mapping`` maps each categorical column name to its embedding size. The
    constructor immediately runs the two preparation steps:
    ``create_one_hot_encoding`` (stage-1 data: one-hot expanded) and
    ``create_emb_step`` (a combined graph applying all embedding sub-models).
    ``create_instruction_model`` later produces stage-2 data where each one-hot
    segment is replaced by its embedded columns, plus a transformation model.
    """

    def __init__(
        self,
        training_data,
        validation_data,
        input_features,
        output_features,
        mapping,
        fraction=1.0,
    ):
        # Only pandas DataFrames are supported for both datasets.
        assert isinstance(training_data, pd.DataFrame) and isinstance(
            validation_data, pd.DataFrame
        )
        # Deterministic processing order for the mapped categorical columns.
        self.map_key_sorted = sorted(mapping.keys())
        # Keep pristine copies; all later stages work on derived frames.
        self.original_training_data = training_data.copy()
        self.original_validation_data = validation_data.copy()
        self.mapping = mapping
        self.initial_input_features = input_features
        self.initial_output_features = output_features
        # Fraction of training rows duplicated per mapped column with that
        # column forced to -1.0 (synthesizes "unknown category" examples).
        self.fraction = fraction

        self.create_one_hot_encoding()
        self.create_emb_step()

    def create_instruction_model(self):
        """Replace each one-hot segment with embedded columns and build the
        instruction transformation model.

        Returns the transformation model from
        ``create_instructions_model_from_transformation_list``; also populates
        the ``s2_*`` attributes (stage-2 data/features).
        """
        embeddings_dict = get_embeddings_dict(self.emb_models)

        transformation_list = []

        # The index where one-hot encoded columns start: the non-mapped input
        # features occupy the first columns, one per feature not in `mapping`.
        data_index = len(self.initial_input_features) - len(self.mapping)
        segment_start = data_index

        current_columns = list(self.s1_training_data_X.columns)
        self.s2_training_data_X = self.s1_training_data_X.copy()
        self.s2_validation_data_X = self.s1_validation_data_X.copy()

        # NOTE(review): this iterates `self.mapping.items()` (insertion order)
        # while the one-hot segments were laid out in `self.map_key_sorted`
        # order in create_one_hot_encoding — these only agree when the mapping
        # was inserted in sorted key order; confirm with callers.
        for k, v in self.mapping.items():
            # Validate segment_start and segment_size
            if k not in self.emb_models or "one_hot" not in self.emb_models[k]:
                raise ValueError(f"No one_hot info found for key {k}")

            segment_size = self.emb_models[k]["one_hot"].shape[1]

            if segment_start + segment_size > len(current_columns):
                raise IndexError(
                    f"Segment for key '{k}' exceeds the number of columns available.\n"
                    f"segment_start: {segment_start}, segment_size: {segment_size}, total_cols: {len(current_columns)}"
                )

            one_hot_cols = current_columns[segment_start : segment_start + segment_size]

            # Sanity check that these columns exist in the DataFrame
            for col in one_hot_cols:
                if (
                    col not in self.s1_training_data_X.columns
                    or col not in self.s1_validation_data_X.columns
                ):
                    raise KeyError(f"Column {col} expected but not found in data.")

            # Extract the one-hot segments from the stage-1 frames.
            train_segment = self.s1_training_data_X[one_hot_cols]
            valid_segment = self.s1_validation_data_X[one_hot_cols]

            # Run the per-key embedding model over each segment.
            train_embedded = self.emb_models[k]["model"].predict(train_segment)
            valid_embedded = self.emb_models[k]["model"].predict(valid_segment)

            print(f"Train embedded shape for {k}: {train_embedded.shape}")
            print(f"Validation embedded shape for {k}: {valid_embedded.shape}")

            # Check embedding shapes against the requested size `v`.
            if train_embedded.shape[1] != v:
                raise ValueError(
                    f"Expected embeddings of size {v}, but got {train_embedded.shape[1]} for train data."
                )
            if valid_embedded.shape[1] != v:
                raise ValueError(
                    f"Expected embeddings of size {v}, but got {valid_embedded.shape[1]} for validation data."
                )

            # Create new column names for the embedded features
            embedded_col_names = [f"{k}_{i}" for i in range(v)]

            # Create DataFrames for the embedded features
            train_embedded_df = pd.DataFrame(
                train_embedded,
                index=self.s2_training_data_X.index,
                columns=embedded_col_names,
            )
            valid_embedded_df = pd.DataFrame(
                valid_embedded,
                index=self.s2_validation_data_X.index,
                columns=embedded_col_names,
            )

            # Rebuild the DataFrames with the embedded columns replacing the one-hot columns
            train_before = self.s2_training_data_X.iloc[:, :segment_start]
            train_after = self.s2_training_data_X.iloc[
                :, segment_start + segment_size :
            ]
            valid_before = self.s2_validation_data_X.iloc[:, :segment_start]
            valid_after = self.s2_validation_data_X.iloc[
                :, segment_start + segment_size :
            ]

            # Construct new DataFrames rather than modifying in place
            new_data_train_X = pd.concat(
                [train_before, train_embedded_df, train_after], axis=1
            )
            new_validation_data_X = pd.concat(
                [valid_before, valid_embedded_df, valid_after], axis=1
            )

            # Ensure consistent dtypes
            new_data_train_X = new_data_train_X.astype(np.float32, copy=False)
            new_validation_data_X = new_validation_data_X.astype(np.float32, copy=False)

            # Update the main DataFrames and the list of columns
            self.s2_training_data_X = new_data_train_X
            self.s2_validation_data_X = new_validation_data_X
            current_columns = list(self.s2_training_data_X.columns)

            # Category id -1 is the synthetic "unknown" bucket injected in
            # create_one_hot_encoding; pull it out as the default embedding.
            default = embeddings_dict[k].pop(-1)

            # Append transformations: map raw category -> embedded columns,
            # then delete the raw column.
            transformation_list.append(
                {
                    "from": k,
                    "to": embedded_col_names,
                    "map": embeddings_dict[k],
                    "size": v,
                    "default": default,
                }
            )

            transformation_list.append({"delete": k})

            # Advance past this key's block: the `segment_size` one-hot
            # columns were just replaced by `v` embedding columns starting at
            # the same position, so the next segment begins `v` columns later.
            segment_start = segment_start + v

        print("Final Data Train Columns:", self.s2_training_data_X.columns)
        print("Final Validation Data Columns:", self.s2_validation_data_X.columns)

        transformation_model, new_features = (
            create_instructions_model_from_transformation_list(
                self.initial_input_features, transformation_list
            )
        )

        self.s2_input_features = new_features
        self.s2_output_features = self.initial_output_features
        self.s2_training_data_y = self.s1_training_data_y
        self.s2_validation_data_y = self.s1_validation_data_y
        return transformation_model

    def create_one_hot_encoding(self):
        """Build stage-1 data: augment training rows with 'unknown' samples,
        one-hot encode each mapped column, and create one embedding sub-model
        per key (stored in ``self.emb_models``).
        """
        self.emb_models = {}
        maps = self.mapping
        training_data_X = self.original_training_data[
            self.initial_input_features
        ].copy()
        training_data_y = self.original_training_data[
            self.initial_output_features
        ].copy()
        validation_data_X = self.original_validation_data[
            self.initial_input_features
        ].copy()

        # For each mapped key, duplicate a sampled fraction of the training
        # rows with that key forced to -1.0 so the model sees an explicit
        # "unknown category" value during training.
        new_data_x = []
        new_data_y = []
        for k in self.map_key_sorted:
            subset_X = training_data_X.sample(frac=self.fraction).copy()
            subset_y = training_data_y.loc[subset_X.index].copy()

            # Modify the subset: mark the category as unknown.
            subset_X[k] = -1.0

            # Append the subset to the lists
            new_data_x.append(subset_X)
            new_data_y.append(subset_y)

        # Extend data_train_X and data_train_y with the new data
        training_data_X = pd.concat(
            [training_data_X] + new_data_x, ignore_index=True
        )  # Ensure consistent indexing
        training_data_y = pd.concat(
            [training_data_y] + new_data_y, ignore_index=True
        )  # Ensure consistent indexing

        for k in self.map_key_sorted:
            v = maps[k]

            one_hot = pd.get_dummies(training_data_X[k], prefix=k, dtype=np.float32)
            one_hot_columns = one_hot.columns

            # The -1.0 "unknown" bucket must exist (guaranteed by the
            # augmentation loop above, which always injects -1.0 rows).
            assert f"{k}_-1.0" in one_hot_columns

            training_data_ids = set(training_data_X[k].unique())
            validation_data_ids = set(validation_data_X[k].unique())

            unique_validation_data_ids = validation_data_ids - training_data_ids

            # replace the unique_validation_data_ids (unknown) with -1
            validation_data_X.loc[
                validation_data_X[k].isin(unique_validation_data_ids), k
            ] = -1.0

            # NOTE(review): the three recomputations below are never read
            # again — they look like leftover debugging and could be removed.
            training_data_ids = set(training_data_X[k].unique())
            validation_data_ids = set(validation_data_X[k].unique())

            unique_validation_data_ids = (
                set(validation_data_X[k].unique()) - training_data_ids
            )

            print(
                f"Traning data x had {len(training_data_X.columns)} cols:\n{training_data_X.columns}"
            )
            print(f"Adding one hot {len(one_hot.columns)} columns and dropping {k}")

            # Replace the raw column with its one-hot expansion (appended at
            # the end of the frame, which fixes the segment layout used later).
            training_data_X = training_data_X.join(one_hot)
            training_data_X = training_data_X.drop(k, axis=1)

            print(
                f"Traning data x now has {len(training_data_X.columns)} cols:\n{training_data_X.columns}"
            )

            validation_data_X = one_hot_encode(validation_data_X, one_hot, k)

            # Create model for embedding the one-hot columns: a single dense
            # projection from one-hot width down to embedding size `v`.
            inputs = InputBuffer(one_hot.shape[1])
            outputs = Dense(v, activation="sigmoid")(inputs)
            model = ModelGraph(inputs, outputs)

            self.emb_models[k] = {
                "model": model,
                "one_hot": one_hot,
            }

        # Guard against a reserved column name leaking into the features.
        assert "complete" not in training_data_X.columns

        self.s1_input_features = training_data_X.columns
        self.s1_output_features = training_data_y.columns
        self.s1_training_data_X = training_data_X
        self.s1_training_data_y = training_data_y
        self.s1_validation_data_X = validation_data_X
        self.s1_validation_data_y = self.original_validation_data[
            self.initial_output_features
        ].copy()

    def create_emb_step(self):
        """Assemble one graph over the full stage-1 input: pass the non-mapped
        features through unchanged and route each one-hot segment through its
        per-key embedding model, concatenating everything into one output.
        """
        full_inputs = InputBuffer(self.s1_training_data_X.shape[1])
        # Non-mapped features occupy the leading columns (same layout as in
        # create_instruction_model).
        data_index = len(self.initial_input_features) - len(self.mapping)

        preprocessed_outputs = [full_inputs[:data_index]]

        # Segments appear in map_key_sorted order, matching the join order
        # used in create_one_hot_encoding.
        for k in self.map_key_sorted:
            segment_size = self.emb_models[k]["one_hot"].shape[1]
            preprocessed_outputs.append(
                self.emb_models[k]["model"](
                    full_inputs[data_index : data_index + segment_size]
                )
            )
            data_index += segment_size

        # Every stage-1 column must be consumed exactly once.
        assert data_index == self.s1_training_data_X.shape[1]

        full_preprocessed_outputs = Concatenate()(preprocessed_outputs)

        self.apply_smart_embeddings = ModelGraph(full_inputs, full_preprocessed_outputs)

        # Keras-level version of the same graph.
        self.apply_embeddings = self.apply_smart_embeddings.get_keras()

        self.step1_size = data_index
        # `.os` is presumably the node's output size — TODO confirm against
        # the `model` module's API.
        self.step2_size = full_preprocessed_outputs.os