instmodel 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- instmodel-0.1.0/MANIFEST.in +1 -0
- instmodel-0.1.0/PKG-INFO +17 -0
- instmodel-0.1.0/README.md +1 -0
- instmodel-0.1.0/instmodel/__init__.py +4 -0
- instmodel-0.1.0/instmodel/embeddings.py +359 -0
- instmodel-0.1.0/instmodel/instruction_model.py +377 -0
- instmodel-0.1.0/instmodel/model.py +872 -0
- instmodel-0.1.0/instmodel/training_utils.py +21 -0
- instmodel-0.1.0/instmodel.egg-info/PKG-INFO +17 -0
- instmodel-0.1.0/instmodel.egg-info/SOURCES.txt +16 -0
- instmodel-0.1.0/instmodel.egg-info/dependency_links.txt +1 -0
- instmodel-0.1.0/instmodel.egg-info/requires.txt +5 -0
- instmodel-0.1.0/instmodel.egg-info/top_level.txt +1 -0
- instmodel-0.1.0/pyproject.toml +8 -0
- instmodel-0.1.0/setup.cfg +26 -0
- instmodel-0.1.0/setup.py +6 -0
- instmodel-0.1.0/tests/test_model.py +258 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# MANIFEST.in
|
instmodel-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: instmodel
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A package that provides advanced coding tools for training and deploying AI
|
|
5
|
+
Home-page:
|
|
6
|
+
Author: Joao Ferreira
|
|
7
|
+
Author-email: joaoprcf@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: tensorflow==2.17.0
|
|
12
|
+
Requires-Dist: numpy
|
|
13
|
+
Requires-Dist: pandas
|
|
14
|
+
Requires-Dist: scipy
|
|
15
|
+
Requires-Dist: scikit-learn
|
|
16
|
+
|
|
17
|
+
This package allows the easy creation of training models with a Keras backend and lets you deploy them directly as instruction models
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
This package allows the easy creation of training models with a Keras backend and lets you deploy them directly as instruction models
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
from .instruction_model import create_instructions_model_from_transformation_list
|
|
5
|
+
from .model import (
|
|
6
|
+
InputBuffer,
|
|
7
|
+
Dense,
|
|
8
|
+
Concatenate,
|
|
9
|
+
ModelGraph,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def one_hot_encode(data, one_hot, k):
    """One-hot encode column *k* of *data* using the training-time column layout.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing a raw categorical column named *k*.
    one_hot : pd.DataFrame
        The training-time ``pd.get_dummies`` output; its columns define the
        exact one-hot layout the encoded data must match.
    k : str
        Name of the categorical column to encode (dropped from the result).

    Returns
    -------
    pd.DataFrame
        *data* with column *k* replaced by one-hot columns aligned to
        ``one_hot.columns``.

    Raises
    ------
    ValueError
        If *data* contains category values that were never seen in training.
    """
    one_hot_columns = one_hot.columns

    raw_one_hot = pd.get_dummies(data[k], prefix=k, dtype=np.float32)

    # Check for unseen categories BEFORE reindexing: reindex() below forces
    # the columns to exactly match the training layout, so a check performed
    # afterwards could never fire and unseen IDs would be silently dropped.
    unknown = set(raw_one_hot.columns) - set(one_hot_columns)
    if unknown:
        raise ValueError(
            f"Validation set contains IDs not present in training set: {unknown}"
        )

    # Align to the training layout; categories absent from *data* get 0.
    val_one_hot = raw_one_hot.reindex(columns=one_hot_columns, fill_value=0)

    assert val_one_hot.shape[1] == one_hot.shape[1]
    assert (val_one_hot.columns == one_hot.columns).all()

    data = data.join(val_one_hot)
    data = data.drop(k, axis=1)

    return data
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def embbed_predict(data, one_hot, model):
    """Predict embeddings for the rows of *data* from their one-hot columns.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the one-hot columns listed in ``one_hot.columns``.
    one_hot : pd.DataFrame
        Training-time one-hot layout (defines column order and count).
    model
        Model exposing ``predict_on_batch`` (see ``embbed_predict_idx``).

    Returns
    -------
    The batch of embedding vectors produced by *model*.
    """
    one_hot_columns = one_hot.columns

    # idxmax(axis=1) yields column *labels*, but embbed_predict_idx indexes
    # a numpy array positionally (one_hot[np.arange(n), idxs]) — string
    # labels there would fail. Translate labels to positional indices.
    labels = data[one_hot_columns].idxmax(axis=1)
    col_pos = {col: i for i, col in enumerate(one_hot_columns)}
    idxs = labels.map(col_pos).to_numpy()

    return embbed_predict_idx(idxs, one_hot.shape[1], model)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def embbed_predict_idx(idxs, one_hot_len, model):
    """Run *model* on one-hot row(s) built from integer position(s) *idxs*.

    A single ``int`` produces one one-hot row and returns the single
    embedding vector; a sequence of ints produces a batch and returns the
    batch of embeddings.
    """
    single = isinstance(idxs, int)
    n_rows = 1 if single else len(idxs)

    # Build the one-hot batch, then set the hot positions.
    batch = np.zeros((n_rows, one_hot_len), dtype=np.float32)
    if single:
        batch[0, idxs] = 1.0
        return model.predict_on_batch(batch)[0]

    batch[np.arange(n_rows), idxs] = 1.0
    return model.predict_on_batch(batch)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def get_embeddings_dict(emb_models):
    """Build ``{feature_key: {category_id: embedding_vector}}`` from sub-models.

    *emb_models* maps each categorical feature key to a dict with a trained
    ``"model"`` and its ``"one_hot"`` training layout. For every one-hot
    column (named ``<key>_<category>``), the column's category id is parsed
    from the name and the model is queried for that position's embedding.
    Keys are processed in sorted order.
    """
    embeddings_dict = {}

    for key in sorted(emb_models.keys()):
        entry = emb_models[key]
        sub_model = entry["model"]
        one_hot_df = entry["one_hot"]
        prefix = key + "_"

        per_category = {}
        for position, col in enumerate(one_hot_df.columns):
            if not col.startswith(prefix):
                raise ValueError(
                    f"Column '{col}' does not start with expected prefix '{prefix}'"
                )

            # Strip "<key>_" and recover the integer category id.
            category_id = round(float(col[len(prefix):]))

            per_category[category_id] = embbed_predict_idx(
                position, one_hot_df.shape[1], sub_model
            ).tolist()

        embeddings_dict[key] = per_category

    return embeddings_dict
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class EmbeddingWrapper:
    """Pipeline that learns per-category embeddings for selected columns.

    Given training/validation DataFrames, a feature split, and a *mapping*
    of ``{categorical_column: embedding_size}``, this class:

    1. one-hot encodes each mapped column and creates a small
       ``InputBuffer -> Dense(sigmoid)`` sub-model per column
       (``create_one_hot_encoding``);
    2. builds a single graph that applies all sub-models to the one-hot
       segments of the full input (``create_emb_step``);
    3. can export the learned embeddings as an instruction model plus the
       embedded (stage-2) datasets (``create_instruction_model``).

    Attribute naming convention observed in the code: ``s1_*`` holds the
    one-hot-encoded stage, ``s2_*`` the embedded stage.
    """

    def __init__(
        self,
        training_data,
        validation_data,
        input_features,
        output_features,
        mapping,
        fraction=1.0,
    ):
        """Copy the inputs and immediately run the two preparation steps.

        Parameters
        ----------
        training_data, validation_data : pd.DataFrame
            Source frames; both are copied, originals are not mutated.
        input_features, output_features : sequence of column names
            X / y column selections. The mapped categorical columns are
            expected to be the *last* entries of ``input_features`` (see
            ``data_index`` arithmetic below).
        mapping : dict
            ``{categorical_column_name: embedding_size}``.
        fraction : float
            Fraction of training rows duplicated with the category masked
            to ``-1.0`` so an "unknown category" embedding gets trained.
        """
        assert isinstance(training_data, pd.DataFrame) and isinstance(
            validation_data, pd.DataFrame
        )
        # Deterministic processing order for the mapped columns.
        self.map_key_sorted = sorted(mapping.keys())
        self.original_training_data = training_data.copy()
        self.original_validation_data = validation_data.copy()
        self.mapping = mapping
        self.initial_input_features = input_features
        self.initial_output_features = output_features
        self.fraction = fraction

        self.create_one_hot_encoding()
        self.create_emb_step()

    def create_instruction_model(self):
        """Export learned embeddings as an instruction model.

        Replaces each one-hot segment of the stage-1 data with its model's
        embedding output (producing ``s2_*`` datasets) and builds a
        transformation list that is compiled by
        ``create_instructions_model_from_transformation_list``.

        Returns the compiled transformation model.
        """
        embeddings_dict = get_embeddings_dict(self.emb_models)

        transformation_list = []

        # The index where one-hot encoded columns start, based on how you constructed 'features'
        # (non-mapped features first, then one segment per mapped column).
        data_index = len(self.initial_input_features) - len(self.mapping)
        segment_start = data_index

        current_columns = list(self.s1_training_data_X.columns)
        self.s2_training_data_X = self.s1_training_data_X.copy()
        self.s2_validation_data_X = self.s1_validation_data_X.copy()

        # NOTE(review): iteration here follows self.mapping insertion order,
        # but the one-hot segments were laid out in sorted-key order
        # (map_key_sorted) by create_one_hot_encoding. If the mapping's
        # insertion order is not already sorted, segment_start may point at
        # the wrong segment — confirm against callers.
        for k, v in self.mapping.items():
            # Validate segment_start and segment_size
            if k not in self.emb_models or "one_hot" not in self.emb_models[k]:
                raise ValueError(f"No one_hot info found for key {k}")

            segment_size = self.emb_models[k]["one_hot"].shape[1]

            if segment_start + segment_size > len(current_columns):
                raise IndexError(
                    f"Segment for key '{k}' exceeds the number of columns available.\n"
                    f"segment_start: {segment_start}, segment_size: {segment_size}, total_cols: {len(current_columns)}"
                )

            one_hot_cols = current_columns[segment_start : segment_start + segment_size]

            # Sanity check that these columns exist in the DataFrame
            for col in one_hot_cols:
                if (
                    col not in self.s1_training_data_X.columns
                    or col not in self.s1_validation_data_X.columns
                ):
                    raise KeyError(f"Column {col} expected but not found in data.")

            # Extract the segments
            train_segment = self.s1_training_data_X[one_hot_cols]
            valid_segment = self.s1_validation_data_X[one_hot_cols]

            # Predict the embeddings using embeddings_dict
            train_embedded = self.emb_models[k]["model"].predict(train_segment)
            valid_embedded = self.emb_models[k]["model"].predict(valid_segment)

            print(f"Train embedded shape for {k}: {train_embedded.shape}")
            print(f"Validation embedded shape for {k}: {valid_embedded.shape}")

            # Check embedding shapes
            if train_embedded.shape[1] != v:
                raise ValueError(
                    f"Expected embeddings of size {v}, but got {train_embedded.shape[1]} for train data."
                )
            if valid_embedded.shape[1] != v:
                raise ValueError(
                    f"Expected embeddings of size {v}, but got {valid_embedded.shape[1]} for validation data."
                )

            # Create new column names for the embedded features
            embedded_col_names = [f"{k}_{i}" for i in range(v)]

            # Create DataFrames for the embedded features
            train_embedded_df = pd.DataFrame(
                train_embedded,
                index=self.s2_training_data_X.index,
                columns=embedded_col_names,
            )
            valid_embedded_df = pd.DataFrame(
                valid_embedded,
                index=self.s2_validation_data_X.index,
                columns=embedded_col_names,
            )

            # Rebuild the DataFrames with the embedded columns replacing the one-hot columns
            train_before = self.s2_training_data_X.iloc[:, :segment_start]
            train_after = self.s2_training_data_X.iloc[
                :, segment_start + segment_size :
            ]
            valid_before = self.s2_validation_data_X.iloc[:, :segment_start]
            valid_after = self.s2_validation_data_X.iloc[
                :, segment_start + segment_size :
            ]

            # Construct new DataFrames rather than modifying in place
            new_data_train_X = pd.concat(
                [train_before, train_embedded_df, train_after], axis=1
            )
            new_validation_data_X = pd.concat(
                [valid_before, valid_embedded_df, valid_after], axis=1
            )

            # Ensure consistent dtypes
            new_data_train_X = new_data_train_X.astype(np.float32, copy=False)
            new_validation_data_X = new_validation_data_X.astype(np.float32, copy=False)

            # Update the main DataFrames and the list of columns
            self.s2_training_data_X = new_data_train_X
            self.s2_validation_data_X = new_validation_data_X
            current_columns = list(self.s2_training_data_X.columns)

            # Category -1 is the "unknown" bucket trained via the masked rows;
            # its embedding becomes the transformation's default vector.
            default = embeddings_dict[k].pop(-1)

            # Append transformations
            transformation_list.append(
                {
                    "from": k,
                    "to": embedded_col_names,
                    "map": embeddings_dict[k],
                    "size": v,
                    "default": default,
                }
            )

            transformation_list.append({"delete": k})

            # Update segment_start to jump over the newly embedded columns
            # Originally we had one-hot columns replaced by 'v' embeddings:
            # So we move forward by `v - segment_size` because we've effectively
            # replaced `segment_size` columns with `v` columns, and these columns
            # start at the same segment_start position.
            segment_start = segment_start + v

        print("Final Data Train Columns:", self.s2_training_data_X.columns)
        print("Final Validation Data Columns:", self.s2_validation_data_X.columns)

        transformation_model, new_features = (
            create_instructions_model_from_transformation_list(
                self.initial_input_features, transformation_list
            )
        )

        self.s2_input_features = new_features
        self.s2_output_features = self.initial_output_features
        self.s2_training_data_y = self.s1_training_data_y
        self.s2_validation_data_y = self.s1_validation_data_y
        return transformation_model

    def create_one_hot_encoding(self):
        """One-hot encode every mapped column and create its embedding sub-model.

        Side effects: populates ``self.emb_models`` and the ``s1_*``
        attributes (stage-1 datasets and feature lists).
        """
        self.emb_models = {}
        maps = self.mapping
        training_data_X = self.original_training_data[
            self.initial_input_features
        ].copy()
        training_data_y = self.original_training_data[
            self.initial_output_features
        ].copy()
        validation_data_X = self.original_validation_data[
            self.initial_input_features
        ].copy()

        # For each mapped column, duplicate a sampled fraction of the training
        # rows with that column masked to -1.0, so the model learns an
        # "unknown category" embedding.
        new_data_x = []
        new_data_y = []
        for k in self.map_key_sorted:
            subset_X = training_data_X.sample(frac=self.fraction).copy()
            subset_y = training_data_y.loc[subset_X.index].copy()

            # Modify the subset
            subset_X[k] = -1.0

            # Append the subset to the lists
            new_data_x.append(subset_X)
            new_data_y.append(subset_y)

        # Extend data_train_X and data_train_y with the new data
        training_data_X = pd.concat(
            [training_data_X] + new_data_x, ignore_index=True
        )  # Ensure consistent indexing
        training_data_y = pd.concat(
            [training_data_y] + new_data_y, ignore_index=True
        )  # Ensure consistent indexing

        for k in self.map_key_sorted:
            v = maps[k]

            one_hot = pd.get_dummies(training_data_X[k], prefix=k, dtype=np.float32)
            one_hot_columns = one_hot.columns

            # The masked rows above guarantee a -1.0 category exists; this
            # assumes column k holds float values (so dummies name it "k_-1.0").
            assert f"{k}_-1.0" in one_hot_columns

            training_data_ids = set(training_data_X[k].unique())
            validation_data_ids = set(validation_data_X[k].unique())

            unique_validation_data_ids = validation_data_ids - training_data_ids

            # replace the unique_validation_data_ids (unknown) with -1
            validation_data_X.loc[
                validation_data_X[k].isin(unique_validation_data_ids), k
            ] = -1.0

            # Recompute after the replacement (unique_validation_data_ids is
            # recomputed but not used again below).
            training_data_ids = set(training_data_X[k].unique())
            validation_data_ids = set(validation_data_X[k].unique())

            unique_validation_data_ids = (
                set(validation_data_X[k].unique()) - training_data_ids
            )

            print(
                f"Traning data x had {len(training_data_X.columns)} cols:\n{training_data_X.columns}"
            )
            print(f"Adding one hot {len(one_hot.columns)} columns and dropping {k}")

            training_data_X = training_data_X.join(one_hot)
            training_data_X = training_data_X.drop(k, axis=1)

            print(
                f"Traning data x now has {len(training_data_X.columns)} cols:\n{training_data_X.columns}"
            )

            validation_data_X = one_hot_encode(validation_data_X, one_hot, k)

            # Create model for embedding the one-hot columns
            # (InputBuffer/Dense/ModelGraph come from .model).
            inputs = InputBuffer(one_hot.shape[1])
            outputs = Dense(v, activation="sigmoid")(inputs)
            model = ModelGraph(inputs, outputs)

            self.emb_models[k] = {
                "model": model,
                "one_hot": one_hot,
            }

        # Guard against a reserved column name colliding with the pipeline.
        assert "complete" not in training_data_X.columns

        self.s1_input_features = training_data_X.columns
        self.s1_output_features = training_data_y.columns
        self.s1_training_data_X = training_data_X
        self.s1_training_data_y = training_data_y
        self.s1_validation_data_X = validation_data_X
        self.s1_validation_data_y = self.original_validation_data[
            self.initial_output_features
        ].copy()

    def create_emb_step(self):
        """Build one graph applying every sub-model to its one-hot segment.

        Passes the leading non-mapped features through untouched, routes each
        one-hot segment through its embedding sub-model, and concatenates
        everything. Side effects: sets ``apply_smart_embeddings`` (graph),
        ``apply_embeddings`` (its Keras form), ``step1_size`` and
        ``step2_size``.
        """
        full_inputs = InputBuffer(self.s1_training_data_X.shape[1])
        # Non-mapped features occupy the first data_index positions.
        data_index = len(self.initial_input_features) - len(self.mapping)

        preprocessed_outputs = [full_inputs[:data_index]]

        # Segments follow in sorted-key order, matching create_one_hot_encoding.
        for k in self.map_key_sorted:
            segment_size = self.emb_models[k]["one_hot"].shape[1]
            preprocessed_outputs.append(
                self.emb_models[k]["model"](
                    full_inputs[data_index : data_index + segment_size]
                )
            )
            data_index += segment_size

        # All input columns must be consumed exactly once.
        assert data_index == self.s1_training_data_X.shape[1]

        full_preprocessed_outputs = Concatenate()(preprocessed_outputs)

        self.apply_smart_embeddings = ModelGraph(full_inputs, full_preprocessed_outputs)

        self.apply_embeddings = self.apply_smart_embeddings.get_keras()

        self.step1_size = data_index
        # .os is presumably the node's output size — TODO confirm in .model.
        self.step2_size = full_preprocessed_outputs.os
|