virgo-modules 0.0.75__py3-none-any.whl → 0.0.76__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of virgo-modules might be problematic. Click here for more details.
- virgo_modules/src/aws_utils.py +34 -2
- virgo_modules/src/edge_utils.py +200 -5
- virgo_modules/src/re_utils.py +360 -54
- virgo_modules/src/ticketer_source.py +1418 -256
- {virgo_modules-0.0.75.dist-info → virgo_modules-0.0.76.dist-info}/METADATA +1 -1
- virgo_modules-0.0.76.dist-info/RECORD +12 -0
- virgo_modules-0.0.75.dist-info/RECORD +0 -12
- {virgo_modules-0.0.75.dist-info → virgo_modules-0.0.76.dist-info}/LICENSE +0 -0
- {virgo_modules-0.0.75.dist-info → virgo_modules-0.0.76.dist-info}/WHEEL +0 -0
- {virgo_modules-0.0.75.dist-info → virgo_modules-0.0.76.dist-info}/top_level.txt +0 -0
virgo_modules/src/aws_utils.py
CHANGED
|
@@ -6,14 +6,36 @@ import pandas as pd
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def upload_file_to_aws(bucket,key,input_path, aws_credentials):
|
|
9
|
-
|
|
9
|
+
'''
|
|
10
|
+
upload file from a folder to an s3 folder
|
|
11
|
+
|
|
12
|
+
Parameters:
|
|
13
|
+
bucket (str): key in aws_credentials whose value is the bucket name
|
|
14
|
+
key (str): key pattern or folder in s3 e.g. path/to/upload/
|
|
15
|
+
input_path (str): input path of the file to upload e.g. path/to/upload.txt
|
|
16
|
+
aws_credentials (dict): aws credentials dictionary
|
|
17
|
+
|
|
18
|
+
Returns:
|
|
19
|
+
None
|
|
20
|
+
'''
|
|
10
21
|
session = boto3.Session(aws_access_key_id=aws_credentials['AWS_ACCESS_KEY_ID'],aws_secret_access_key=aws_credentials['AWS_SECRET_ACCESS_KEY'])
|
|
11
22
|
bucket = aws_credentials[bucket]
|
|
12
23
|
s3 = session.resource('s3')
|
|
13
24
|
s3.meta.client.upload_file(Filename=input_path , Bucket=bucket, Key=key)
|
|
14
25
|
|
|
15
26
|
def upload_pandas_to_s3(data_frame,bucket,key, aws_credentials):
|
|
27
|
+
'''
|
|
28
|
+
upload dataframe as csv to an s3 folder
|
|
16
29
|
|
|
30
|
+
Parameters:
|
|
31
|
+
data_frame (pd.DataFrame): data
|
|
32
|
+
bucket (str): bucket name
|
|
33
|
+
key (str): key pattern or folder in s3 e.g. path/to/upload/
|
|
34
|
+
aws_credentials (dict): aws credentials dictionary
|
|
35
|
+
|
|
36
|
+
Returns:
|
|
37
|
+
None
|
|
38
|
+
'''
|
|
17
39
|
csv_buffer = StringIO()
|
|
18
40
|
data_frame.to_csv(csv_buffer)
|
|
19
41
|
csv_buffer.seek(0)
|
|
@@ -23,7 +45,17 @@ def upload_pandas_to_s3(data_frame,bucket,key, aws_credentials):
|
|
|
23
45
|
s3.put_object(Bucket=bucket, Body=csv_buffer.getvalue(), Key= key)
|
|
24
46
|
|
|
25
47
|
def download_file_to_aws(bucket,key, aws_credentials):
|
|
26
|
-
|
|
48
|
+
'''
|
|
49
|
+
download csv file from s3 folder
|
|
50
|
+
|
|
51
|
+
Parameters:
|
|
52
|
+
bucket (str): bucket name
|
|
53
|
+
key (str): key pattern or folder in s3 e.g. path/to/download/file.csv
|
|
54
|
+
aws_credentials (dict): aws credentials dictionary
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
None
|
|
58
|
+
'''
|
|
27
59
|
s3c = boto3.client(
|
|
28
60
|
's3',
|
|
29
61
|
region_name = aws_credentials['AWS_DEFAULT_REGION'],
|
virgo_modules/src/edge_utils.py
CHANGED
|
@@ -12,11 +12,54 @@ from feature_engine.discretisation import EqualWidthDiscretiser
|
|
|
12
12
|
from .ticketer_source import VirgoWinsorizerFeature, InverseHyperbolicSine
|
|
13
13
|
|
|
14
14
|
class produce_model_wrapper:
|
|
15
|
+
"""
|
|
16
|
+
class that wraps a pipeline and a machine learning model. It also provides train/validation data splitting
|
|
17
|
+
|
|
18
|
+
Attributes
|
|
19
|
+
----------
|
|
20
|
+
data : pd.DataFrame
|
|
21
|
+
copy of the input dataset used to build the train and validation sets
|
|
22
|
+
X_train : pd.DataFrame
|
|
23
|
+
y_train : pd.DataFrame
|
|
24
|
+
X_val : pd.DataFrame
|
|
25
|
+
y_val : pd.DataFrame
|
|
26
|
+
self.pipeline: obj
|
|
27
|
+
sklearn pipeline combining the transformation pipeline and the model
|
|
28
|
+
|
|
29
|
+
Methods
|
|
30
|
+
-------
|
|
31
|
+
preprocess(validation_size=int, target=list):
|
|
32
|
+
ingest data and split data between train and validation data and X and Y data
|
|
33
|
+
train_model(pipe=obj, model=obj, cv_=boolean):
|
|
34
|
+
merge and train pipeline and machine learning model
|
|
35
|
+
"""
|
|
15
36
|
def __init__(self,data):
|
|
37
|
+
"""
|
|
38
|
+
Initialize object
|
|
39
|
+
|
|
40
|
+
Parameters
|
|
41
|
+
----------
|
|
42
|
+
data (pd.DataFrame): data
|
|
43
|
+
|
|
44
|
+
Returns
|
|
45
|
+
-------
|
|
46
|
+
None
|
|
47
|
+
"""
|
|
16
48
|
self.data = data.copy()
|
|
17
49
|
|
|
18
50
|
def preprocess(self, validation_size, target):
|
|
19
|
-
|
|
51
|
+
"""
|
|
52
|
+
ingest data and split data between train and validation data and X and Y data
|
|
53
|
+
|
|
54
|
+
Parameters
|
|
55
|
+
----------
|
|
56
|
+
validation_size (int): number of most recent distinct dates that mark the validation period; rows dated earlier are taken as training data
|
|
57
|
+
target (list): target column list
|
|
58
|
+
|
|
59
|
+
Returns
|
|
60
|
+
-------
|
|
61
|
+
None
|
|
62
|
+
"""
|
|
20
63
|
val_date = self.data.groupby('Date', as_index = False).agg(target_down = (target[0],'count')).sort_values('Date').iloc[-validation_size:,].head(1)['Date'].values[0]
|
|
21
64
|
|
|
22
65
|
train_data = self.data[self.data['Date'] < val_date].dropna()
|
|
@@ -31,6 +74,18 @@ class produce_model_wrapper:
|
|
|
31
74
|
self.y_val = y_val
|
|
32
75
|
|
|
33
76
|
def train_model(self, pipe, model, cv_ = False):
|
|
77
|
+
"""
|
|
78
|
+
merge and train pipeline and machine learning model
|
|
79
|
+
|
|
80
|
+
Parameters
|
|
81
|
+
----------
|
|
82
|
+
pipe (obj): sklearn pipeline object with the transformation steps
|
|
83
|
+
model (obj): machine learning model appended as the last step of the pipeline
|
|
84
|
+
|
|
85
|
+
Returns
|
|
86
|
+
-------
|
|
87
|
+
None
|
|
88
|
+
"""
|
|
34
89
|
self.model = model
|
|
35
90
|
self.pipe_transform = pipe
|
|
36
91
|
self.pipeline = Pipeline([('pipe_transform',self.pipe_transform), ('model',self.model)])
|
|
@@ -38,11 +93,53 @@ class produce_model_wrapper:
|
|
|
38
93
|
self.pipeline.fit(self.X_train, self.y_train)
|
|
39
94
|
|
|
40
95
|
class register_results():
|
|
96
|
+
"""
|
|
97
|
+
class that collects model metrics
|
|
98
|
+
|
|
99
|
+
Attributes
|
|
100
|
+
----------
|
|
101
|
+
model_name : str
|
|
102
|
+
model name
|
|
103
|
+
metric_logger : dict
|
|
104
|
+
dictionary that collects model metrics
|
|
105
|
+
|
|
106
|
+
Methods
|
|
107
|
+
-------
|
|
108
|
+
eval_metrics(pipeline=obj, X=pd.DataFrame, y=pd.DataFrame, type_data=str, phase=str):
|
|
109
|
+
register model metrics
|
|
110
|
+
print_metric_logger():
|
|
111
|
+
print logger results
|
|
112
|
+
"""
|
|
41
113
|
def __init__(self, model_name):
|
|
114
|
+
"""
|
|
115
|
+
Initialize object
|
|
116
|
+
|
|
117
|
+
Parameters
|
|
118
|
+
----------
|
|
119
|
+
model_name (str): model name
|
|
120
|
+
|
|
121
|
+
Returns
|
|
122
|
+
-------
|
|
123
|
+
None
|
|
124
|
+
"""
|
|
42
125
|
self.model_name = model_name
|
|
43
126
|
self.metric_logger = dict()
|
|
44
127
|
def eval_metrics(self, pipeline, X, y, type_data, phase):
|
|
45
|
-
|
|
128
|
+
"""
|
|
129
|
+
register model metrics
|
|
130
|
+
|
|
131
|
+
Parameters
|
|
132
|
+
----------
|
|
133
|
+
pipeline (obj): model pipeline
|
|
134
|
+
X (pd.DataFrame): input data
|
|
135
|
+
y (pd.DataFrame): target data
|
|
136
|
+
type_data (str): data type, either train, test or validation
|
|
137
|
+
phase (str): model phase, either baseline, feature selection or tuned model
|
|
138
|
+
|
|
139
|
+
Returns
|
|
140
|
+
-------
|
|
141
|
+
None
|
|
142
|
+
"""
|
|
46
143
|
preds_proba = pipeline.predict_proba(X)
|
|
47
144
|
preds = pipeline.predict(X)
|
|
48
145
|
|
|
@@ -56,6 +153,17 @@ class register_results():
|
|
|
56
153
|
self.metric_logger[f'{phase}//{self.model_name}//{type_data}'] = {'roc':roc, 'precision':precision, 'recall':recall}
|
|
57
154
|
|
|
58
155
|
def print_metric_logger(self):
|
|
156
|
+
"""
|
|
157
|
+
print logger results
|
|
158
|
+
|
|
159
|
+
Parameters
|
|
160
|
+
----------
|
|
161
|
+
None
|
|
162
|
+
|
|
163
|
+
Returns
|
|
164
|
+
-------
|
|
165
|
+
None
|
|
166
|
+
"""
|
|
59
167
|
parts = list(self.metric_logger.keys())
|
|
60
168
|
phase_parts = [ x.split('//')[0] for x in parts]
|
|
61
169
|
|
|
@@ -74,7 +182,19 @@ class register_results():
|
|
|
74
182
|
|
|
75
183
|
|
|
76
184
|
def eval_metrics(pipeline, X, y, type_data, model_name):
|
|
77
|
-
|
|
185
|
+
'''
|
|
186
|
+
print metrics from a model pipeline
|
|
187
|
+
|
|
188
|
+
Parameters:
|
|
189
|
+
pipeline (obj): model pipeline
|
|
190
|
+
X (pd.DataFrame): input data
|
|
191
|
+
y (pd.DataFrame): target data
|
|
192
|
+
type_data (str): data type, either train, test or validation
|
|
193
|
+
model_name (str): model name
|
|
194
|
+
|
|
195
|
+
Returns:
|
|
196
|
+
objects (dict): that contains ml artifacts, data , configs and models
|
|
197
|
+
'''
|
|
78
198
|
preds_proba = pipeline.predict_proba(X)
|
|
79
199
|
preds = pipeline.predict(X)
|
|
80
200
|
|
|
@@ -97,7 +217,22 @@ def data_processing_pipeline_classifier(
|
|
|
97
217
|
invhypervolsin_features = False,
|
|
98
218
|
pipeline_order = 'selector//winzorizer//discretizer//median_inputer//drop//correlation'):
|
|
99
219
|
|
|
220
|
+
'''
|
|
221
|
+
pipeline builder
|
|
100
222
|
|
|
223
|
+
Parameters:
|
|
224
|
+
features_base (list): list of features to keep in the selector step
|
|
225
|
+
features_to_drop (list): features to drop list
|
|
226
|
+
winsorizer_conf (dict): winsorising configuration dictionary
|
|
227
|
+
discretize_columns (list): feature list to discretize
|
|
228
|
+
bins_discretize (int): number of bins to discretize
|
|
229
|
+
correlation (float): correlation threshold to discard correlated features
|
|
230
|
+
fillna (boolean): if true to fill na features
|
|
231
|
+
invhypervolsin_features (list): list of features to apply inverse hyperbolic sine
|
|
232
|
+
pipeline_order (str): custom pipeline order eg. selector//winzorizer//discretizer//median_inputer//drop//correlation
|
|
233
|
+
Returns:
|
|
234
|
+
pipe (obj): pipeline object
|
|
235
|
+
'''
|
|
101
236
|
select_pipe = [('selector', FeatureSelector(features_base))] if features_base else []
|
|
102
237
|
winzorizer_pipe = [('winzorized_features', VirgoWinsorizerFeature(winsorizer_conf))] if winsorizer_conf else []
|
|
103
238
|
drop_pipe = [('drop_features' , DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
|
|
@@ -130,15 +265,62 @@ def data_processing_pipeline_classifier(
|
|
|
130
265
|
|
|
131
266
|
|
|
132
267
|
class ExpandingMultipleTimeSeriesKFold:
|
|
133
|
-
"""
|
|
268
|
+
"""
|
|
269
|
+
class that creates a custom cv schema that is compatible with sklearn cv arguments.
|
|
270
|
+
|
|
271
|
+
Attributes
|
|
272
|
+
----------
|
|
273
|
+
df : pd.DataFrame
|
|
274
|
+
dataset
|
|
275
|
+
number_window : int
|
|
276
|
+
number of train splits
|
|
277
|
+
window_size : int
|
|
278
|
+
window size data
|
|
279
|
+
overlap_size : int
|
|
280
|
+
overlap size
|
|
281
|
+
|
|
282
|
+
Methods
|
|
283
|
+
-------
|
|
284
|
+
split(X=pd.DataFrame, y=pd.DataFrame, groups=boolean):
|
|
285
|
+
custom split procedure
|
|
286
|
+
get_n_splits(X=pd.DataFrame, y=pd.DataFrame, groups=boolean):
|
|
287
|
+
get number of splits
|
|
288
|
+
"""
|
|
289
|
+
|
|
134
290
|
def __init__(self, df, window_size = 100, number_window=3, overlap_size = 0):
|
|
291
|
+
"""
|
|
292
|
+
Initialize object
|
|
293
|
+
|
|
294
|
+
Parameters
|
|
295
|
+
----------
|
|
296
|
+
df (pd.DataFrame): dataset
|
|
297
|
+
number_window (int): number of train splits
|
|
298
|
+
window_size (int): window size data
|
|
299
|
+
overlap_size (int): overlap size
|
|
300
|
+
|
|
301
|
+
Returns
|
|
302
|
+
-------
|
|
303
|
+
None
|
|
304
|
+
"""
|
|
135
305
|
self.df = df
|
|
136
306
|
self.number_window = number_window
|
|
137
307
|
self.window_size = window_size
|
|
138
308
|
self.overlap_size = overlap_size
|
|
139
309
|
|
|
140
310
|
def split(self, X, y, groups=None):
|
|
141
|
-
|
|
311
|
+
"""
|
|
312
|
+
custom split procedure
|
|
313
|
+
|
|
314
|
+
Parameters
|
|
315
|
+
----------
|
|
316
|
+
X (pd.DataFrame): input data (required for sklearn classes)
|
|
317
|
+
y (pd.DataFrame): target data (required for sklearn classes)
|
|
318
|
+
groups (boolean): to apply groups (required for sklearn classes)
|
|
319
|
+
|
|
320
|
+
Returns
|
|
321
|
+
-------
|
|
322
|
+
generator yielding (train_index, test_index) pairs, one per split
|
|
323
|
+
"""
|
|
142
324
|
if 'Date_i' not in self.df.index.names or 'i' not in self.df.index.names:
|
|
143
325
|
raise Exception('no date and/or index in the index dataframe')
|
|
144
326
|
|
|
@@ -175,4 +357,17 @@ class ExpandingMultipleTimeSeriesKFold:
|
|
|
175
357
|
yield train_index, test_index
|
|
176
358
|
|
|
177
359
|
def get_n_splits(self, X, y, groups=None):
|
|
360
|
+
"""
|
|
361
|
+
get number of splits
|
|
362
|
+
|
|
363
|
+
Parameters
|
|
364
|
+
----------
|
|
365
|
+
X (pd.DataFrame): input data (required for sklearn classes)
|
|
366
|
+
y (pd.DataFrame): target data (required for sklearn classes)
|
|
367
|
+
groups (boolean): to apply groups (required for sklearn classes)
|
|
368
|
+
|
|
369
|
+
Returns
|
|
370
|
+
-------
|
|
371
|
+
number_window (int): number of splits
|
|
372
|
+
"""
|
|
178
373
|
return self.number_window
|