virgo-modules 0.0.75__py3-none-any.whl → 0.0.76__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of virgo-modules might be problematic. Click here for more details.

@@ -6,14 +6,36 @@ import pandas as pd
6
6
 
7
7
 
8
8
  def upload_file_to_aws(bucket,key,input_path, aws_credentials):
9
-
9
+ '''
10
+ upload file from a folder to an s3 folder
11
+
12
+ Parameters:
13
+ bucket (str): key in aws_credentials whose value is the bucket name
14
+ key (str): destination object key in s3 e.g. path/to/upload/file.txt
15
+ input_path (str): input path of the file to upload e.g. path/to/upload.txt
16
+ aws_credentials (dict): aws credentials dictionary
17
+
18
+ Returns:
19
+ None
20
+ '''
10
21
  session = boto3.Session(aws_access_key_id=aws_credentials['AWS_ACCESS_KEY_ID'],aws_secret_access_key=aws_credentials['AWS_SECRET_ACCESS_KEY'])
11
22
  bucket = aws_credentials[bucket]
12
23
  s3 = session.resource('s3')
13
24
  s3.meta.client.upload_file(Filename=input_path , Bucket=bucket, Key=key)
14
25
 
15
26
  def upload_pandas_to_s3(data_frame,bucket,key, aws_credentials):
27
+ '''
28
+ upload dataframe as csv to an s3 folder
16
29
 
30
+ Parameters:
31
+ data_frame (pd.DataFrame): data
32
+ bucket (str): bucket name
33
+ key (str): destination object key in s3 e.g. path/to/upload/file.csv
34
+ aws_credentials (dict): aws credentials dictionary
35
+
36
+ Returns:
37
+ None
38
+ '''
17
39
  csv_buffer = StringIO()
18
40
  data_frame.to_csv(csv_buffer)
19
41
  csv_buffer.seek(0)
@@ -23,7 +45,17 @@ def upload_pandas_to_s3(data_frame,bucket,key, aws_credentials):
23
45
  s3.put_object(Bucket=bucket, Body=csv_buffer.getvalue(), Key= key)
24
46
 
25
47
  def download_file_to_aws(bucket,key, aws_credentials):
26
-
48
+ '''
49
+ download csv file from s3 folder
50
+
51
+ Parameters:
52
+ bucket (str): bucket name
53
+ key (str): key pattern or folder in s3 e.g. path/to/download/file.csv
54
+ aws_credentials (dict): aws credentials dictionary
55
+
56
+ Returns:
57
+ None
58
+ '''
27
59
  s3c = boto3.client(
28
60
  's3',
29
61
  region_name = aws_credentials['AWS_DEFAULT_REGION'],
@@ -12,11 +12,54 @@ from feature_engine.discretisation import EqualWidthDiscretiser
12
12
  from .ticketer_source import VirgoWinsorizerFeature, InverseHyperbolicSine
13
13
 
14
14
  class produce_model_wrapper:
15
+ """
16
+ class that wraps a pipeline and a machine learning model. It also provides train/validation data splitting
17
+
18
+ Attributes
19
+ ----------
20
+ data : pd.DataFrame
21
+ copy of the input dataset
22
+ X_train : pd.DataFrame
23
+ y_train : pd.DataFrame
24
+ X_val : pd.DataFrame
25
+ y_val : pd.DataFrame
26
+ self.pipeline: obj
27
+ sklearn pipeline combining the transformation pipeline and the model
28
+
29
+ Methods
30
+ -------
31
+ preprocess(validation_size=int, target=list):
32
+ ingest data and split data between train and validation data and X and Y data
33
+ train_model(pipe=obj, model=obj, cv_=boolean):
34
+ merge and train pipeline and machine learning model
35
+ """
15
36
  def __init__(self,data):
37
+ """
38
+ Initialize object
39
+
40
+ Parameters
41
+ ----------
42
+ data (pd.DataFrame): data
43
+
44
+ Returns
45
+ -------
46
+ None
47
+ """
16
48
  self.data = data.copy()
17
49
 
18
50
  def preprocess(self, validation_size, target):
19
-
51
+ """
52
+ ingest data and split data between train and validation data and X and Y data
53
+
54
+ Parameters
55
+ ----------
56
+ validation_size (int): number of most recent dates used as validation data; earlier dates are taken as training data
57
+ target (list): target column list
58
+
59
+ Returns
60
+ -------
61
+ None
62
+ """
20
63
  val_date = self.data.groupby('Date', as_index = False).agg(target_down = (target[0],'count')).sort_values('Date').iloc[-validation_size:,].head(1)['Date'].values[0]
21
64
 
22
65
  train_data = self.data[self.data['Date'] < val_date].dropna()
@@ -31,6 +74,18 @@ class produce_model_wrapper:
31
74
  self.y_val = y_val
32
75
 
33
76
  def train_model(self, pipe, model, cv_ = False):
77
+ """
78
+ merge and train pipeline and machine learning model
79
+
80
+ Parameters
81
+ ----------
82
+ pipe (obj): sklearn pipeline object
83
+ model (obj): machine learning model
84
+
85
+ Returns
86
+ -------
87
+ None
88
+ """
34
89
  self.model = model
35
90
  self.pipe_transform = pipe
36
91
  self.pipeline = Pipeline([('pipe_transform',self.pipe_transform), ('model',self.model)])
@@ -38,11 +93,53 @@ class produce_model_wrapper:
38
93
  self.pipeline.fit(self.X_train, self.y_train)
39
94
 
40
95
  class register_results():
96
+ """
97
+ class that collects model metrics
98
+
99
+ Attributes
100
+ ----------
101
+ model_name : str
102
+ model name
103
+ metric_logger : dict
104
+ dictionary that collects model metrics
105
+
106
+ Methods
107
+ -------
108
+ eval_metrics(pipeline=obj, X=pd.DataFrame, y=pd.DataFrame, type_data=str, phase=str):
109
+ register model metrics
110
+ print_metric_logger():
111
+ print logger results
112
+ """
41
113
  def __init__(self, model_name):
114
+ """
115
+ Initialize object
116
+
117
+ Parameters
118
+ ----------
119
+ model_name (str): model name
120
+
121
+ Returns
122
+ -------
123
+ None
124
+ """
42
125
  self.model_name = model_name
43
126
  self.metric_logger = dict()
44
127
  def eval_metrics(self, pipeline, X, y, type_data, phase):
45
-
128
+ """
129
+ register model metrics
130
+
131
+ Parameters
132
+ ----------
133
+ pipeline (obj): model pipeline
134
+ X (pd.DataFrame): input data
135
+ y (pd.DataFrame): target data
136
+ type_data (str): data type, either train, test or validation
137
+ phase (str): model phase, either baseline, feature selection, tuned model
138
+
139
+ Returns
140
+ -------
141
+ None
142
+ """
46
143
  preds_proba = pipeline.predict_proba(X)
47
144
  preds = pipeline.predict(X)
48
145
 
@@ -56,6 +153,17 @@ class register_results():
56
153
  self.metric_logger[f'{phase}//{self.model_name}//{type_data}'] = {'roc':roc, 'precision':precision, 'recall':recall}
57
154
 
58
155
  def print_metric_logger(self):
156
+ """
157
+ print logger results
158
+
159
+ Parameters
160
+ ----------
161
+ None
162
+
163
+ Returns
164
+ -------
165
+ None
166
+ """
59
167
  parts = list(self.metric_logger.keys())
60
168
  phase_parts = [ x.split('//')[0] for x in parts]
61
169
 
@@ -74,7 +182,19 @@ class register_results():
74
182
 
75
183
 
76
184
  def eval_metrics(pipeline, X, y, type_data, model_name):
77
-
185
+ '''
186
+ print metrics from a model pipeline
187
+
188
+ Parameters:
189
+ pipeline (obj): model pipeline
190
+ X (pd.DataFrame): input data
191
+ y (pd.DataFrame): target data
192
+ type_data (str): data type, either train, test or validation
193
+ model_name (str): model name
194
+
195
+ Returns:
196
+ objects (dict): that contains ml artifacts, data , configs and models
197
+ '''
78
198
  preds_proba = pipeline.predict_proba(X)
79
199
  preds = pipeline.predict(X)
80
200
 
@@ -97,7 +217,22 @@ def data_processing_pipeline_classifier(
97
217
  invhypervolsin_features = False,
98
218
  pipeline_order = 'selector//winzorizer//discretizer//median_inputer//drop//correlation'):
99
219
 
220
+ '''
221
+ pipeline builder
100
222
 
223
+ Parameters:
224
+ features_base (list): base features to select
225
+ features_to_drop (list): features to drop list
226
+ winsorizer_conf (dict): winsorising configuration dictionary
227
+ discretize_columns (list): feature list to discretize
228
+ bins_discretize (int): number of bins to discretize
229
+ correlation (float): correlation threshold to discard correlated features
230
+ fillna (boolean): if true, fill na values in the features
231
+ invhypervolsin_features (list): list of features to apply inverse hyperbolic sine
232
+ pipeline_order (str): custom pipeline order eg. selector//winzorizer//discretizer//median_inputer//drop//correlation
233
+ Returns:
234
+ pipe (obj): pipeline object
235
+ '''
101
236
  select_pipe = [('selector', FeatureSelector(features_base))] if features_base else []
102
237
  winzorizer_pipe = [('winzorized_features', VirgoWinsorizerFeature(winsorizer_conf))] if winsorizer_conf else []
103
238
  drop_pipe = [('drop_features' , DropFeatures(features_to_drop=features_to_drop))] if features_to_drop else []
@@ -130,15 +265,62 @@ def data_processing_pipeline_classifier(
130
265
 
131
266
 
132
267
  class ExpandingMultipleTimeSeriesKFold:
133
- """increasing training window where the test can be overlap"""
268
+ """
269
+ class that creates a custom cv schema that is compatible with sklearn cv arguments.
270
+
271
+ Attributes
272
+ ----------
273
+ df : pd.DataFrame
274
+ dataset
275
+ number_window : int
276
+ number of train splits
277
+ window_size : int
278
+ window size data
279
+ overlap_size : int
280
+ overlap size
281
+
282
+ Methods
283
+ -------
284
+ split(X=pd.DataFrame, y=pd.DataFrame, groups=boolean):
285
+ custom split procedure
286
+ get_n_splits(X=pd.DataFrame, y=pd.DataFrame, groups=boolean):
287
+ get number of splits
288
+ """
289
+
134
290
  def __init__(self, df, window_size = 100, number_window=3, overlap_size = 0):
291
+ """
292
+ Initialize object
293
+
294
+ Parameters
295
+ ----------
296
+ df (pd.DataFrame): dataset
297
+ number_window (int): number of train splits
298
+ window_size (int): window size data
299
+ overlap_size (int): overlap size
300
+
301
+ Returns
302
+ -------
303
+ None
304
+ """
135
305
  self.df = df
136
306
  self.number_window = number_window
137
307
  self.window_size = window_size
138
308
  self.overlap_size = overlap_size
139
309
 
140
310
  def split(self, X, y, groups=None):
141
-
311
+ """
312
+ custom split procedure
313
+
314
+ Parameters
315
+ ----------
316
+ X (pd.DataFrame): input data (required for sklearn classes)
317
+ y (pd.DataFrame): target data (required for sklearn classes)
318
+ groups (boolean): to apply groups (required for sklearn classes)
319
+
320
+ Returns
321
+ -------
322
+ generator yielding (train_index, test_index) for each split
323
+ """
142
324
  if 'Date_i' not in self.df.index.names or 'i' not in self.df.index.names:
143
325
  raise Exception('no date and/or index in the index dataframe')
144
326
 
@@ -175,4 +357,17 @@ class ExpandingMultipleTimeSeriesKFold:
175
357
  yield train_index, test_index
176
358
 
177
359
  def get_n_splits(self, X, y, groups=None):
360
+ """
361
+ get number of splits
362
+
363
+ Parameters
364
+ ----------
365
+ X (pd.DataFrame): input data (required for sklearn classes)
366
+ y (pd.DataFrame): target data (required for sklearn classes)
367
+ groups (boolean): to apply groups (required for sklearn classes)
368
+
369
+ Returns
370
+ -------
371
+ number_window (int): number of splits
372
+ """
178
373
  return self.number_window