teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +71 -0
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +51 -24
- teradataml/analytics/json_parser/utils.py +11 -17
- teradataml/automl/__init__.py +103 -48
- teradataml/automl/data_preparation.py +55 -37
- teradataml/automl/data_transformation.py +131 -69
- teradataml/automl/feature_engineering.py +117 -185
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +13 -25
- teradataml/automl/model_training.py +214 -75
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +11 -6
- teradataml/common/garbagecollector.py +5 -0
- teradataml/common/messagecodes.py +3 -1
- teradataml/common/messages.py +2 -1
- teradataml/common/utils.py +6 -0
- teradataml/context/context.py +49 -29
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/glm_example.json +28 -1
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +20 -1
- teradataml/data/scripts/sklearn/sklearn_fit.py +23 -27
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +20 -28
- teradataml/data/scripts/sklearn/sklearn_function.template +13 -18
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +18 -27
- teradataml/data/scripts/sklearn/sklearn_score.py +20 -29
- teradataml/data/scripts/sklearn/sklearn_transform.py +30 -38
- teradataml/data/teradataml_example.json +77 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +120 -61
- teradataml/dataframe/dataframe.py +102 -17
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +272 -89
- teradataml/dataframe/sql.py +84 -0
- teradataml/dbutils/dbutils.py +2 -2
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +102 -55
- teradataml/options/__init__.py +13 -4
- teradataml/options/configure.py +27 -6
- teradataml/scriptmgmt/UserEnv.py +19 -16
- teradataml/scriptmgmt/lls_utils.py +117 -14
- teradataml/table_operators/Script.py +2 -3
- teradataml/table_operators/TableOperator.py +58 -10
- teradataml/utils/validators.py +40 -2
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/METADATA +78 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/RECORD +108 -90
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/zip-safe +0 -0
teradataml/automl/feature_engineering.py

@@ -24,7 +24,7 @@ from teradataml.dataframe.dataframe import DataFrame
 from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml import Antiselect
 from teradataml import BincodeFit, BincodeTransform
-from teradataml import ColumnSummary,
+from teradataml import CategoricalSummary, ColumnSummary, ConvertTo, GetFutileColumns, FillRowId
 from teradataml import Fit, Transform
 from teradataml import NonLinearCombineFit, NonLinearCombineTransform
 from teradataml import NumApply
@@ -36,6 +36,8 @@ from teradataml import TargetEncodingFit, TargetEncodingTransform
 from sqlalchemy import literal_column
 from teradatasqlalchemy import INTEGER
 from teradataml import display
+from teradataml.common.garbagecollector import GarbageCollector
+from teradataml.dataframe.sql_functions import case
 from teradataml.hyperparameter_tuner.utils import _ProgressBar
 from teradataml.utils.validators import _Validators
 
@@ -61,12 +63,12 @@ class _FeatureEngineering:
 Types: teradataml Dataframe
 
 target_column:
-Required
+Required Argument.
 Specifies the name of the target column in "data"..
 Types: str
 
 model_list:
-Required
+Required Argument.
 Specifies the list of models to be used for model training.
 Types: list
 
@@ -81,7 +83,7 @@ class _FeatureEngineering:
 Types: int
 
 task_type:
-Required
+Required Argument.
 Specifies the task type for AutoML, whether to apply regresion OR classification
 on the provived dataset.
 Default Value: "Regression"
@@ -89,7 +91,7 @@ class _FeatureEngineering:
 Types: str
 
 custom_data:
-Optional
+Optional Argument.
 Specifies json object containing user customized input.
 Types: json object
 """
@@ -120,7 +122,7 @@ class _FeatureEngineering:
 
 PARAMETERS:
 auto:
-Optional
+Optional Argument.
 Specifies whether to run AutoML in custom mode or auto mode.
 When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
 Default Value: True
@@ -255,7 +257,7 @@ class _FeatureEngineering:
 f"Remaining Columns in the data: {self.data.shape[1]}",
 progress_bar=self.progress_bar)
 else:
-self._display_msg(inline_msg="Analysis
+self._display_msg(inline_msg="Analysis completed. No action taken.",
 progress_bar=self.progress_bar)
 
 end_time = time.time()
@@ -333,7 +335,7 @@ class _FeatureEngineering:
 f_cols = [row[0] for row in gfc_out.result.itertuples()]
 
 if len(f_cols) == 0:
-self._display_msg(inline_msg="
+self._display_msg(inline_msg="Analysis indicates all categorical columns are significant. No action Needed.",
 progress_bar=self.progress_bar)
 else:
 
@@ -350,128 +352,68 @@ class _FeatureEngineering:
 self._display_msg(msg="Total time to handle less significant features: {:.2f} sec ".format( end_time - start_time),
 progress_bar=self.progress_bar,
 show_data=True)
-
-def _handle_date_component(self,
-date_component_columns,
-date_component):
 
+def _fetch_date_component(self):
 """
 DESCRIPTION:
-Function to
-
-
-
-PARAMETERS:
-date_component_columns:
-Required Argument.
-Specifies the list of newly generated differnt component of date features.
-Types: list
-
-date_component:
-Required Argument.
-Specifies identifier for the differnt component of date features, i.e., D - Days , M - Months and Y - Year diffs.
-Types: str
-
-"""
-# Check for day
-if date_component == "D":
-prefix_value = "Day_"
-# Check for month
-elif date_component == "M":
-prefix_value = "Month_"
-# Check for year diff
-elif date_component == "Y":
-prefix_value = "Year_diff_"
+Function to fetch day of week, week of month, month of quarter, quarter of year
+component from date column. Generate weekend and month half details from day of week and
+week of month columns respectively. Convert quarter of year and month of quarter
+component columns to VARCHAR.
 
-
-
-data_size = self.data.drop_duplicate(col).size
-if data_size < 4:
-num_bins = data_size
-else:
-num_bins = 4
-# Performing bincode for converting date component to specific labels
-fit_params = {
-"data": self.data,
-"target_columns": col,
-"method_type":"Equal-Width",
-"nbins": num_bins,
-"label_prefix" : prefix_value
-}
-bin_code_fit = BincodeFit(**fit_params)
-
-fit_params_map = {"D": "day_component_fit_object",
-"M": "month_component_fit_object",
-"Y": "year_diff_component_fit_object"}
-
-# Storing fit object for each date component in data transform dictionary
-self.data_transform_dict[fit_params_map[date_component]][col] = bin_code_fit.output
-
-accumulate_columns = self._extract_list(self.data.columns, [col])
-transform_params = {
-"data": self.data,
-"object": bin_code_fit.output,
-"accumulate": accumulate_columns,
-"persist": True
-}
-self.data = BincodeTransform(**transform_params).result
-
-def _fetch_date_component(self,
-process,
-regex_str,
-columns,
-date_component):
-
-"""
-DESCRIPTION:
-Function to fetch newly generated date component features.
-Passing ahead for performing binning.
-
-PARAMETERS:
-process:
-Required Argument.
-Specifies date component of date feature which is going to be fetched and handled.
-Types: str
-
-regex_str:
-Required Argument.
-Specifies regular expression for identifying newly generated date component features.
-Types: str
-
-columns:
-Required Argument.
-Specifies list of newly generated date component features.
-Types: list
-
-date_component:
-Required Argument.
-Specifies identifier for the differnt component of date features, i.e., D - Days , M - Months and Y - Year diffs.
-Types: str
-
+RETURNS:
+List of newly generated date component features.
 """
-
-
-
-
-
-
-
-
-
-
-else:
-self._display_msg("\nNo useful feature found for {} component:".format(process),
-progress_bar=self.progress_bar)
+# List for storing newly generated date component features
+new_date_components=[]
+# Extracting weekend, month, quarter details information from date columns
+date_component_param={}
+for col in self.date_column_list:
+# Generating new column names for extracted date components
+weekend_col = f'{col}_weekend'
+month_half_col = f'{col}_month_half'
+month_of_quarter_col=f'{col}_month_of_quarter'
+quarter_of_year_col=f'{col}_quarter_of_year'
 
-
+date_component_param = {
+**date_component_param,
+weekend_col: case([(self.data[col].day_of_week().isin([1, 7]), 'yes')], else_='no'),
+month_half_col: case([(self.data[col].week_of_month().isin([1, 2]), 'first_half')], else_='second_half'),
+month_of_quarter_col: self.data[col].month_of_quarter(),
+quarter_of_year_col: self.data[col].quarter_of_year()
+}
+# Storing newly generated date component month and quarter columns.
+# Skipping day of week and week of month columns as they will be used
+# later for extracting weekend and month part details.
+new_date_components.extend([weekend_col, month_half_col, month_of_quarter_col, quarter_of_year_col])
+# Adding new date component columns to dataset
+self.data=self.data.assign(**date_component_param)
+# Dropping date columns as different component columns are extracted.
+self.data = self.data.drop(self.date_column_list, axis=1)
+
+# Converting remaining component columns to VARCHAR
+# So that it will be treated as categorical columns
+remaining_component_columns = [col for col in self.data.columns if re.search('month_of_quarter|quarter_of_year'+"$", col)]
+accumulate_columns = self._extract_list(self.data.columns, remaining_component_columns)
+convertto_params = {
+"data" : self.data,
+"target_columns" : remaining_component_columns,
+"target_datatype" : ["VARCHAR(charlen=20,charset=UNICODE,casespecific=NO)"],
+"accumulate" : accumulate_columns,
+"persist" : True
+}
+# returning dataset after performing string manipulation
+self.data = ConvertTo(**convertto_params).result
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
+return new_date_components
 
 def _handle_date_columns_helper(self):
 
 """
 DESCRIPTION:
-Function for dropping irrelevent date features.
-
-Passing extracted component for performing binning.
+Function for dropping irrelevent date features. Perform Extraction of different
+component from revelent date features and transform them.
 """
 
 # Dropping missing value for all date columns
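The hunk above replaces the old day/month/year-difference extraction (EXTRACT expressions via literal_column plus per-component BincodeFit/BincodeTransform binning) with a single _fetch_date_component that derives weekend, month-half, month-of-quarter and quarter-of-year features directly from each date column. A minimal sketch of that pattern, assuming an active Vantage connection; the table name "sales" and the DATE column "order_date" are hypothetical:

# Sketch only: derive categorical date features the way the new code does.
from teradataml import DataFrame
from teradataml.dataframe.sql_functions import case

df = DataFrame("sales")            # hypothetical table
col = "order_date"                 # hypothetical DATE column
derived = {
    # the diff treats day_of_week() values 1 and 7 as weekend days
    f"{col}_weekend": case([(df[col].day_of_week().isin([1, 7]), 'yes')], else_='no'),
    f"{col}_month_half": case([(df[col].week_of_month().isin([1, 2]), 'first_half')], else_='second_half'),
    f"{col}_month_of_quarter": df[col].month_of_quarter(),
    f"{col}_quarter_of_year": df[col].quarter_of_year(),
}
# New categorical features replace the raw date column, as in the hunk above.
df = df.assign(**derived).drop([col], axis=1)

In the actual change, the month_of_quarter and quarter_of_year columns are then cast to VARCHAR with ConvertTo so that later encoding treats them as categorical, and the persisted result table is registered with the garbage collector.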
@@ -484,7 +426,7 @@ class _FeatureEngineering:
 # Date columns list eligible for dropping from dataset
 drop_date_cols = []
 
-# Checking for
+# Checking for unique valued date columns
 for col in self.date_column_list:
 if self.data.drop_duplicate(col).size == self.data.shape[0]:
 drop_date_cols.append(col)
@@ -496,46 +438,18 @@ class _FeatureEngineering:
 self._display_msg(msg='Dropping date features with all unique value:',
 col_lst = drop_date_cols,
 progress_bar=self.progress_bar)
-
-
-self.date_column_list = [item for item in self.date_column_list if item not in drop_date_cols]
+# Updated date column list after dropping irrelevant date columns
+self.date_column_list = [item for item in self.date_column_list if item not in drop_date_cols]
 
-# List for storing newly generated date component features
-new_columns=[]
-
-# Extracting day, month and year difference from date columns
 if len(self.date_column_list) != 0:
 
-
-
-
-day_column=str(col)+"_day_comp"
-month_column=str(col)+"_month_comp"
-year_diff_column=str(col)+"_year_diff_comp"
-new_columns.extend([day_column,month_column,year_diff_column])
-day_query=("EXTRACT(DAY FROM {0})".format(col))
-month_query=("EXTRACT(MONTH FROM {0})".format(col))
-year_query=("EXTRACT(YEAR FROM CURRENT_DATE) - EXTRACT(YEAR FROM {0})".format(col))
-component_param[day_column]=literal_column(day_query,INTEGER())
-component_param[month_column]=literal_column(month_query,INTEGER())
-component_param[year_diff_column]=literal_column(year_query,INTEGER())
-
-self.data=self.data.assign(**component_param)
-# Storing newly generated date component list along with parameters in data transform dictionary
-self.data_transform_dict['extract_date_comp_col'] = self.date_column_list
-self.data_transform_dict['extract_date_comp_param'] = component_param
-
-# Dropping date columns as we have already extracted day, month and year in new columns
-self.data = self.data.drop(self.date_column_list, axis=1)
+# List for storing newly generated date component features
+new_columns=self._fetch_date_component()
 self._display_msg(msg='List of newly generated features from existing date features:',
 col_lst=new_columns,
 progress_bar=self.progress_bar)
-
-data=self.data,
-progress_bar=self.progress_bar)
-
+# Dropping columns with all unique values or single value
 drop_cols=[]
-
 for col in new_columns:
 distinct_rows = self.data.drop_duplicate(col).size
 if distinct_rows == self.data.shape[0]:
@@ -555,21 +469,11 @@ class _FeatureEngineering:
 self.data = self.data.drop(drop_cols, axis=1)
 # Storing extract date component list for drop in data transform dictionary
 self.data_transform_dict['drop_extract_date_columns'] = drop_cols
-
-
-new_columns = [item for item in new_columns if item not in drop_cols]
+# Extracting all newly generated columns
+new_columns = [item for item in new_columns if item not in drop_cols]
 
-
-
-'day_component_fit_object': {},
-'month_component_fit_object': {},
-'year_diff_component_fit_object': {}}
-# Grouping date components based on types i.e., day, month, and year_diff for performing binning
-if len(new_columns) != 0:
-self.day_columns = self._fetch_date_component("day", "_day_comp", new_columns, "D")
-self.month_columns = self._fetch_date_component("month", "_month_comp", new_columns, "M")
-self.year_diff_columns = self._fetch_date_component("year_diff", "_year_diff_comp", new_columns, "Y")
-self._display_msg(inline_msg="No useful date component found",
+self._display_msg(msg='Updated list of newly generated features from existing date features :',
+col_lst=new_columns,
 progress_bar=self.progress_bar)
 
 self._display_msg(msg='Updated dataset sample after handling date features:',
@@ -595,7 +499,7 @@ class _FeatureEngineering:
 if d_type in ["datetime.date","datetime.datetime"]]
 
 if len(self.date_column_list) == 0:
-self._display_msg(inline_msg="Dataset does not contain any feature related to dates.",
+self._display_msg(inline_msg="Analysis Completed. Dataset does not contain any feature related to dates. No action needed.",
 progress_bar=self.progress_bar)
 else:
 # Storing date column list in data transform dictionary
@@ -622,8 +526,7 @@ class _FeatureEngineering:
 self.data = self.data.dropna(subset=[self.target_column])
 
 obj = ColumnSummary(data=self.data,
-target_columns=self.data.columns
-volatile=True)
+target_columns=self.data.columns)
 
 cols_miss_val={}
 # Iterating over each row in the column summary result
@@ -705,11 +608,15 @@ class _FeatureEngineering:
 self.data_transform_dict['imputation_columns'] = self.imputation_cols
 
 if len(delete_rows) != 0:
+rows = self.data.shape[0]
 self.data = self.data.dropna(subset=delete_rows)
 msg_val_found=1
 self._display_msg(msg='Deleting rows of these columns for handling missing values:',
 col_lst=delete_rows,
 progress_bar=self.progress_bar)
+self._display_msg(msg=f'Sample of dataset after removing {rows-self.data.shape[0]} rows:',
+data=self.data,
+progress_bar=self.progress_bar)
 
 if len(drop_cols) != 0:
 self.data = self.data.drop(drop_cols, axis=1)
@@ -719,9 +626,12 @@ class _FeatureEngineering:
 self._display_msg(msg='Dropping these columns for handling missing values:',
 col_lst=drop_cols,
 progress_bar=self.progress_bar)
+self._display_msg(msg=f'Sample of dataset after removing {len(drop_cols)} columns:',
+data=self.data,
+progress_bar=self.progress_bar)
 
 if len(self.imputation_cols) == 0 and msg_val_found ==0:
-self._display_msg(inline_msg="No Missing Values Detected.",
+self._display_msg(inline_msg="Analysis Completed. No Missing Values Detected.",
 progress_bar=self.progress_bar)
 
 end_time = time.time()
@@ -787,21 +697,19 @@ class _FeatureEngineering:
 
 fit_obj = SimpleImputeFit(data=self.data,
 stats_columns=col_stat,
-stats=stat
-volatile=True)
+stats=stat)
 
 # Storing fit object for imputation in data transform dictionary
 self.data_transform_dict['imputation_fit_object'] = fit_obj.output
 sm = SimpleImputeTransform(data=self.data,
-object=fit_obj
-volatile=True)
+object=fit_obj)
 
 self.data = sm.result
-self._display_msg(msg="Sample of
+self._display_msg(msg="Sample of dataset after Imputation:",
 data=self.data,
 progress_bar=self.progress_bar)
 else:
-self._display_msg(inline_msg="No imputation
+self._display_msg(inline_msg="Analysis completed. No imputation required.",
 progress_bar=self.progress_bar)
 
 end_time = time.time()
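The imputation hunk above drops the volatile=True keyword from both SimpleImputeFit and SimpleImputeTransform, so fit and transform now run with default table handling. A minimal sketch of the resulting call sequence, assuming a connected session and a teradataml DataFrame df; the column names and statistics are hypothetical:

from teradataml import SimpleImputeFit, SimpleImputeTransform

fit_obj = SimpleImputeFit(data=df,
                          stats_columns=["age", "income"],  # hypothetical columns
                          stats=["median", "mean"])         # hypothetical statistics
# The transform takes the fit object directly, matching the call in the hunk above.
df = SimpleImputeTransform(data=df, object=fit_obj).result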
@@ -898,6 +806,8 @@ class _FeatureEngineering:
 }
 # Updating dataset with transform result
 self.data = SimpleImputeTransform(**transform_param).result
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 self._display_msg(msg="Updated dataset sample after performing customized missing value imputation:",
 data=self.data,
 progress_bar=self.progress_bar)
@@ -987,6 +897,8 @@ class _FeatureEngineering:
 "persist" : True,
 }
 self.data = BincodeTransform(**eql_transform_params).result
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 self._display_msg(msg="\nUpdated dataset sample after performing Equal-Width binning :-",
 data=self.data,
 progress_bar=self.progress_bar)
@@ -1026,6 +938,8 @@ class _FeatureEngineering:
 "persist" : True
 }
 self.data = BincodeTransform(**var_transform_params).result
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 self._display_msg(msg="Updated dataset sample after performing Variable-Width binning:",
 data=self.data,
 progress_bar=self.progress_bar)
@@ -1125,7 +1039,10 @@ class _FeatureEngineering:
 "string_length" : string_length}
 
 # returning dataset after performing string manipulation
-
+transform_output = StrApply(**fit_params).result
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+return transform_output
 
 def _one_hot_encoding(self,
 one_hot_columns,
@@ -1173,8 +1090,10 @@ class _FeatureEngineering:
 "persist" : True
 }
 # Performing one hot encoding transformation
-
-
+transform_output = OneHotEncodingTransform(**transform_params).result
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+self.data = transform_output.drop(drop_lst, axis=1)
 
 def _ordinal_encoding(self,
 ordinal_columns):
@@ -1191,8 +1110,7 @@ class _FeatureEngineering:
 # Adding fit parameters for performing encoding
 fit_params = {
 "data" : self.data,
-"target_column" : ordinal_columns
-"volatile" : True
+"target_column" : ordinal_columns
 }
 # Performing ordinal encoding fit on target columns
 ord_fit_obj = OrdinalEncodingFit(**fit_params)
@@ -1214,6 +1132,8 @@ class _FeatureEngineering:
 }
 # Performing ordinal encoding transformation
 self.data = OrdinalEncodingTransform(**transform_params).result
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
 if len(ordinal_columns) == 1 and ordinal_columns[0] == self.target_column:
 self.target_label = ord_fit_obj
@@ -1276,6 +1196,8 @@ class _FeatureEngineering:
 }
 # Performing ordinal encoding transformation
 self.data = TargetEncodingTransform(**transform_params).result
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
 def _encoding_categorical_columns(self):
 """
@@ -1308,8 +1230,11 @@ class _FeatureEngineering:
 self._display_msg(msg="ONE HOT Encoding these Columns:",
 col_lst=ohe_col,
 progress_bar=self.progress_bar)
+self._display_msg(msg="Sample of dataset after performing one hot encoding:",
+data=self.data,
+progress_bar=self.progress_bar)
 else:
-self._display_msg(inline_msg="
+self._display_msg(inline_msg="Analysis completed. No categorical columns were found.",
 progress_bar=self.progress_bar)
 
 # List of columns after one hot
@@ -1434,7 +1359,10 @@ class _FeatureEngineering:
 sigmoid_style=transform_val["sigmoid_style"]
 fit_params = {**fit_params, "sigmoid_style" : sigmoid_style}
 # Performing transformation on target columns
-
+transform_output = NumApply(**fit_params).result
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+return transform_output
 
 def _numerical_transformation(self, target_columns, num_transform_data):
 """
@@ -1465,7 +1393,9 @@ class _FeatureEngineering:
 "persist" :True
 }
 # Peforming transformation on target columns
-self.data = Transform(**transform_params).result
+self.data = Transform(**transform_params).result
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 self._display_msg(msg="Updated dataset sample after applying numerical transformation:",
 data=self.data,
 progress_bar=self.progress_bar)
@@ -1595,6 +1525,8 @@ class _FeatureEngineering:
 "persist" : True
 }
 self.data = NonLinearCombineTransform(**transform_params).result
+# Adding transformed data containing table to garbage collector
+GarbageCollector._add_to_garbagecollector(self.data._table_name)
 else:
 self._display_msg(inline_msg="Combinations are not as per expectation.",
 progress_bar=self.progress_bar)
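A pattern repeats across the transform hunks above: every persisted transform result (SimpleImputeTransform, BincodeTransform, StrApply, OneHotEncodingTransform, OrdinalEncodingTransform, TargetEncodingTransform, NumApply, Transform, NonLinearCombineTransform) is now registered with teradataml's internal garbage collector so the backing table can be dropped during cleanup. A condensed sketch of the pattern, assuming transform_params has already been built with "persist": True; note that GarbageCollector._add_to_garbagecollector is an internal call shown in this diff, not a public API:

from teradataml import OrdinalEncodingTransform
from teradataml.common.garbagecollector import GarbageCollector

# transform_params: dict of OrdinalEncodingTransform arguments built earlier,
# including "persist": True so the result lands in a regular table.
result_df = OrdinalEncodingTransform(**transform_params).result
# _table_name holds the name of the persisted result table; registering it lets
# teradataml drop that table later, as done in the hunks above.
GarbageCollector._add_to_garbagecollector(result_df._table_name)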
teradataml/automl/feature_exploration.py

@@ -21,6 +21,7 @@ from teradataml import ColumnSummary, CategoricalSummary, GetFutileColumns
 from teradataml import OutlierFilterFit, OutlierFilterTransform
 from teradataml.hyperparameter_tuner.utils import _ProgressBar
 from teradataml.common.messages import Messages, MessageCodes
+from teradataml import display as dp
 
 def _is_terminal():
 """
@@ -158,13 +159,14 @@ class _FeatureExplore:
 Internal function displays the column summary of categorical column such as
 datatype, null count, non null count, zero count.
 """
+dp.max_rows = self.data.shape[1]
 # Column Summary of all columns of dataset
 obj = ColumnSummary(data=self.data,
-target_columns=self.data.columns
-volatile=True)
+target_columns=self.data.columns)
 self._display_msg(msg='\nColumn Summary:',
 data=obj.result,
 show_data=True)
+dp.max_rows = 10
 
 def _categorical_summary(self,
 categorical_columns=None):
@@ -503,6 +505,11 @@ class _FeatureExplore:
 progress_bar.update(msg=msg, data=col_lst if col_lst else data if data is not None else None,
 progress=False,
 ipython=not self.terminal_print)
+# Displaying shape of data
+if data is not None:
+progress_bar.update(msg=f'{data.shape[0]} rows X {data.shape[1]} columns',
+progress=False,
+ipython=not self.terminal_print)
 # If an inline message is provided instead
 elif inline_msg:
 # Update the progress bar with the inline message
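The _FeatureExplore hunks above import the display options object as dp and temporarily raise max_rows to the number of columns so the full column summary prints, then set it back to 10 (the value the diff restores). A minimal sketch of that pattern, assuming a connected session and a teradataml DataFrame df:

from teradataml import ColumnSummary, display as dp

dp.max_rows = df.shape[1]   # one summary row per column, so nothing is truncated
summary = ColumnSummary(data=df, target_columns=df.columns).result
print(summary)
dp.max_rows = 10            # restore the value the diff resets it to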
teradataml/automl/model_evaluation.py

@@ -18,6 +18,7 @@ import time
 
 # Teradata libraries
 from teradataml.dataframe.dataframe import DataFrame
+from teradataml.automl.model_training import _ModelTraining
 
 
 class _ModelEvaluator:
@@ -38,12 +39,12 @@ class _ModelEvaluator:
 Types: teradataml Dataframe
 
 target_column:
-Required
+Required Argument.
 Specifies the target column present inside the dataset.
 Types: str
 
 task_type:
-Required
+Required Argument.
 Specifies the task type for AutoML, whether to apply regresion OR classification
 on the provived dataset.
 Default Value: "Regression"
@@ -115,37 +116,24 @@ class _ModelEvaluator:
 model = self.model_info.loc[rank]
 
 # Defining eval_params
-eval_params =
-
-
-# eval_params for Classification
-if self.task_type != "Regression":
-# XGboost
-if model['Name'] == 'xgboost':
-eval_params['model_type'] = 'Classification'
-eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter','class_num', 'tree_order']
-else:
-# DF,KNN,SVM,GLM
-eval_params['output_prob'] = True
-else:
-# eval_params for Regression in XGboost
-if model['Name'] == 'xgboost':
-eval_params['model_type'] = 'Regression'
-eval_params['object_order_column'] = ['task_index', 'tree_num', 'iter', 'tree_order']
-
+eval_params = _ModelTraining._eval_params_generation(model['Name'],
+self.target_column,
+self.task_type)
 
 # Test Data
-test = DataFrame(self.table_name_mapping['{}_test'.format(model['Feature
+test = DataFrame(self.table_name_mapping['{}_test'.format(model['Feature-Selection'])])
 
 # Getting test data from table
 if not self.test_data_ind:
 # Test Data
-test = DataFrame(self.table_name_mapping['{}_test'.format(model['Feature
+test = DataFrame(self.table_name_mapping['{}_test'.format(model['Feature-Selection'])])
 else:
-test = DataFrame(self.table_name_mapping['{}_new_test'.format(model['Feature
+test = DataFrame(self.table_name_mapping['{}_new_test'.format(model['Feature-Selection'])])
+
+print("\nFollowing model is being used for generating prediction :")
+print("Model ID :", model['Model-ID'],
+"\nFeature Selection Method :",model['Feature-Selection'])
 
-print(model['Name'], model['Feature selection'])
-
 # Evaluation and predictions
 if model['Name'] == 'knn':
 metrics = model['model-obj'].evaluate(test_data=test)