teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
--- a/teradataml/automl/data_transformation.py
+++ b/teradataml/automl/data_transformation.py
@@ -23,6 +23,7 @@ from teradataml.dataframe.copy_to import copy_to_sql
 from teradataml import Antiselect
 from teradataml import BincodeTransform
 from teradataml import ConvertTo
+from teradataml import execute_sql
 from teradataml import FillRowId
 from teradataml import NonLinearCombineTransform
 from teradataml import OneHotEncodingTransform
@@ -32,7 +33,6 @@ from teradataml import ScaleTransform
 from teradataml import SimpleImputeTransform
 from teradataml import TargetEncodingTransform
 from teradataml import Transform, UtilFuncs, TeradataConstants
-from teradataml import execute_sql
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.hyperparameter_tuner.utils import _ProgressBar
 from teradataml.options.configure import configure
@@ -48,10 +48,12 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
     def __init__(self,
                  data,
                  data_transformation_params,
-                 auto,
-                 verbose,
-                 target_column_ind,
-                 table_name_mapping):
+                 auto=True,
+                 verbose=0,
+                 target_column_ind=False,
+                 table_name_mapping={},
+                 cluster=False,
+                 feature_selection_method=None):
         """
         DESCRIPTION:
             Function initializes the data, data transformation object and running mode
@@ -89,7 +91,25 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 Optional Argument.
                 Specifies whether target column is present in given dataset.
                 Default Value: False
-                Types: bool
+                Types: bool
+
+            table_name_mapping:
+                Optional Argument.
+                Specifies the mapping of table names for the transformed data.
+                Default Value: {}
+                Types: dict
+
+            cluster:
+                Optional Argument.
+                Specifies whether to apply clustering techniques.
+                Default Value: False
+                Types: bool
+
+            feature_selection_method:
+                Optional Argument.
+                Specifies the feature selection method to be used.
+                Default Value: None
+                Types: str
         """
         self.data = data
         self.data_transformation_params = data_transformation_params
@@ -97,9 +117,13 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         self.verbose = verbose
         self.target_column_ind = target_column_ind
         self.table_name_mapping = table_name_mapping
+        self.data_types = {key: value for key, value in self.data._column_names_and_types}
         self.data_node_id = data._nodeid
         self.table_name_mapping[self.data_node_id] = {}
 
+        self.cluster = cluster
+        self.feature_selection_method = feature_selection_method
+
     def data_transformation(self):
         """
         DESCRIPTION:
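
The constructor changes above replace required positional arguments with keyword defaults. A minimal sketch of how the new signature might be exercised — `_DataTransformation` is an internal class rather than public API, and `df` and `params` below are hypothetical stand-ins for a teradataml DataFrame and a transformation-parameter dict produced by an earlier AutoML fit run:

    # Internal module under diff; not a public import path.
    from teradataml.automl.data_transformation import _DataTransformation

    # Hypothetical illustration of the 20.0.0.7 constructor defaults.
    dt = _DataTransformation(data=df,
                             data_transformation_params=params,
                             cluster=True,                   # new in 20.0.0.7
                             feature_selection_method=None)  # new in 20.0.0.7
    # Returns the table name mapping built during transformation (see below).
    table_map = dt.data_transformation()
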
@@ -112,15 +136,17 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         """
         # Initializing Feature Exploration
         _FeatureExplore.__init__(self,
-                                 data
-                                 target_column
-                                 verbose
+                                 data=self.data,
+                                 target_column=None,
+                                 verbose=self.verbose,
+                                 cluster=self.cluster)
         # Initializing Feature Engineering
-        _FeatureEngineering.__init__(self,
-                                     data
-                                     target_column
-                                     model_list
-                                     verbose
+        _FeatureEngineering.__init__(self,
+                                     data=self.data,
+                                     target_column=None,
+                                     model_list=None,
+                                     verbose=self.verbose,
+                                     cluster=self.cluster)
 
         self._display_msg(msg="Data Transformation started ...", show_data=True)
         # Extracting target column details and type whether it is classification or not
@@ -128,13 +154,14 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         self.classification_type = self.data_transformation_params.get("classification_type", False)
 
         # Setting number of jobs for progress bar based on mode of execution
-        jobs = 10 if self.auto else 15
+        jobs = 9 if self.cluster else (10 if self.auto else 15)
         self.progress_bar = _ProgressBar(jobs=jobs, verbose=2, prefix='Transformation Running:')
 
         # Performing transformation carried out in feature engineering phase
         self.feature_engineering_transformation()
+
         # Performing transformation carried out in data preparation phase
-        self.data_preparation_transformation()
+        self.data_preparation_transformation(feature_selection_method=self.feature_selection_method)
         self._display_msg(msg="Data Transformation completed.", show_data=True)
 
         return self.table_name_mapping
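
The reworked job count above compacts to a nested conditional. A standalone sketch of the same rule (the function name is illustrative, not part of the package):

    def transformation_job_count(cluster: bool, auto: bool) -> int:
        # Clustering pipelines track 9 progress-bar steps; otherwise an
        # AutoML-managed run tracks 10 and a custom run tracks 15.
        if cluster:
            return 9
        return 10 if auto else 15
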
@@ -157,8 +184,9 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         self.progress_bar.update()
 
         # Handling target column transformation
-        if self.target_column_ind and self.classification_type:
-            self._handle_target_column_transformation()
+        if not self.cluster:
+            if self.target_column_ind and self.classification_type:
+                self._handle_target_column_transformation()
         self.progress_bar.update()
 
         self._date_column_handling_transformation()
@@ -193,7 +221,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             self._custom_anti_select_column_transformation()
             self.progress_bar.update()
 
-    def data_preparation_transformation(self):
+    def data_preparation_transformation(self, feature_selection_method=None):
         """
         DESCRIPTION:
             Function performs transformation carried out in data preparation phase
@@ -209,14 +237,21 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
 
         # Performing transformation including feature selection using lasso, rfe and pca
         # followed by scaling
-        self._feature_selection_lasso_transformation()
-        self.progress_bar.update()
-
-        self._feature_selection_rfe_transformation()
-        self.progress_bar.update()
+        if not self.cluster:
+            self._feature_selection_lasso_transformation()
+            self.progress_bar.update()
 
-        self._feature_selection_pca_transformation()
-        self.progress_bar.update()
+            self._feature_selection_rfe_transformation()
+            self.progress_bar.update()
+
+            self._feature_selection_pca_transformation()
+            self.progress_bar.update()
+        else:
+            self._feature_selection_pca_transformation()
+            self.progress_bar.update()
+
+            self._feature_selection_non_pca_transformation()
+            self.progress_bar.update()
 
     def _preprocess_transformation(self):
         """
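
Read together with the helper methods further down, the two branches above persist different sets of test tables. A sketch of the assumed per-node mapping (the key names come from the hunks below; the exact structure is an inference from this diff, not documented behavior):

    # Assumed contents of self.table_name_mapping[self.data_node_id]
    # after data_preparation_transformation() completes.
    non_cluster_run = {"lasso_test": "<table>", "rfe_test": "<table>", "pca_test": "<table>"}
    cluster_run = {"pca_test": "<table>", "non_pca_test": "<table>"}
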
@@ -224,7 +259,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             Function drops irrelevent columns and adds id column.
         """
         # Extracting irrelevant column list
-        columns_to_be_removed = self.data_transformation_params.get("
+        columns_to_be_removed = self.data_transformation_params.get("drop_irrelevant_columns", None)
         if columns_to_be_removed:
             self.data = self.data.drop(columns_to_be_removed, axis=1)
             self._display_msg(msg="\nUpdated dataset after dropping irrelevant columns :",
@@ -297,9 +332,20 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         imputation_cols = self.data_transformation_params.get("imputation_columns", None)
         if imputation_cols:
             sm_fit_obj = self.data_transformation_params.get("imputation_fit_object")
+            ## Workaround done for bug https://teradata-pe.atlassian.net/browse/TDAF-15617.
+            #partition_column = self.data_transformation_params.get("imputation_partition_column", None)
+
+            params = {"data" : self.data,
+                      "object" : sm_fit_obj
+                      }
+
+            # if partition_column is not None:
+            #     params["data_partition_column"] = partition_column
+            #     params["object_partition_column"] = partition_column
+
             # imputing column using fit object
-            self.data = SimpleImputeTransform(
-
+            self.data = SimpleImputeTransform(**params).result
+
             self._display_msg(msg="\nUpdated dataset after imputing missing value containing columns :",
                               data=self.data,
                               progress_bar=self.progress_bar)
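
Routing the arguments through a dict keeps the disabled partition-column path easy to restore once TDAF-15617 is resolved. A hedged sketch of what re-enabling it would look like — same parameter names as the commented-out code above, behavior unverified until the bug is fixed, and `df`, `sm_fit_obj`, and `transformation_params` are hypothetical stand-ins for the instance state:

    from teradataml import SimpleImputeTransform

    params = {"data": df, "object": sm_fit_obj}
    partition_column = transformation_params.get("imputation_partition_column", None)
    if partition_column is not None:
        # Partition both the input data and the fit object on the same column.
        params["data_partition_column"] = partition_column
        params["object_partition_column"] = partition_column
    df = SimpleImputeTransform(**params).result
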
@@ -438,7 +484,34 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             self._display_msg(msg="\nUpdated dataset after performing categorical encoding :",
                               data=self.data,
                               progress_bar=self.progress_bar)
+            return
 
+        # AutoFraud Routine
+        auto_target_encoding_ind = self.data_transformation_params.get("auto_target_encoding_ind", False)
+        auto_target_encoding_fit_obj = self.data_transformation_params.get("auto_target_encoding_fit_obj", None)
+        target_encoding_accumulate_columns = self.data_transformation_params.get("target_encoding_accumulate_columns")
+
+        if auto_target_encoding_ind:
+            # Adding transform parameters for performing encoding
+            transform_params = {
+                "data" : self.data,
+                "object" : auto_target_encoding_fit_obj,
+                "accumulate" : target_encoding_accumulate_columns,
+                "is_input_dense" : True,
+                "persist" : True,
+                "display_table_name" : False
+            }
+
+            # Performing one hot encoding transformation
+            self.data = TargetEncodingTransform(**transform_params).result
+
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
+
+            self._display_msg(msg="\nUpdated dataset after performing categorical encoding :",
+                              data=self.data,
+                              progress_bar=self.progress_bar)
+
     def _custom_categorical_encoding_transformation(self):
         """
         DESCRIPTION:
@@ -493,7 +566,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             warnings.warn(message=f"Unseen categorical values found in test data column(s): {warn_cols}. \
                           This may cause inaccurate predictions. Consider retraining the model with updated data.",
                           stacklevel=0)
-
+
         self._display_msg(msg="\nUpdated dataset after performing customized categorical encoding :",
                           data=self.data,
                           progress_bar=self.progress_bar)
@@ -628,7 +701,9 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             for classification problem.
         """
         # Fetching target column encoding indicator and fit object
+
         target_col_encode_ind = self.data_transformation_params.get("target_col_encode_ind", False)
+
         if target_col_encode_ind:
             # Extracting ordinal encoding fit object for target column
             target_col_ord_encoding_fit_obj = self.data_transformation_params.get("target_col_ord_encoding_fit_obj", None)
@@ -647,14 +722,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
             self.data = OrdinalEncodingTransform(**transform_params).result
             # Adding transformed data containing table to garbage collector
             GarbageCollector._add_to_garbagecollector(self.data._table_name)
-
-            params = {
-                "data" : self.data,
-                "target_columns" : [self.data_target_column],
-                "target_datatype" : ["integer"],
-                "accumulate" : self._extract_list(self.data.columns, [self.data_target_column])
-            }
-            self.data = ConvertTo(**params).result
+
         self._display_msg(msg="\nUpdated dataset after performing target column transformation :",
                           data=self.data,
                           progress_bar=self.progress_bar)
@@ -715,17 +783,17 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                                        accumulate=accumulate_cols).result
             # Displaying scaled dataset
             self._display_msg(msg="\nUpdated dataset after performing scaling on Lasso selected features :",
-
-
+                              data=lasso_df,
+                              progress_bar=self.progress_bar)
 
             # Uploading lasso dataset to table for further use
-            table_name = UtilFuncs._generate_temp_table_name(prefix="
+            table_name = UtilFuncs._generate_temp_table_name(prefix="lasso_test",
                                                              table_type = TeradataConstants.TERADATA_TABLE)
             # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
             # table name in fully qualified format.
             table_name = UtilFuncs._extract_table_name(table_name)
             # Storing table name mapping for lasso dataset
-            self.table_name_mapping[self.data_node_id]["
+            self.table_name_mapping[self.data_node_id]["lasso_test"] = table_name
             # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
             is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
             copy_to_sql(df = lasso_df, table_name= table_name, if_exists="replace", temporary=is_temporary)
@@ -760,17 +828,17 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                                      accumulate=accumulate_cols).result
             # Displaying scaled dataset
             self._display_msg(msg="\nUpdated dataset after performing scaling on RFE selected features :",
-
-
+                              data=rfe_df,
+                              progress_bar=self.progress_bar)
 
             # Uploading rfe dataset to table for further use
-            table_name = UtilFuncs._generate_temp_table_name(prefix="
+            table_name = UtilFuncs._generate_temp_table_name(prefix="rfe_test",
                                                              table_type = TeradataConstants.TERADATA_TABLE)
             # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
             # table name in fully qualified format.
             table_name = UtilFuncs._extract_table_name(table_name)
             # Storing table name mapping for rfe dataset
-            self.table_name_mapping[self.data_node_id]["
+            self.table_name_mapping[self.data_node_id]["rfe_test"] = table_name
             # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
             is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
             copy_to_sql(df = rfe_df, table_name= table_name, if_exists="replace", temporary=is_temporary)
@@ -783,19 +851,19 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         # Extracting fit object and column details for perfroming feature scaling
         pca_scale_fit_obj = self.data_transformation_params.get("pca_scale_fit_obj", None)
         pca_scale_col = self.data_transformation_params.get("pca_scale_col", None)
-
-        accumulate_cols = self._extract_list(self.data.columns, pca_scale_col)
-
+
         pca_scaled_df = self.data
         if pca_scale_fit_obj is not None:
+            # Extracting accumulate columns
+            accumulate_cols = self._extract_list(self.data.columns, pca_scale_col)
             # Scaling on pca dataset
             pca_scaled_df = ScaleTransform(data=self.data,
                                            object=pca_scale_fit_obj,
                                            accumulate=accumulate_cols).result
             # Displaying scaled dataset
             self._display_msg(msg="\nUpdated dataset after performing scaling for PCA feature selection :",
-
-
+                              data=pca_scaled_df,
+                              progress_bar=self.progress_bar)
 
         # Convert to pandas dataframe for applying pca
         pca_scaled_pd = pca_scaled_df.to_pandas().reset_index()
@@ -832,14 +900,47 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                           progress_bar=self.progress_bar)
 
         # Uploading pca dataset to table for further use
-        table_name = UtilFuncs._generate_temp_table_name(prefix="
+        table_name = UtilFuncs._generate_temp_table_name(prefix="pca_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
         # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
         # table name in fully qualified format.
         table_name = UtilFuncs._extract_table_name(table_name)
         # Storing table name mapping for pca dataset
-        self.table_name_mapping[self.data_node_id]["
+        self.table_name_mapping[self.data_node_id]["pca_test"] = table_name
         # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
         is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
         copy_to_sql(df = pca_df, table_name=table_name, if_exists="replace", temporary=is_temporary)
+
+    def _feature_selection_non_pca_transformation(self):
+        """
+        DESCRIPTION:
+            Function performs feature scaling on raw data for non-PCA clustering models.
+        """
+        self._display_msg(msg="\nRunning Non-PCA feature selection transformation for clustering...",
+                          show_data=True,
+                          progress_bar=self.progress_bar)
 
+        # Extracting fit object and columns for scaling
+        non_pca_scale_fit_obj = self.data_transformation_params.get("non_pca_scale_fit_obj", None)
+        non_pca_scale_col = self.data_transformation_params.get("non_pca_scale_col", None)
+
+        if non_pca_scale_fit_obj is not None and non_pca_scale_col is not None:
+            accumulate_cols = self._extract_list(self.data.columns, non_pca_scale_col)
+
+            # Scaling dataset
+            scaled_df = ScaleTransform(data=self.data,
+                                       object=non_pca_scale_fit_obj,
+                                       accumulate=accumulate_cols).result
+
+            # Displaying scaled dataset
+            self._display_msg(msg="\nUpdated dataset after performing Non-PCA scaling for clustering:",
+                              data=scaled_df,
+                              progress_bar=self.progress_bar)
+
+            # Uploading non_pca dataset to SQL
+            table_name = UtilFuncs._generate_temp_table_name(prefix="non_pca_test",
+                                                             table_type=TeradataConstants.TERADATA_TABLE)
+            self.table_name_mapping[self.data_node_id]["non_pca_test"] = table_name
+            copy_to_sql(df=scaled_df, table_name=table_name, if_exists="replace")
+        else:
+            print(" Missing non_pca_scale_fit_obj or non_pca_scale_col in data transformation params.")