teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +193 -1
- teradataml/__init__.py +2 -1
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +25 -18
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
- teradataml/analytics/sqle/__init__.py +20 -2
- teradataml/analytics/utils.py +15 -1
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +341 -112
- teradataml/automl/autodataprep/__init__.py +471 -0
- teradataml/automl/data_preparation.py +84 -42
- teradataml/automl/data_transformation.py +69 -33
- teradataml/automl/feature_engineering.py +76 -9
- teradataml/automl/feature_exploration.py +639 -25
- teradataml/automl/model_training.py +35 -14
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/__init__.py +1 -2
- teradataml/common/constants.py +122 -63
- teradataml/common/messagecodes.py +14 -3
- teradataml/common/messages.py +8 -4
- teradataml/common/sqlbundle.py +40 -10
- teradataml/common/utils.py +366 -74
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +348 -86
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
- teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +3 -3
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +6 -6
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/teradataml_example.json +21 -0
- teradataml/data/textmorph_example.json +5 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/vectordistance_example.json +1 -1
- teradataml/dataframe/copy_to.py +45 -29
- teradataml/dataframe/data_transfer.py +72 -46
- teradataml/dataframe/dataframe.py +642 -166
- teradataml/dataframe/dataframe_utils.py +167 -22
- teradataml/dataframe/functions.py +135 -20
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +330 -78
- teradataml/dbutils/dbutils.py +556 -140
- teradataml/dbutils/filemgr.py +14 -10
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +168 -1013
- teradataml/opensource/_class.py +141 -17
- teradataml/opensource/{constants.py → _constants.py} +7 -3
- teradataml/opensource/_lightgbm.py +52 -53
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +5 -5
- teradataml/options/__init__.py +47 -15
- teradataml/options/configure.py +103 -26
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +307 -40
- teradataml/scriptmgmt/lls_utils.py +428 -145
- teradataml/store/__init__.py +2 -3
- teradataml/store/feature_store/feature_store.py +102 -7
- teradataml/table_operators/Apply.py +48 -19
- teradataml/table_operators/Script.py +23 -2
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/table_operator_util.py +58 -9
- teradataml/utils/dtypes.py +49 -1
- teradataml/utils/internal_buffer.py +38 -0
- teradataml/utils/validators.py +377 -62
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +200 -4
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +146 -112
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -0
- teradataml/store/vector_store/__init__.py +0 -1586
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
teradataml/automl/data_preparation.py

@@ -16,7 +16,6 @@
 # Python libraries
 import numpy as np
 import pandas as pd
-import random
 import time
 import warnings

@@ -30,11 +29,9 @@ from teradataml import UtilFuncs, TeradataConstants
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.messages import Messages, MessageCodes
 from teradataml.utils.validators import _Validators
-from teradataml import INTEGER
+from teradataml import configure, INTEGER
+from teradataml.common.constants import TeradataConstants

-# Control Randomnes
-random.seed(42)
-np.random.seed(42)

 class _DataPreparation:

@@ -117,6 +114,12 @@ class _DataPreparation:
                 session.
                 Default Value: False
                 Types: bool
+
+            seed:
+                Optional Argument.
+                Specifies the random seed for reproducibility.
+                Default Value: 42
+                Types: int
         """
         self.data = data
         self.target_column = target_column
@@ -127,14 +130,22 @@
         self.task_type = task_type
         self.volatile = kwargs.get("volatile", False)
         self.persist = kwargs.get("persist", False)
+        self.aml_phases = kwargs.get("automl_phases", None)

         # Setting default value for auto run mode
         self._data_sampling_method = "SMOTE"
         self._scale_method_reg = "STD"
         self._scale_method_cls = "RANGE"
-        self.table_name_mapping = {}

         self.data_types = {key: value for key, value in self.data._column_names_and_types}
+        self.seed = kwargs.get("seed", 42)
+        # np.random.seed() affects the random number generation in numpy and sklearn
+        # setting this changes the global state of the random number generator
+        # hence, setting the seed only if it is not None
+        if kwargs.get("seed") is not None:
+            np.random.seed(self.seed)
+
+        self.data_mapping = kwargs.get("data_mapping", {})


     def data_preparation(self,
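The hunks above drop the module-level random.seed(42) / np.random.seed(42) calls and instead seed NumPy only when the caller supplies a seed, since np.random.seed() mutates global RNG state shared by NumPy and scikit-learn. A minimal sketch of the two patterns (plain NumPy, independent of teradataml; make_sample is a hypothetical helper):

    import numpy as np

    # Old pattern: seeding at import time pins the global RNG for every caller.
    # np.random.seed(42)

    # New pattern, mirroring the diff: seed only when explicitly requested.
    def make_sample(size, seed=None):
        # Touch the global RNG state only when the caller asks for reproducibility.
        if seed is not None:
            np.random.seed(seed)
        return np.random.rand(size)

    print(make_sample(3, seed=42))   # reproducible across runs
    print(make_sample(3))            # left to whatever the current global state is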
@@ -158,7 +169,8 @@
             list of lists containing, feature selected by rfe, pca and lasso.
         """
         self._display_heading(phase=2,
-                              progress_bar=self.progress_bar
+                              progress_bar=self.progress_bar,
+                              automl_phases=self.aml_phases)
         self._display_msg(msg='Data preparation started ...',
                           progress_bar=self.progress_bar)
         # Setting user value in case of custom running mode
@@ -201,7 +213,7 @@
         self._feature_selection_PCA()
         self.progress_bar.update()

-        return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict
+        return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict, self.data_mapping

     def _handle_outliers(self,
                          auto):
@@ -262,25 +274,24 @@
             outlier_method = "Tukey"

         # List of columns for outlier processing.
-
+        # Excluding target column and excluded columns from outlier processing
+        outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns + ['id', self.target_column]]

-
-
-
-
-
-
-
-
-
-
-
-            if value
+        if len(outlier_columns) != 0:
+            # Detecting outlier percentage in each columns
+            outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
+
+            # Outlier Handling techniques
+            for i in outlier_percentage_df.itertuples():
+                # Column Name
+                col = i[0]
+                # Outlier value
+                value = i[1]
+                # Dropping rows
+                if value > 0.0 and value <= 8.0 :
                     columns_to_drop_rows.append(col)
-
-
-            elif value> 8.0 and value <= 25.0:
-                columns_to_impute.append(col)
+                elif value> 8.0 and value <= 25.0:
+                    columns_to_impute.append(col)

         return columns_to_drop_rows, columns_to_impute

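The rewritten _handle_outliers body buckets each column by its detected outlier percentage: up to 8% of outlier rows leads to dropping those rows, between 8% and 25% leads to median imputation, and anything higher is left untouched by this step. A standalone sketch of that bucketing (the thresholds come straight from the hunk; outlier_percentages is a hypothetical input):

    def bucket_outlier_columns(outlier_percentages):
        # outlier_percentages: dict of column name -> percent of outlier rows.
        # Thresholds mirror the diff: (0, 8] -> drop rows, (8, 25] -> impute median.
        columns_to_drop_rows, columns_to_impute = [], []
        for col, pct in outlier_percentages.items():
            if 0.0 < pct <= 8.0:
                columns_to_drop_rows.append(col)
            elif 8.0 < pct <= 25.0:
                columns_to_impute.append(col)
        return columns_to_drop_rows, columns_to_impute

    print(bucket_outlier_columns({"age": 2.5, "income": 12.0, "score": 40.0}))
    # (['age'], ['income'])  -- columns above 25% are not touched by this step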
@@ -347,6 +358,9 @@ class _DataPreparation:
         # Adding transformed data containing table to garbage collector
         GarbageCollector._add_to_garbagecollector(self.data._table_name)

+        # Returning outlier fit object to store in data mapping dictionary
+        return outlier_fit_out
+
     def _outlier_processing(self):
         """
         DESCRIPTION:
@@ -370,7 +384,10 @@
                               progress_bar=self.progress_bar)
             target_columns=columns_to_drop_rows
             replacement_strategy = "DELETE"
-            self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            fit_obj = self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            self.data_mapping['fit_outlier_delete_output'] = fit_obj.output_data._table_name
+            self.data_mapping['fit_outlier_delete_result'] = self.data._table_name
+            self.data_mapping['outlier_filtered_data'] = self.data._table_name
             self._display_msg(msg="Sample of dataset after removing outlier rows:",
                               data=self.data,
                               progress_bar=self.progress_bar)
@@ -382,7 +399,10 @@
                               progress_bar=self.progress_bar)
             target_columns=columns_to_impute
             replacement_strategy = "MEDIAN"
-            self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            fit_obj = self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            self.data_mapping['fit_outlier_impute_output'] = fit_obj.output_data._table_name
+            self.data_mapping['fit_outlier_impute_result'] = fit_obj.result._table_name
+            self.data_mapping['outlier_imputed_data'] = self.data._table_name
             self._display_msg(msg="Sample of dataset after performing MEDIAN inplace:",
                               data=self.data,
                               progress_bar=self.progress_bar)
@@ -438,7 +458,10 @@
             # Fetching replacement value
             replacement_value = transform_val["replacement_value"]
             # Performing outlier handling
-            self._outlier_handling(target_col, outlier_method, replacement_value)
+            fit_obj = self._outlier_handling(target_col, outlier_method, replacement_value)
+            self.data_mapping[f'fit_{target_col}_outlier_output'] = fit_obj.output_data._table_name
+            self.data_mapping[f'fit_{target_col}_outlier_result'] = fit_obj.result._table_name
+            self.data_mapping[f'{target_col}_outlier_treated_data'] = self.data._table_name
         else:
             self._display_msg(inline_msg="No information provided for feature transformation in outlier handling.",
                               progress_bar=self.progress_bar)
@@ -483,13 +506,13 @@
         start_time = time.time()

         # Temporary Pulling data for feature selection
-        pca_train = DataFrame.from_table(self.
+        pca_train = DataFrame.from_table(self.data_mapping['pca_train']).to_pandas()

         # Drop unnecessary columns and store the result
         train_data = pca_train.drop(columns=['id', self.target_column], axis=1)

         # Initialize and fit PCA
-        pca = PCA()
+        pca = PCA(random_state=self.seed)
         pca.fit(train_data)

         # Find the number of components for PCA
@@ -497,7 +520,7 @@
         n = np.argmax(np.cumsum(variance) >= 0.95) + 1

         # Create a new instance of PCA with the optimal number of components
-        pca = PCA(n_components=n, random_state=
+        pca = PCA(n_components=n, random_state=self.seed)

         # Apply PCA on dataset
         X_train_pca = pca.fit_transform(train_data)
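Both PCA hunks now pass random_state=self.seed, while the component count is still chosen as the smallest n whose cumulative explained variance reaches 95% (np.argmax(np.cumsum(variance) >= 0.95) + 1). A self-contained sketch of that selection on synthetic data:

    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(42)
    train_data = rng.normal(size=(200, 10))     # stand-in for the prepared feature matrix

    # First fit: inspect how much variance each component explains.
    pca = PCA(random_state=42)
    pca.fit(train_data)
    variance = pca.explained_variance_ratio_

    # Smallest number of components covering at least 95% of the variance.
    n = np.argmax(np.cumsum(variance) >= 0.95) + 1

    # Refit with the chosen component count and transform the data.
    pca = PCA(n_components=n, random_state=42)
    X_train_pca = pca.fit_transform(train_data)
    print(n, X_train_pca.shape)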
@@ -571,7 +594,7 @@

         # Random forest for RFE model
         RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
-        rf = RFModel(n_estimators=100, random_state=
+        rf = RFModel(n_estimators=100, random_state=self.seed)

         # Determine the scoring metric based on the number of unique classes
         score = 'r2' if not self.is_classification_type() \
@@ -665,10 +688,10 @@
                 scoring_metric = 'roc_auc'
             else:
                 scoring_metric = 'f1_macro'
-            estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=
+            estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=self.seed)
             parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
         else:
-            estimator = Lasso(random_state=
+            estimator = Lasso(random_state=self.seed)
             parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
             scoring_metric = "r2"

@@ -679,7 +702,7 @@

         # Applying hyperparameter tuning and optimizing score
         hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
-
+                                             scoring=scoring_metric, verbose=0)

         # Fitting the best result from hyperparameter
         hyperparameter_search.fit(train_features, train_target)
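The feature-selection estimators are likewise seeded with self.seed before being handed to GridSearchCV, which still refits the best configuration. A condensed sketch of the regression branch (Lasso) on synthetic data:

    import numpy as np
    from sklearn.linear_model import Lasso
    from sklearn.model_selection import GridSearchCV

    rng = np.random.default_rng(0)
    train_features = rng.normal(size=(120, 5))
    true_w = np.array([1.5, 0.0, -2.0, 0.0, 0.5])
    train_target = train_features @ true_w + rng.normal(scale=0.1, size=120)

    estimator = Lasso(random_state=42)          # seeded, as in the hunk above
    parameters = {"alpha": [0.0001, 0.001, 0.01, 0.1, 1.0], "max_iter": [100, 500]}

    # Grid search with cross-validation, optimizing R^2 and refitting the best model.
    search = GridSearchCV(estimator, parameters, cv=3, refit=True,
                          scoring="r2", verbose=0)
    search.fit(train_features, train_target)
    print(search.best_params_, round(search.best_score_, 3))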
@@ -746,14 +769,20 @@
         train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
                                                                table_type = TeradataConstants.TERADATA_TABLE,
                                                                gc_on_quit=not persist)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+        # table name in fully qualified format.
+        train_table_name = UtilFuncs._extract_table_name(train_table_name)
+
         # Storing the table names in the table name mapping dictionary
-        self.
+        self.data_mapping['{}_train'.format(prefix)] = train_table_name

+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
         # Pushing data into database
         if self.is_classification_type():
-            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+            copy_to_sql(df=data, table_name=train_table_name, temporary=is_temporary, if_exists="replace", types={f'{self.target_column}': INTEGER})
         else:
-            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace")
+            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", temporary=is_temporary)

     def _scaling_features_helper(self,
                                  data=None,
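This hunk, and the matching ones in data_transformation.py below, strip the database qualifier from generated table names with UtilFuncs._extract_table_name() and pass temporary=is_temporary to copy_to_sql(), so intermediate tables become volatile tables when configure.temp_object_type selects the "VT" option rather than being persisted. A hedged sketch of the public side of that pattern (assumes an already connected teradataml session; the frame and table name are illustrative):

    import pandas as pd
    from teradataml import copy_to_sql, configure

    df = pd.DataFrame({"id": [1, 2, 3], "target": [0, 1, 0]})

    # Mirror the diff: create a volatile table only when the VT option is active.
    # (The "VT" value comes from the comment in the hunk above; the internal code
    # compares against TeradataConstants.TERADATA_VOLATILE_TABLE.)
    is_temporary = configure.temp_object_type == "VT"

    copy_to_sql(df=df,
                table_name="automl_train_stage",   # illustrative name
                if_exists="replace",
                temporary=is_temporary)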
@@ -825,9 +854,9 @@

         # Loading data for feature scaling based of feature selection method
         if feature_selection_mtd == 'rfe':
-            data_to_scale = DataFrame(self.
+            data_to_scale = DataFrame(self.data_mapping['rfe_train'])
         elif feature_selection_mtd == 'lasso':
-            data_to_scale = DataFrame(self.
+            data_to_scale = DataFrame(self.data_mapping['lasso_train'])
         else:
             data_to_scale = self.data

@@ -850,12 +879,16 @@
                                 volatile=volatile,
                                 persist=persist)

+            self.data_mapping[f'fit_scale_{feature_selection_mtd}_output'] = fit_obj.output_data._table_name
+            self.data_mapping[f'fit_scale_{feature_selection_mtd}_result'] = fit_obj.output._table_name
+
             # storing the scale fit object and columns in data transformation dictionary
             self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj.output
             self.data_transform_dict['{}_scale_col'.format(feature_selection_mtd)] = scale_col

             # List of columns to copy to the output generated by scale transform
             accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
+

             # Scaling dataset
             transform_obj = ScaleTransform(data=data_to_scale,
@@ -867,6 +900,8 @@
                               data=scaled_df,
                               progress_bar=self.progress_bar)
         else:
+            # No columns to scale, Original data will be used
+            scaled_df = data_to_scale
             self._display_msg(msg="No columns to scale.",
                               progress_bar=self.progress_bar)

@@ -915,10 +950,16 @@
         # Assigning data to target dataframe
         target_df = self.data
         # Detecting list of float columns on target dataset
-        float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]
+        float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float", "decimal.Decimal"]]

         if len(float_columns) == 0:
-
+            cols = target_df.columns
+            # Doing reset index to get index column
+            df = target_df.to_pandas().reset_index()
+
+            # Returning the dataframe with cols
+            # to avoid extra columns generated by reset_index()
+            return df[cols]

         # storing the column details for round up in data transformation dictionary
         self.data_transform_dict["round_columns"] = float_columns
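When no float or decimal columns remain to be rounded, the method now pulls the data to pandas with reset_index() and re-projects it onto the original column list, so the index column is kept without the extra columns reset_index() can introduce. A tiny pandas illustration of that re-projection:

    import pandas as pd

    df = pd.DataFrame({"id": [10, 11], "label": [0, 1]}).set_index("id")
    cols = ["id", "label"]          # original column order expected downstream

    flat = df.reset_index()         # brings "id" back as a regular column
    print(flat[cols])               # keep only the expected columns, in order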
@@ -942,6 +983,7 @@
         fit_params["persist"] = False

         transform_output = RoundColumns(**fit_params).result
+        self.data_mapping['round_columns_data'] = transform_output._table_name
         if not self.volatile and not self.persist:
             # Adding transformed data containing table to garbage collector
             GarbageCollector._add_to_garbagecollector(transform_output._table_name)

teradataml/automl/data_transformation.py

@@ -15,6 +15,7 @@

 # Python libraries
 import pandas as pd
+import warnings

 # Teradata libraries
 from teradataml.dataframe.dataframe import DataFrame
@@ -31,8 +32,11 @@ from teradataml import ScaleTransform
 from teradataml import SimpleImputeTransform
 from teradataml import TargetEncodingTransform
 from teradataml import Transform, UtilFuncs, TeradataConstants
+from teradataml import execute_sql
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.hyperparameter_tuner.utils import _ProgressBar
+from teradataml.options.configure import configure
+from teradataml.common.constants import TeradataConstants

 # AutoML Internal libraries
 from teradataml.automl.feature_exploration import _FeatureExplore
@@ -219,11 +223,11 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         DESCRIPTION:
             Function drops irrelevent columns and adds id column.
         """
-        # Extracting
+        # Extracting irrelevant column list
         columns_to_be_removed = self.data_transformation_params.get("drop_irrelevent_columns", None)
         if columns_to_be_removed:
             self.data = self.data.drop(columns_to_be_removed, axis=1)
-            self._display_msg(msg="\nUpdated dataset after dropping
+            self._display_msg(msg="\nUpdated dataset after dropping irrelevant columns :",
                               data=self.data,
                               progress_bar=self.progress_bar)

@@ -465,6 +469,7 @@
         custom_target_encoding_ind = self.data_transformation_params.get("custom_target_encoding_ind", False)
         custom_target_encoding_fit_obj = self.data_transformation_params.get("custom_target_encoding_fit_obj", None)
         if custom_target_encoding_ind:
+            warn_cols = []
             for col, tar_fit_obj in custom_target_encoding_fit_obj.items():
                 # Extracting accumulate columns
                 accumulate_columns = self._extract_list(self.data.columns, [col])
@@ -480,6 +485,15 @@
                 self.data = TargetEncodingTransform(**transform_params).result
                 # Adding transformed data containing table to garbage collector
                 GarbageCollector._add_to_garbagecollector(self.data._table_name)
+                if self.data[self.data[col] == -1].shape[0] > 0:
+                    warn_cols.append(col)
+
+            # Checking for unseen values in target encoding columns
+            if len(warn_cols) > 0:
+                warnings.warn(message=f"Unseen categorical values found in test data column(s): {warn_cols}. \
+                              This may cause inaccurate predictions. Consider retraining the model with updated data.",
+                              stacklevel=0)
+
             self._display_msg(msg="\nUpdated dataset after performing customized categorical encoding :",
                               data=self.data,
                               progress_bar=self.progress_bar)
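The categorical-encoding path now collects the columns where the transformed test data contains -1, which this hunk treats as the marker for categories unseen during fit, and raises one consolidated warning. A small standalone sketch of that collect-then-warn pattern in pandas:

    import warnings
    import pandas as pd

    # Stand-in for the transformed test data; -1 marks an unseen category
    # (the -1 convention is taken from the hunk above).
    data = pd.DataFrame({"city": [0.3, -1, 0.7], "segment": [0.1, 0.2, 0.4]})

    warn_cols = [col for col in data.columns if (data[col] == -1).sum() > 0]

    if warn_cols:
        warnings.warn(f"Unseen categorical values found in test data column(s): {warn_cols}. "
                      "This may cause inaccurate predictions. Consider retraining the model "
                      "with updated data.")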
@@ -693,22 +707,28 @@
         lasso_scale_fit_obj = self.data_transformation_params.get("lasso_scale_fit_obj", None)
         lasso_scale_col = self.data_transformation_params.get("lasso_scale_col", None)
         # Extracting accumulate columns
-
-
-
-
-
-
-
-
-
+        if lasso_scale_fit_obj is not None:
+            accumulate_cols = self._extract_list(lasso_df.columns, lasso_scale_col)
+            # Scaling dataset
+            lasso_df = ScaleTransform(data=lasso_df,
+                                      object=lasso_scale_fit_obj,
+                                      accumulate=accumulate_cols).result
+            # Displaying scaled dataset
+            self._display_msg(msg="\nUpdated dataset after performing scaling on Lasso selected features :",
+                              data=lasso_df,
+                              progress_bar=self.progress_bar)

         # Uploading lasso dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="lasso_new_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+        # table name in fully qualified format.
+        table_name = UtilFuncs._extract_table_name(table_name)
         # Storing table name mapping for lasso dataset
         self.table_name_mapping[self.data_node_id]["lasso_new_test"] = table_name
-
+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+        copy_to_sql(df = lasso_df, table_name= table_name, if_exists="replace", temporary=is_temporary)

     def _feature_selection_rfe_transformation(self):
         """
@@ -730,23 +750,30 @@
         # Extracting fit object and columns for scaling
         rfe_scale_fit_obj = self.data_transformation_params.get("rfe_scale_fit_obj", None)
         rfe_scale_col = self.data_transformation_params.get("rfe_scale_col", None)
-
-
-
-
-
-
-
-
-
-
+
+        if rfe_scale_fit_obj is not None:
+            # Extracting accumulate columns
+            accumulate_cols = self._extract_list(rfe_df.columns, rfe_scale_col)
+            # Scaling on rfe dataset
+            rfe_df = ScaleTransform(data=rfe_df,
+                                    object=rfe_scale_fit_obj,
+                                    accumulate=accumulate_cols).result
+            # Displaying scaled dataset
+            self._display_msg(msg="\nUpdated dataset after performing scaling on RFE selected features :",
+                              data=rfe_df,
+                              progress_bar=self.progress_bar)

         # Uploading rfe dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="rfe_new_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+        # table name in fully qualified format.
+        table_name = UtilFuncs._extract_table_name(table_name)
         # Storing table name mapping for rfe dataset
         self.table_name_mapping[self.data_node_id]["rfe_new_test"] = table_name
-
+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+        copy_to_sql(df = rfe_df, table_name= table_name, if_exists="replace", temporary=is_temporary)

     def _feature_selection_pca_transformation(self):
         """
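The Lasso, RFE, and PCA paths now re-apply scaling only when a stored fit object is present, carrying the non-scaled columns through the accumulate argument. A hedged sketch of that ScaleFit/ScaleTransform pairing (assumes a connected teradataml session and an existing table; the table and column names are illustrative):

    from teradataml import DataFrame, ScaleFit, ScaleTransform

    tdf = DataFrame("my_training_table")        # illustrative table name

    # Fit once on training data; this plays the role of the stored *_scale_fit_obj.
    fit_obj = ScaleFit(data=tdf,
                       target_columns=["f1", "f2"],
                       scale_method="STD")

    # Later, re-apply the same scaling to new data, passing "id" through unchanged.
    scaled = ScaleTransform(data=tdf,
                            object=fit_obj.output,
                            accumulate=["id"]).result
    print(scaled.columns)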
@@ -758,17 +785,20 @@
         pca_scale_col = self.data_transformation_params.get("pca_scale_col", None)
         # Extracting accumulate columns
         accumulate_cols = self._extract_list(self.data.columns, pca_scale_col)
-
-        pca_scaled_df =
-
-
-
-
-
-
+
+        pca_scaled_df = self.data
+        if pca_scale_fit_obj is not None:
+            # Scaling on pca dataset
+            pca_scaled_df = ScaleTransform(data=self.data,
+                                           object=pca_scale_fit_obj,
+                                           accumulate=accumulate_cols).result
+            # Displaying scaled dataset
+            self._display_msg(msg="\nUpdated dataset after performing scaling for PCA feature selection :",
+                              data=pca_scaled_df,
+                              progress_bar=self.progress_bar)

         # Convert to pandas dataframe for applying pca
-        pca_scaled_pd = pca_scaled_df.to_pandas()
+        pca_scaled_pd = pca_scaled_df.to_pandas().reset_index()
         # Extracting pca fit instance for applying pca
         pca_fit_instance = self.data_transformation_params.get("pca_fit_instance", None)
         # Extracting columns for applying pca
@@ -804,6 +834,12 @@
         # Uploading pca dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="pca_new_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+        # table name in fully qualified format.
+        table_name = UtilFuncs._extract_table_name(table_name)
         # Storing table name mapping for pca dataset
         self.table_name_mapping[self.data_node_id]["pca_new_test"] = table_name
-
+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+        copy_to_sql(df = pca_df, table_name=table_name, if_exists="replace", temporary=is_temporary)
+