teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +315 -2
- teradataml/__init__.py +4 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +95 -8
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/metadata.py +12 -3
- teradataml/analytics/json_parser/utils.py +7 -2
- teradataml/analytics/sqle/__init__.py +5 -1
- teradataml/analytics/table_operator/__init__.py +1 -1
- teradataml/analytics/uaf/__init__.py +1 -1
- teradataml/analytics/utils.py +4 -0
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +51 -6
- teradataml/automl/data_preparation.py +59 -35
- teradataml/automl/data_transformation.py +58 -33
- teradataml/automl/feature_engineering.py +27 -12
- teradataml/automl/model_training.py +73 -46
- teradataml/common/constants.py +88 -29
- teradataml/common/garbagecollector.py +2 -1
- teradataml/common/messagecodes.py +19 -3
- teradataml/common/messages.py +6 -1
- teradataml/common/sqlbundle.py +64 -12
- teradataml/common/utils.py +246 -47
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +161 -27
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/dataframe_example.json +18 -2
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +34 -6
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -2
- teradataml/data/teradataml_example.json +8 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +11 -1
- teradataml/dataframe/dataframe.py +1049 -285
- teradataml/dataframe/dataframe_utils.py +152 -20
- teradataml/dataframe/functions.py +578 -35
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +185 -16
- teradataml/dbutils/dbutils.py +1049 -115
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/_base.py +1466 -0
- teradataml/opensource/_class.py +464 -0
- teradataml/opensource/{sklearn/constants.py → _constants.py} +21 -14
- teradataml/opensource/_lightgbm.py +949 -0
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +5 -6
- teradataml/options/__init__.py +54 -38
- teradataml/options/configure.py +131 -27
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +5 -5
- teradataml/scriptmgmt/lls_utils.py +130 -40
- teradataml/store/__init__.py +12 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2318 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/table_operators/Apply.py +32 -18
- teradataml/table_operators/Script.py +3 -1
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +37 -38
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/utils/dtypes.py +51 -2
- teradataml/utils/internal_buffer.py +18 -0
- teradataml/utils/validators.py +99 -8
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +321 -5
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +121 -94
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_class.py +0 -255
- teradataml/opensource/sklearn/_sklearn_wrapper.py +0 -1800
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
teradataml/analytics/uaf/__init__.py
CHANGED
@@ -73,7 +73,7 @@ for func in _uaf_functions:
                            "__doc__": _AnalyticFunction.__doc__,
                            "__dir__": _common_dir})
 
-_stored_procedure = ['FilterFactory1d']
+_stored_procedure = ['CopyArt', 'FilterFactory1d']
 
 for func in _stored_procedure:
    globals()[func] = type("{}".format(func), (_AnalyticFunction,),
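With CopyArt added to the dynamically registered stored procedures, it becomes importable from the teradataml namespace like the other UAF functions. A hedged usage sketch; the argument names are taken from the FuncSpecialCaseHandler entry in the next hunk, and the exact generated signature is an assumption, not confirmed by this diff:

    from teradataml import create_context, CopyArt

    # Hypothetical connection details.
    create_context(host="tdhost", username="alice", password="***")

    # Copy table "sales" in database "src_db" onto map "TD_MAP1" as a permanent table.
    CopyArt(database_name="src_db",
            table_name="sales",
            map_name="TD_MAP1",
            permanent_table=True)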
teradataml/analytics/utils.py
CHANGED
@@ -441,6 +441,10 @@ class FuncSpecialCaseHandler():
                         "filter_type": self._single_quote_arg,
                         "window_type": self._single_quote_arg,
                         "filter_description": self._single_quote_arg},
+            "CopyArt": {"database_name": self._single_quote_arg,
+                        "table_name": self._single_quote_arg,
+                        "map_name": self._single_quote_arg,
+                        "permanent_table": self._single_quote_arg},
             "DWT": {"wavelet": self._single_quote_arg},
             "IDWT": {"part": self._single_quote_arg,
                      "wavelet": self._single_quote_arg,
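Every argument listed here is routed through a handler that emits its value as a single-quoted SQL literal in the generated call. A minimal sketch of what a handler like _single_quote_arg plausibly does; the real implementation is internal and not shown in this diff:

    def single_quote_arg(value):
        # Render the value as a SQL string literal, escaping embedded quotes.
        return "'{}'".format(str(value).replace("'", "''"))

    assert single_quote_arg("TD_MAP1") == "'TD_MAP1'"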
teradataml/analytics/valib.py
CHANGED
@@ -26,6 +26,8 @@ from teradataml.dataframe.dataframe import DataFrame, in_schema
 from teradataml.utils.validators import _Validators
 from teradataml.analytics.Transformations import Binning, Derive, OneHotEncoder, FillNa, \
     LabelEncoder, MinMaxScalar, Retain, Sigmoid, ZScore
+from teradataml.common.constants import TeradataReservedKeywords, TeradataConstants
+
 
 class _VALIB():
     """ An internal class for executing VALIB analytic functions. """
@@ -370,9 +372,16 @@ class _VALIB():
             self.__get_temp_table_name()
         """
         prefix = "valib_{}".format(self.__tdml_valib_name.lower())
-
-
-
+        tbl_name = UtilFuncs._generate_temp_table_name(prefix=prefix, use_default_database=True,
+                                                       gc_on_quit=True, quote=False,
+                                                       table_type=TeradataConstants.TERADATA_TABLE)
+        # With the VT option, the table name is generated with a 'vt_' prefix,
+        # but the object is not created as a volatile table. Hence it is
+        # explicitly garbage collected.
+        if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+            GarbageCollector._add_to_garbagecollector(tbl_name,
+                                                      TeradataConstants.TERADATA_TABLE)
+        return tbl_name
 
     def __process_dyn_cls_output_member(self, arg_name, out_tablename, out_var=None):
         """
@@ -447,6 +456,7 @@ class _VALIB():
         # Add extension to the table name.
         generated_table_name = "{}{}".format(table_name, extension)
 
+
         # Register new output table to the GC.
         gc_tabname = "\"{}\".\"{}\"".format(self.__db_name, generated_table_name)
         GarbageCollector._add_to_garbagecollector(gc_tabname, TeradataConstants.TERADATA_TABLE)
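The volatile-table branch above keys off configure.temp_object_type. A sketch of how a session opts in; the "VT" value comes from the AutoML notes elsewhere in this diff, and the comparison against TeradataConstants.TERADATA_VOLATILE_TABLE happens inside teradataml:

    from teradataml import configure

    # Ask teradataml to create its internal temporary objects as volatile tables.
    # Per the notes added in this release, "VT" mode runs AutoML work sequentially.
    configure.temp_object_type = "VT"

In VT mode the generated name gets a 'vt_' prefix but, as the comment in the hunk explains, the object may still land as a permanent table, which is why it is registered with the garbage collector explicitly.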
@@ -1463,7 +1473,7 @@ class _VALIB():
         if gen_sql_only:
             valib_inst.__generate_valib_sql_argument_syntax(arg=str(gen_sql_only),
                                                             arg_name="gensqlonly")
-
+        charset = kwargs.pop("charset", None)
         # Raise error if there are additional arguments.
         if len(kwargs) != 0:
             err_ = "The keyword arguments for Overlap() should have data1, data2, ..., dataN " \
@@ -1478,6 +1488,10 @@ class _VALIB():
                                                         arg_name="tablename")
         valib_inst.__generate_valib_sql_argument_syntax(arg=",".join(column_names_df),
                                                         arg_name="columns")
+        # Generate clause of charset.
+        if charset:
+            valib_inst.__generate_valib_sql_argument_syntax(arg=charset,
+                                                            arg_name="charset")
 
         return valib_inst._execute_valib_function(skip_data_arg_processing=True,
                                                   skip_other_arg_processing=True)
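Overlap() therefore now forwards an optional charset keyword into the generated VALIB call. A usage sketch, assuming the public valib entry point and hypothetical table names; only the kwargs plumbing is confirmed by this hunk:

    from teradataml import DataFrame, valib

    df1 = DataFrame("customer_2023")   # hypothetical table
    df2 = DataFrame("customer_2024")   # hypothetical table

    # charset is popped from kwargs and, when set, emitted as a charset clause.
    overlap = valib.Overlap(data1=df1, data2=df2, columns="custid",
                            charset="UNICODE")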
teradataml/automl/__init__.py
CHANGED
@@ -30,7 +30,7 @@ from teradataml import ColumnExpression
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml.utils.utils import execute_sql
 from teradataml.utils.validators import _Validators
-from teradataml import ROC, BLOB
+from teradataml import ROC, BLOB, VARCHAR
 from teradataml.utils.dtypes import _Dtypes
 from teradataml.common.utils import UtilFuncs
 from teradataml import TeradataMlException
@@ -94,6 +94,9 @@ class AutoML:
         the processes by passing the JSON file path in case of custom run. It also
         supports early stopping of model training based on stopping metrics,
         maximum running time and maximum models to be trained.
+        Note:
+            * configure.temp_object_type="VT" follows sequential execution.
+
 
     PARAMETERS:
         task_type:
@@ -187,6 +190,12 @@ class AutoML:
             session.
             Default Value: False
            Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
 
     RETURNS:
         Instance of AutoML.
@@ -417,9 +426,11 @@ class AutoML:
 
         volatile = kwargs.get('volatile', False)
         persist = kwargs.get('persist', False)
+        seed = kwargs.get('seed', 42)
 
         arg_info_matrix.append(["volatile", volatile, True, (bool)])
         arg_info_matrix.append(["persist", persist, True, (bool)])
+        arg_info_matrix.append(["seed", seed, True, (int)])
 
         # Validate argument types
         _Validators._validate_function_arguments(arg_info_matrix)
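The new seed keyword rides through AutoML's kwargs into the data-preparation stage, where it drives the numpy and scikit-learn randomness used for sampling and feature selection. A hedged usage sketch (table and column names are hypothetical):

    from teradataml import AutoML, DataFrame

    train = DataFrame("titanic_train")          # hypothetical table

    # seed defaults to 42; pinning it makes two runs comparable.
    aml = AutoML(task_type="Classification", verbose=1, seed=7)
    aml.fit(train, "survived")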
@@ -517,7 +528,7 @@ class AutoML:
 
         # Validate argument types
         _Validators._validate_function_arguments(arg_info_fit_matrix)
-
+
         # Initializing class variables
         self.data = data
         self.target_column = target_column
@@ -758,11 +769,12 @@ class AutoML:
         if self.target_column_ind:
             prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
             probability_column = 'prob_1'
+            pred_target_count = pred.result.drop_duplicate(self.target_column).size
             # Displaying confusion matrix and ROC-AUC for classification problem
             if self.is_classification_type():
                 print_data = lambda data: print(data) if _is_terminal() else display(data)
                 # Displaying ROC-AUC for binary classification
-                if self.target_count == 2:
+                if self.target_count == 2 and pred_target_count == 2:
                     fit_params = {
                         "probability_column" : probability_column,
                         "observation_column" : self.target_column,
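The added pred_target_count guard skips the ROC display when the scored data happens to contain only one observed class, for which a binary ROC curve is undefined. A sketch of the call this protects, built from the fit_params visible above; positive_class and the output attribute are assumptions:

    from teradataml import ROC

    # Only meaningful when both classes occur in the predictions.
    roc_out = ROC(data=pred.result,
                  probability_column="prob_1",
                  observation_column=target_column,
                  positive_class="1")
    print(roc_out.result)   # AUC output (attribute name assumed)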
@@ -886,8 +898,8 @@ class AutoML:
         # as it is required for evaluation.
         if self.target_column not in data.columns:
             raise TeradataMlException(
-
-
+                Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
+                MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
 
         # Checking if data is already transformed before or not
         data_node_id = data._nodeid
@@ -1234,6 +1246,8 @@ class AutoML:
             pca.n_components_ = load_pca_info['n_components']
             pca.noise_variance_ = load_pca_info['noise_variance']
             pca.singular_values_ = np.array(load_pca_info['singular_values'])
+            pca.feature_names_in_ = data_params['pca_fit_columns']
+            pca.n_features_in_ = len(data_params['pca_fit_columns'])
 
             data_params['pca_fit_instance'] = pca
 
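These two attributes complete the round trip of a fitted scikit-learn PCA that AutoML serializes with a deployed model: recent scikit-learn versions consult n_features_in_/feature_names_in_ at transform time, so they must be restored along with the components. A minimal sketch of the same idea; the key names mirror the ones above, but the full persisted set is larger:

    import numpy as np
    from sklearn.decomposition import PCA

    def rebuild_pca(saved):
        # 'saved' holds JSON-serializable attributes captured at fit time.
        pca = PCA()
        pca.components_ = np.array(saved['components'])
        pca.mean_ = np.array(saved['mean'])
        pca.explained_variance_ = np.array(saved['explained_variance'])
        pca.n_components_ = saved['n_components']
        pca.noise_variance_ = saved['noise_variance']
        pca.singular_values_ = np.array(saved['singular_values'])
        pca.feature_names_in_ = np.array(saved['pca_fit_columns'])
        pca.n_features_in_ = len(saved['pca_fit_columns'])
        return pca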
@@ -1442,7 +1456,8 @@ class AutoML:
         # Saving data transformation parameters to the specified table
         sv_models = pd.concat([sv_models, df], ignore_index=True, sort=False)
 
-        copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB})
+        copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB,
+                                                                                       'PARAMETERS':VARCHAR(length=32000, charset='UNICODE')})
 
         print('Model Deployment Completed Successfully.')
 
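Pinning PARAMETERS to a wide UNICODE VARCHAR keeps long hyperparameter strings from being truncated by default type inference during deployment. The same override in isolation, with hypothetical data:

    import pandas as pd
    from teradataml import copy_to_sql, BLOB, VARCHAR

    models = pd.DataFrame({"MODEL_ID": [1],
                           "PARAMETERS": ['{"max_depth": 8, "eta": 0.3}'],
                           "DATA_PARAMS": [b"\x00\x01"]})

    # types= forces the Teradata type per column instead of relying on inference.
    copy_to_sql(df=models, table_name="automl_models_demo", if_exists="replace",
                types={"DATA_PARAMS": BLOB,
                       "PARAMETERS": VARCHAR(length=32000, charset="UNICODE")})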
@@ -1945,6 +1960,12 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
             Default Value: False
             Types: bool
 
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
+
     RETURNS:
         a tuple containing, model information and leaderboard.
     """
@@ -2103,6 +2124,12 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
 
     RETURNS:
         a tuple containing, model information and leaderboard.
@@ -2324,6 +2351,9 @@ class AutoRegressor(AutoML):
     """
     DESCRIPTION:
         AutoRegressor is a special purpose AutoML feature to run regression specific tasks.
+        Note:
+            * configure.temp_object_type="VT" follows sequential execution.
+
 
     PARAMETERS:
         include:
@@ -2407,6 +2437,12 @@ class AutoRegressor(AutoML):
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
 
     RETURNS:
         Instance of AutoRegressor.
@@ -2555,6 +2591,9 @@ class AutoClassifier(AutoML):
     """
     DESCRIPTION:
         AutoClassifier is a special purpose AutoML feature to run classification specific tasks.
+        Note:
+            * configure.temp_object_type="VT" follows sequential execution.
+
 
     PARAMETERS:
         include:
@@ -2638,6 +2677,12 @@ class AutoClassifier(AutoML):
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
 
     RETURNS:
         Instance of AutoClassifier.
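Both convenience wrappers inherit the new argument, so a classification-only run can be pinned the same way (table and column names hypothetical):

    from teradataml import AutoClassifier, DataFrame

    train = DataFrame("churn_train")            # hypothetical table
    clf = AutoClassifier(include=["xgboost", "glm"], verbose=1, seed=42)
    clf.fit(train, "churn_flag")
    clf.leaderboard()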
teradataml/automl/data_preparation.py
CHANGED
@@ -16,7 +16,6 @@
 # Python libraries
 import numpy as np
 import pandas as pd
-import random
 import time
 import warnings
 
@@ -30,11 +29,9 @@ from teradataml import UtilFuncs, TeradataConstants
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.messages import Messages, MessageCodes
 from teradataml.utils.validators import _Validators
-from teradataml import INTEGER
+from teradataml import configure, INTEGER
+from teradataml.common.constants import TeradataConstants
 
-# Control Randomnes
-random.seed(42)
-np.random.seed(42)
 
 class _DataPreparation:
 
@@ -117,6 +114,12 @@ class _DataPreparation:
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
         """
         self.data = data
         self.target_column = target_column
@@ -135,7 +138,13 @@ class _DataPreparation:
         self.table_name_mapping = {}
 
         self.data_types = {key: value for key, value in self.data._column_names_and_types}
-
+        self.seed = kwargs.get("seed", 42)
+        # np.random.seed() affects the random number generation in numpy and sklearn;
+        # setting it changes the global state of the random number generator,
+        # hence the seed is set only if it is not None.
+        if kwargs.get("seed") is not None:
+            np.random.seed(self.seed)
+
 
     def data_preparation(self,
                          auto = True):
@@ -262,25 +271,24 @@ class _DataPreparation:
             outlier_method = "Tukey"
 
         # List of columns for outlier processing.
-
+        # Excluding target column and excluded columns from outlier processing
+        outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns + ['id', self.target_column]]
 
-
-
-
-
-
-
-
-
-
-
-            if value
+        if len(outlier_columns) != 0:
+            # Detecting outlier percentage in each columns
+            outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
+
+            # Outlier Handling techniques
+            for i in outlier_percentage_df.itertuples():
+                # Column Name
+                col = i[0]
+                # Outlier value
+                value = i[1]
+                # Dropping rows
+                if value > 0.0 and value <= 8.0 :
                     columns_to_drop_rows.append(col)
-
-
-            elif value> 8.0 and value <= 25.0:
-                columns_to_impute.append(col)
+                elif value> 8.0 and value <= 25.0:
+                    columns_to_impute.append(col)
 
         return columns_to_drop_rows, columns_to_impute
 
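The thresholds encode a simple policy: a column with up to 8% outliers has the offending rows dropped, one between 8% and 25% is imputed instead, and anything above 25% is left alone. A toy illustration of the bucketing with hypothetical percentages:

    # Per-column outlier percentages, as _outlier_detection might report them.
    outlier_pct = {"age": 2.5, "fare": 14.0, "deck": 40.0}

    drop_rows, impute = [], []
    for col, pct in outlier_pct.items():
        if 0.0 < pct <= 8.0:
            drop_rows.append(col)   # few outliers: drop the rows
        elif 8.0 < pct <= 25.0:
            impute.append(col)      # moderate: impute values instead
        # above 25%: leave the column untouched

    print(drop_rows, impute)        # ['age'] ['fare']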
@@ -465,7 +473,7 @@ class _DataPreparation:
         RETURNS:
             int, number of folds to be used for cross-validation.
         """
-        num_of_folds = lambda rows:
+        num_of_folds = lambda rows: 2 if rows > 20000 else (4 if 1000 < rows <= 20000 else 10)
         return num_of_folds(rows)
 
     def _feature_selection_PCA(self):
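The restored heuristic trades validation folds for speed as data grows: up to 1000 rows get 10-fold cross-validation, mid-size tables 4-fold, and anything past 20000 rows only 2-fold:

    num_of_folds = lambda rows: 2 if rows > 20000 else (4 if 1000 < rows <= 20000 else 10)

    assert num_of_folds(500) == 10       # small data: more folds, steadier estimate
    assert num_of_folds(5_000) == 4      # mid-size: balanced
    assert num_of_folds(100_000) == 2    # large: keep tuning cheap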
@@ -489,7 +497,7 @@ class _DataPreparation:
         train_data = pca_train.drop(columns=['id', self.target_column], axis=1)
 
         # Initialize and fit PCA
-        pca = PCA()
+        pca = PCA(random_state=self.seed)
         pca.fit(train_data)
 
         # Find the number of components for PCA
@@ -497,7 +505,7 @@ class _DataPreparation:
         n = np.argmax(np.cumsum(variance) >= 0.95) + 1
 
         # Create a new instance of PCA with the optimal number of components
-        pca = PCA(n_components=n, random_state=
+        pca = PCA(n_components=n, random_state=self.seed)
 
         # Apply PCA on dataset
         X_train_pca = pca.fit_transform(train_data)
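Here n is the smallest number of principal components whose cumulative explained-variance ratio reaches 95%. A self-contained illustration of that selection rule on synthetic data:

    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 10))
    X[:, 5:] *= 0.05                 # last five columns carry almost no variance

    variance = PCA(random_state=42).fit(X).explained_variance_ratio_
    # First index where the running total clears 0.95, plus one.
    n = np.argmax(np.cumsum(variance) >= 0.95) + 1
    print(n)                         # 5 for this synthetic data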
@@ -571,7 +579,7 @@ class _DataPreparation:
 
         # Random forest for RFE model
         RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
-        rf = RFModel(n_estimators=100, random_state=
+        rf = RFModel(n_estimators=100, random_state=self.seed)
 
         # Determine the scoring metric based on the number of unique classes
         score = 'r2' if not self.is_classification_type() \
@@ -665,10 +673,10 @@ class _DataPreparation:
                 scoring_metric = 'roc_auc'
             else:
                 scoring_metric = 'f1_macro'
-            estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=
+            estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=self.seed)
             parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
         else:
-            estimator = Lasso(random_state=
+            estimator = Lasso(random_state=self.seed)
             parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
             scoring_metric = "r2"
 
@@ -679,7 +687,7 @@ class _DataPreparation:
 
         # Applying hyperparameter tuning and optimizing score
         hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
-
+                                             scoring=scoring_metric, verbose=0)
 
         # Fitting the best result from hyperparameter
         hyperparameter_search.fit(train_features, train_target)
@@ -746,14 +754,20 @@ class _DataPreparation:
         train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
                                                                table_type = TeradataConstants.TERADATA_TABLE,
                                                                gc_on_quit=not persist)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() returns the
+        # table name in fully qualified format.
+        train_table_name = UtilFuncs._extract_table_name(train_table_name)
+
         # Storing the table names in the table name mapping dictionary
         self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
 
+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
         # Pushing data into database
         if self.is_classification_type():
-            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+            copy_to_sql(df=data, table_name=train_table_name, temporary=is_temporary, if_exists="replace", types={f'{self.target_column}': INTEGER})
         else:
-            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace")
+            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", temporary=is_temporary)
 
     def _scaling_features_helper(self,
                                  data=None,
@@ -783,7 +797,8 @@ class _DataPreparation:
         for col in data.columns:
             # Selecting columns that will be scaled
             # Excluding target_col and columns with single value
-            if col not in ['id', self.target_column] and
+            if col not in ['id', self.target_column] and \
+                data.drop_duplicate(col).size > 1:
                 columns_to_scale.append(col)
 
         if feature_selection_mtd == "lasso":
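data.drop_duplicate(col).size counts a column's distinct values in-database, mirroring how the diff itself uses it; a result of 1 flags a constant column that scaling would only reduce to zeros. In isolation (table name hypothetical):

    from teradataml import DataFrame

    df = DataFrame("sensor_readings")           # hypothetical table

    # Distinct values in 'status', computed without pulling rows to the client.
    if df.drop_duplicate("status").size > 1:
        print("'status' varies, safe to scale")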
@@ -855,6 +870,7 @@ class _DataPreparation:
 
         # List of columns to copy to the output generated by scale transform
         accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
+
 
         # Scaling dataset
         transform_obj = ScaleTransform(data=data_to_scale,
@@ -866,6 +882,8 @@ class _DataPreparation:
                               data=scaled_df,
                               progress_bar=self.progress_bar)
         else:
+            # No columns to scale, Original data will be used
+            scaled_df = data_to_scale
             self._display_msg(msg="No columns to scale.",
                               progress_bar=self.progress_bar)
 
@@ -914,10 +932,16 @@ class _DataPreparation:
         # Assigning data to target dataframe
         target_df = self.data
         # Detecting list of float columns on target dataset
-        float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]
+        float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float", "decimal.Decimal"]]
 
         if len(float_columns) == 0:
-
+            cols = target_df.columns
+            # Doing reset index to get index column
+            df = target_df.to_pandas().reset_index()
+
+            # Returning the dataframe with cols
+            # to avoid extra columns generated by reset_index()
+            return df[cols]
 
         # storing the column details for round up in data transformation dictionary
         self.data_transform_dict["round_columns"] = float_columns
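to_pandas() moves the Teradata index column (typically the generated 'id') into the pandas index, so reset_index() turns it back into a regular column before the frame is returned; reselecting the original column list drops anything extra reset_index may add. In isolation (table name hypothetical):

    from teradataml import DataFrame

    tdf = DataFrame("prepared_train")     # hypothetical table
    cols = tdf.columns
    pdf = tdf.to_pandas().reset_index()   # index column becomes a real column
    pdf = pdf[cols]                       # keep exactly the original columns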
teradataml/automl/data_transformation.py
CHANGED
@@ -31,8 +31,11 @@ from teradataml import ScaleTransform
 from teradataml import SimpleImputeTransform
 from teradataml import TargetEncodingTransform
 from teradataml import Transform, UtilFuncs, TeradataConstants
+from teradataml import execute_sql
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.hyperparameter_tuner.utils import _ProgressBar
+from teradataml.options.configure import configure
+from teradataml.common.constants import TeradataConstants
 
 # AutoML Internal libraries
 from teradataml.automl.feature_exploration import _FeatureExplore
@@ -219,11 +222,11 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
     DESCRIPTION:
         Function drops irrelevant columns and adds id column.
     """
-    # Extracting
+    # Extracting irrelevant column list
     columns_to_be_removed = self.data_transformation_params.get("drop_irrelevent_columns", None)
     if columns_to_be_removed:
         self.data = self.data.drop(columns_to_be_removed, axis=1)
-        self._display_msg(msg="\nUpdated dataset after dropping
+        self._display_msg(msg="\nUpdated dataset after dropping irrelevant columns :",
                           data=self.data,
                           progress_bar=self.progress_bar)
 
@@ -693,22 +696,28 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         lasso_scale_fit_obj = self.data_transformation_params.get("lasso_scale_fit_obj", None)
         lasso_scale_col = self.data_transformation_params.get("lasso_scale_col", None)
         # Extracting accumulate columns
-
-
-
-
-
-
-
-
-
+        if lasso_scale_fit_obj is not None:
+            accumulate_cols = self._extract_list(lasso_df.columns, lasso_scale_col)
+            # Scaling dataset
+            lasso_df = ScaleTransform(data=lasso_df,
+                                      object=lasso_scale_fit_obj,
+                                      accumulate=accumulate_cols).result
+            # Displaying scaled dataset
+            self._display_msg(msg="\nUpdated dataset after performing scaling on Lasso selected features :",
+                              data=lasso_df,
+                              progress_bar=self.progress_bar)
 
         # Uploading lasso dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="lasso_new_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() returns the
+        # table name in fully qualified format.
+        table_name = UtilFuncs._extract_table_name(table_name)
         # Storing table name mapping for lasso dataset
         self.table_name_mapping[self.data_node_id]["lasso_new_test"] = table_name
-
+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+        copy_to_sql(df = lasso_df, table_name= table_name, if_exists="replace", temporary=is_temporary)
 
     def _feature_selection_rfe_transformation(self):
         """
@@ -730,23 +739,30 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         # Extracting fit object and columns for scaling
         rfe_scale_fit_obj = self.data_transformation_params.get("rfe_scale_fit_obj", None)
         rfe_scale_col = self.data_transformation_params.get("rfe_scale_col", None)
-
-
-
-
-
-
-
-
-
-
+
+        if rfe_scale_fit_obj is not None:
+            # Extracting accumulate columns
+            accumulate_cols = self._extract_list(rfe_df.columns, rfe_scale_col)
+            # Scaling on rfe dataset
+            rfe_df = ScaleTransform(data=rfe_df,
+                                    object=rfe_scale_fit_obj,
+                                    accumulate=accumulate_cols).result
+            # Displaying scaled dataset
+            self._display_msg(msg="\nUpdated dataset after performing scaling on RFE selected features :",
+                              data=rfe_df,
+                              progress_bar=self.progress_bar)
 
         # Uploading rfe dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="rfe_new_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() returns the
+        # table name in fully qualified format.
+        table_name = UtilFuncs._extract_table_name(table_name)
         # Storing table name mapping for rfe dataset
         self.table_name_mapping[self.data_node_id]["rfe_new_test"] = table_name
-
+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+        copy_to_sql(df = rfe_df, table_name= table_name, if_exists="replace", temporary=is_temporary)
 
     def _feature_selection_pca_transformation(self):
         """
@@ -758,17 +774,20 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         pca_scale_col = self.data_transformation_params.get("pca_scale_col", None)
         # Extracting accumulate columns
         accumulate_cols = self._extract_list(self.data.columns, pca_scale_col)
-
-        pca_scaled_df =
-
-
-
-
-
+
+        pca_scaled_df = self.data
+        if pca_scale_fit_obj is not None:
+            # Scaling on pca dataset
+            pca_scaled_df = ScaleTransform(data=self.data,
+                                           object=pca_scale_fit_obj,
+                                           accumulate=accumulate_cols).result
+            # Displaying scaled dataset
+            self._display_msg(msg="\nUpdated dataset after performing scaling for PCA feature selection :",
+                              data=pca_scaled_df,
+                              progress_bar=self.progress_bar)
 
         # Convert to pandas dataframe for applying pca
-        pca_scaled_pd = pca_scaled_df.to_pandas()
+        pca_scaled_pd = pca_scaled_df.to_pandas().reset_index()
         # Extracting pca fit instance for applying pca
         pca_fit_instance = self.data_transformation_params.get("pca_fit_instance", None)
         # Extracting columns for applying pca
@@ -804,6 +823,12 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         # Uploading pca dataset to table for further use
         table_name = UtilFuncs._generate_temp_table_name(prefix="pca_new_test",
                                                          table_type = TeradataConstants.TERADATA_TABLE)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() returns the
+        # table name in fully qualified format.
+        table_name = UtilFuncs._extract_table_name(table_name)
         # Storing table name mapping for pca dataset
         self.table_name_mapping[self.data_node_id]["pca_new_test"] = table_name
-
+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+        copy_to_sql(df = pca_df, table_name=table_name, if_exists="replace", temporary=is_temporary)
+
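The same three-step pattern recurs across the lasso, RFE, and PCA paths in this release: generate a garbage-collected temp name, strip the database qualifier that VT mode prepends, then let copy_to_sql create a volatile table only when the session opted into VT. A condensed sketch with hypothetical names; the real code compares against TeradataConstants.TERADATA_VOLATILE_TABLE rather than the raw string:

    from teradataml import configure, copy_to_sql

    def persist_stage_output(df, qualified_name):
        # VT mode hands back a name like '"mydb"."vt_stage_1"'; keep the table part.
        table_name = qualified_name.split(".")[-1].strip('"')
        # Create a volatile table only when the session asked for VT objects.
        is_temporary = configure.temp_object_type == "VT"
        copy_to_sql(df=df, table_name=table_name, if_exists="replace",
                    temporary=is_temporary)
        return table_name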