teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +315 -2
- teradataml/__init__.py +4 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +95 -8
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/metadata.py +12 -3
- teradataml/analytics/json_parser/utils.py +7 -2
- teradataml/analytics/sqle/__init__.py +5 -1
- teradataml/analytics/table_operator/__init__.py +1 -1
- teradataml/analytics/uaf/__init__.py +1 -1
- teradataml/analytics/utils.py +4 -0
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +51 -6
- teradataml/automl/data_preparation.py +59 -35
- teradataml/automl/data_transformation.py +58 -33
- teradataml/automl/feature_engineering.py +27 -12
- teradataml/automl/model_training.py +73 -46
- teradataml/common/constants.py +88 -29
- teradataml/common/garbagecollector.py +2 -1
- teradataml/common/messagecodes.py +19 -3
- teradataml/common/messages.py +6 -1
- teradataml/common/sqlbundle.py +64 -12
- teradataml/common/utils.py +246 -47
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +161 -27
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/dataframe_example.json +18 -2
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +34 -6
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -2
- teradataml/data/teradataml_example.json +8 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +11 -1
- teradataml/dataframe/dataframe.py +1049 -285
- teradataml/dataframe/dataframe_utils.py +152 -20
- teradataml/dataframe/functions.py +578 -35
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +185 -16
- teradataml/dbutils/dbutils.py +1049 -115
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/_base.py +1466 -0
- teradataml/opensource/_class.py +464 -0
- teradataml/opensource/{sklearn/constants.py → _constants.py} +21 -14
- teradataml/opensource/_lightgbm.py +949 -0
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +5 -6
- teradataml/options/__init__.py +54 -38
- teradataml/options/configure.py +131 -27
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +5 -5
- teradataml/scriptmgmt/lls_utils.py +130 -40
- teradataml/store/__init__.py +12 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2318 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/table_operators/Apply.py +32 -18
- teradataml/table_operators/Script.py +3 -1
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +37 -38
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/utils/dtypes.py +51 -2
- teradataml/utils/internal_buffer.py +18 -0
- teradataml/utils/validators.py +99 -8
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +321 -5
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +121 -94
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_class.py +0 -255
- teradataml/opensource/sklearn/_sklearn_wrapper.py +0 -1800
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
|
@@ -40,6 +40,9 @@ from teradataml.common.garbagecollector import GarbageCollector
|
|
|
40
40
|
from teradataml.dataframe.sql_functions import case
|
|
41
41
|
from teradataml.hyperparameter_tuner.utils import _ProgressBar
|
|
42
42
|
from teradataml.utils.validators import _Validators
|
|
43
|
+
from teradataml.common.utils import UtilFuncs
|
|
44
|
+
from teradataml.common.constants import TeradataConstants
|
|
45
|
+
from teradataml.options.configure import configure
|
|
43
46
|
|
|
44
47
|
|
|
45
48
|
class _FeatureEngineering:
|
|
@@ -131,8 +134,9 @@ class _FeatureEngineering:
|
|
|
131
134
|
self.data_transform_dict = {}
|
|
132
135
|
self.one_hot_obj_count = 0
|
|
133
136
|
self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
|
|
134
|
-
self.volatile = kwargs.get('volatile', False)
|
|
135
137
|
self.persist = kwargs.get('persist', False)
|
|
138
|
+
self.volatile = kwargs.get('volatile', False) or (configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE and self.persist is False)
|
|
139
|
+
|
|
136
140
|
|
|
137
141
|
# Method for doing feature engineering on data -> adding id, removing futile col, imputation, encoding(one hot)
|
|
138
142
|
def feature_engineering(self,
|
|
@@ -259,6 +263,11 @@ class _FeatureEngineering:
|
|
|
259
263
|
Returns extracted elements in form of list.
|
|
260
264
|
|
|
261
265
|
"""
|
|
266
|
+
# Ensure list1 and list2 are lists, default to empty list if None
|
|
267
|
+
if list1 is None:
|
|
268
|
+
list1 = []
|
|
269
|
+
if list2 is None:
|
|
270
|
+
list2 = []
|
|
262
271
|
new_lst = list(set(list1) - set(list2))
|
|
263
272
|
return new_lst
|
|
264
273
|
|
|
@@ -273,7 +282,7 @@ class _FeatureEngineering:
|
|
|
273
282
|
show_data=True)
|
|
274
283
|
start_time = time.time()
|
|
275
284
|
rows = self.data.shape[0]
|
|
276
|
-
self.data=self.data.drop_duplicate()
|
|
285
|
+
self.data=self.data.drop_duplicate(self.data.columns)
|
|
277
286
|
if rows != self.data.shape[0]:
|
|
278
287
|
self._display_msg(msg=f'Updated dataset sample after removing {rows-self.data.shape[0]} duplicate records:',
|
|
279
288
|
data=self.data,
|
|
@@ -347,12 +356,10 @@ class _FeatureEngineering:
|
|
|
347
356
|
|
|
348
357
|
# Detecting and removing futile columns, if categorical_column exists
|
|
349
358
|
if len(categorical_columns) != 0:
|
|
350
|
-
|
|
351
359
|
obj = CategoricalSummary(data=self.data,
|
|
352
360
|
target_columns=categorical_columns,
|
|
353
361
|
volatile=self.volatile,
|
|
354
362
|
persist=self.persist)
|
|
355
|
-
|
|
356
363
|
gfc_out = GetFutileColumns(data=self.data,
|
|
357
364
|
object=obj,
|
|
358
365
|
category_summary_column="ColumnName",
|
|
@@ -565,11 +572,18 @@ class _FeatureEngineering:
|
|
|
565
572
|
|
|
566
573
|
# Removing rows with missing target column value
|
|
567
574
|
self.data = self.data.dropna(subset=[self.target_column])
|
|
575
|
+
|
|
576
|
+
params = {
|
|
577
|
+
"data": self.data,
|
|
578
|
+
"target_columns": self.data.columns,
|
|
579
|
+
"persist": True,
|
|
580
|
+
"display_table_name": False
|
|
581
|
+
}
|
|
568
582
|
|
|
569
|
-
obj = ColumnSummary(
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
583
|
+
obj = ColumnSummary(**params)
|
|
584
|
+
|
|
585
|
+
# Adding transformed data containing table to garbage collector
|
|
586
|
+
GarbageCollector._add_to_garbagecollector(obj.result._table_name)
|
|
573
587
|
|
|
574
588
|
cols_miss_val={}
|
|
575
589
|
# Iterating over each row in the column summary result
|
|
@@ -704,7 +718,7 @@ class _FeatureEngineering:
|
|
|
704
718
|
for key, val in self.imputation_cols.items():
|
|
705
719
|
|
|
706
720
|
col_stat.append(key)
|
|
707
|
-
if self.data_types[key] in ['float', 'int']:
|
|
721
|
+
if self.data_types[key] in ['float', 'int', 'decimal.Decimal']:
|
|
708
722
|
val = skew_data[f'skew_{key}']
|
|
709
723
|
# Median imputation method, if abs(skewness value) > 1
|
|
710
724
|
if abs(val) > 1:
|
|
@@ -713,7 +727,7 @@ class _FeatureEngineering:
|
|
|
713
727
|
else:
|
|
714
728
|
stat.append('mean')
|
|
715
729
|
# Mode imputation method, if categorical column
|
|
716
|
-
|
|
730
|
+
elif self.data_types[key] in ['str']:
|
|
717
731
|
stat.append('mode')
|
|
718
732
|
|
|
719
733
|
self._display_msg(msg="Columns with their imputation method:",
|
|
@@ -1802,10 +1816,11 @@ class _FeatureEngineering:
|
|
|
1802
1816
|
RETURNS:
|
|
1803
1817
|
Tuple containing volatile and persist parameters.
|
|
1804
1818
|
"""
|
|
1805
|
-
|
|
1819
|
+
# Prioritizing persist argument and then volatile
|
|
1806
1820
|
persist = self.persist
|
|
1821
|
+
volatile = self.volatile or (configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE and persist is False)
|
|
1807
1822
|
if self.custom_data is not None and self.custom_data.get(func_indicator, False):
|
|
1808
1823
|
volatile = self.custom_data[param_name].get("volatile", False)
|
|
1809
1824
|
persist = self.custom_data[param_name].get("persist", False)
|
|
1810
1825
|
|
|
1811
|
-
return (volatile, persist)
|
|
1826
|
+
return (volatile, persist)
|
|
@@ -26,9 +26,10 @@ from teradataml.context import context as tdmlctx
|
|
|
26
26
|
from teradataml.dataframe.copy_to import copy_to_sql
|
|
27
27
|
from teradataml.dataframe.dataframe import DataFrame
|
|
28
28
|
from teradataml import execute_sql, get_connection
|
|
29
|
-
from teradataml import SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
|
|
29
|
+
from teradataml import configure, SVM, GLM, DecisionForest, XGBoost, GridSearch, KNN, RandomSearch
|
|
30
30
|
from teradataml.utils.validators import _Validators
|
|
31
|
-
|
|
31
|
+
from teradataml.common.utils import UtilFuncs
|
|
32
|
+
from teradataml.common.constants import TeradataConstants
|
|
32
33
|
|
|
33
34
|
class _ModelTraining:
|
|
34
35
|
|
|
@@ -113,6 +114,12 @@ class _ModelTraining:
|
|
|
113
114
|
session.
|
|
114
115
|
Default Value: False
|
|
115
116
|
Types: bool
|
|
117
|
+
|
|
118
|
+
seed:
|
|
119
|
+
Optional Argument.
|
|
120
|
+
Specifies the random seed for reproducibility.
|
|
121
|
+
Default Value: 42
|
|
122
|
+
Types: int
|
|
116
123
|
"""
|
|
117
124
|
self.data = data
|
|
118
125
|
self.target_column = target_column
|
|
@@ -125,6 +132,7 @@ class _ModelTraining:
|
|
|
125
132
|
self.startify_col = None
|
|
126
133
|
self.persist = kwargs.get("persist", False)
|
|
127
134
|
self.volatile = kwargs.get("volatile", False)
|
|
135
|
+
self.seed = kwargs.get("seed", 42)
|
|
128
136
|
|
|
129
137
|
def model_training(self,
|
|
130
138
|
auto=True,
|
|
@@ -498,7 +506,7 @@ class _ModelTraining:
|
|
|
498
506
|
'max_depth': tuple(max_depth),
|
|
499
507
|
'min_node_size': tuple(min_node_size),
|
|
500
508
|
'iter_num': tuple(iter_num),
|
|
501
|
-
'seed':
|
|
509
|
+
'seed':self.seed
|
|
502
510
|
}
|
|
503
511
|
# Hyperparameters for Decision Forest model
|
|
504
512
|
df_params = {
|
|
@@ -509,7 +517,7 @@ class _ModelTraining:
|
|
|
509
517
|
'max_depth': tuple(max_depth),
|
|
510
518
|
'min_node_size': tuple(min_node_size),
|
|
511
519
|
'num_trees': tuple(num_trees),
|
|
512
|
-
'seed':
|
|
520
|
+
'seed':self.seed
|
|
513
521
|
}
|
|
514
522
|
|
|
515
523
|
# Updating model type in case of classification
|
|
@@ -796,7 +804,8 @@ class _ModelTraining:
|
|
|
796
804
|
trained_models = []
|
|
797
805
|
for param in model_params:
|
|
798
806
|
result = self._hyperparameter_tunning(param, trainng_datas)
|
|
799
|
-
|
|
807
|
+
if result is not None:
|
|
808
|
+
trained_models.append(result)
|
|
800
809
|
|
|
801
810
|
models_df = pd.concat(trained_models, ignore_index=True)
|
|
802
811
|
return models_df
|
|
@@ -872,53 +881,71 @@ class _ModelTraining:
|
|
|
872
881
|
verbose = 0
|
|
873
882
|
|
|
874
883
|
# Hyperparameter tunning
|
|
884
|
+
# Parallel run opens multiple connections for parallel execution,
|
|
885
|
+
# but volatile tables are not accessible across different sessions.
|
|
886
|
+
# Therefore, execution is performed sequentially by setting run_parallel=False.
|
|
887
|
+
|
|
888
|
+
run_parallel = configure.temp_object_type != TeradataConstants.TERADATA_VOLATILE_TABLE
|
|
889
|
+
|
|
890
|
+
common_params = {
|
|
891
|
+
"data": train_data,
|
|
892
|
+
"evaluation_metric": self.stopping_metric,
|
|
893
|
+
"early_stop": self.stopping_tolerance,
|
|
894
|
+
"run_parallel": run_parallel,
|
|
895
|
+
"sample_seed": self.seed,
|
|
896
|
+
"sample_id_column": "id",
|
|
897
|
+
"discard_invalid_column_params": True,
|
|
898
|
+
"stratify_column": self.startify_col,
|
|
899
|
+
"verbose": verbose,
|
|
900
|
+
"max_time": self.max_runtime_secs,
|
|
901
|
+
"suppress_refer_msg": True
|
|
902
|
+
}
|
|
903
|
+
|
|
875
904
|
if model_param['name'] == 'knn':
|
|
876
|
-
_obj.fit(
|
|
877
|
-
early_stop=self.stopping_tolerance, run_parallel=True,
|
|
878
|
-
sample_seed=42, sample_id_column='id', discard_invalid_column_params=True,
|
|
879
|
-
stratify_column=self.startify_col,verbose=verbose, max_time=self.max_runtime_secs)
|
|
905
|
+
_obj.fit(**common_params)
|
|
880
906
|
else:
|
|
881
|
-
_obj.fit(
|
|
882
|
-
early_stop=self.stopping_tolerance, **eval_params,
|
|
883
|
-
run_parallel=True, discard_invalid_column_params=True, sample_seed=42,
|
|
884
|
-
sample_id_column='id',stratify_column=self.startify_col, verbose=verbose, max_time=self.max_runtime_secs)
|
|
907
|
+
_obj.fit(**common_params, **eval_params)
|
|
885
908
|
|
|
886
909
|
# Getting all passed models
|
|
887
910
|
model_info = _obj.model_stats.merge(_obj.models[_obj.models['STATUS']=='PASS'][['MODEL_ID', 'DATA_ID', 'PARAMETERS']],
|
|
888
911
|
on='MODEL_ID', how='inner')
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
|
|
905
|
-
|
|
906
|
-
if not self.is_classification_type():
|
|
907
|
-
# Calculating Adjusted-R2 for regression
|
|
908
|
-
# Getting size and feature count for each feature selection method
|
|
909
|
-
methods = ["lasso", "rfe", "pca"]
|
|
910
|
-
size_map = {method : df.select('id').size for method, df in zip(methods, train_data)}
|
|
911
|
-
feature_count_map = {method : len(df.columns) - 2 for method, df in zip(methods, train_data)}
|
|
912
|
-
model_info['ADJUSTED_R2'] = model_info.apply(lambda row:
|
|
913
|
-
1 - ((1 - row['R2']) * (size_map[row['FEATURE_SELECTION']] - 1) /
|
|
914
|
-
(size_map[row['FEATURE_SELECTION']] - feature_count_map[row['FEATURE_SELECTION']] - 1)), axis=1)
|
|
915
|
-
|
|
916
|
-
self._display_msg(msg="-"*100,
|
|
917
|
-
progress_bar=self.progress_bar,
|
|
918
|
-
show_data=True)
|
|
919
|
-
self.progress_bar.update()
|
|
912
|
+
if not model_info.empty:
|
|
913
|
+
# Creating mapping data ID to feature selection method
|
|
914
|
+
data_id_to_table_map = {"DF_0": ('lasso', train_data[0]._table_name),
|
|
915
|
+
"DF_1": ('rfe', train_data[1]._table_name),
|
|
916
|
+
"DF_2": ('pca', train_data[2]._table_name)}
|
|
917
|
+
|
|
918
|
+
# Updating model stats with feature selection method and result table
|
|
919
|
+
for index, row in model_info.iterrows():
|
|
920
|
+
model_info.loc[index, 'FEATURE_SELECTION'] = data_id_to_table_map[row['DATA_ID']][0]
|
|
921
|
+
model_info.loc[index, 'DATA_TABLE'] = data_id_to_table_map[row['DATA_ID']][1]
|
|
922
|
+
model_info.loc[index, 'RESULT_TABLE'] = _obj.get_model(row['MODEL_ID']).result._table_name
|
|
923
|
+
model_info.loc[index, 'model-obj'] = _obj.get_model(row['MODEL_ID'])
|
|
924
|
+
|
|
925
|
+
# Dropping column 'DATA_ID'
|
|
926
|
+
model_info.drop(['DATA_ID'], axis=1, inplace=True)
|
|
920
927
|
|
|
921
|
-
|
|
928
|
+
model_info.insert(1, 'FEATURE_SELECTION', model_info.pop('FEATURE_SELECTION'))
|
|
929
|
+
|
|
930
|
+
if not self.is_classification_type():
|
|
931
|
+
# Calculating Adjusted-R2 for regression
|
|
932
|
+
# Getting size and feature count for each feature selection method
|
|
933
|
+
methods = ["lasso", "rfe", "pca"]
|
|
934
|
+
size_map = {method : df.select('id').size for method, df in zip(methods, train_data)}
|
|
935
|
+
feature_count_map = {method : len(df.columns) - 2 for method, df in zip(methods, train_data)}
|
|
936
|
+
model_info['ADJUSTED_R2'] = model_info.apply(lambda row:
|
|
937
|
+
1 - ((1 - row['R2']) * (size_map[row['FEATURE_SELECTION']] - 1) /
|
|
938
|
+
(size_map[row['FEATURE_SELECTION']] - feature_count_map[row['FEATURE_SELECTION']] - 1)), axis=1)
|
|
939
|
+
|
|
940
|
+
self._display_msg(msg="-"*100,
|
|
941
|
+
progress_bar=self.progress_bar,
|
|
942
|
+
show_data=True)
|
|
943
|
+
self.progress_bar.update()
|
|
944
|
+
|
|
945
|
+
return model_info
|
|
946
|
+
|
|
947
|
+
# Returning None, if no model is passed
|
|
948
|
+
return None
|
|
922
949
|
|
|
923
950
|
@staticmethod
|
|
924
951
|
def _eval_params_generation(ml_name,
|
|
@@ -986,4 +1013,4 @@ class _ModelTraining:
|
|
|
986
1013
|
elif ml_name == 'glm':
|
|
987
1014
|
eval_params['family'] = 'GAUSSIAN'
|
|
988
1015
|
|
|
989
|
-
return eval_params
|
|
1016
|
+
return eval_params
|
teradataml/common/constants.py
CHANGED
|
@@ -14,10 +14,17 @@ A class for holding all constants
|
|
|
14
14
|
import re
|
|
15
15
|
import sqlalchemy
|
|
16
16
|
from enum import Enum
|
|
17
|
-
from teradataml.options.configure import configure
|
|
18
17
|
from teradatasqlalchemy.types import (INTEGER, SMALLINT, BIGINT, BYTEINT, DECIMAL, FLOAT, NUMBER, VARCHAR)
|
|
19
18
|
from teradatasqlalchemy.types import (DATE, TIME, TIMESTAMP)
|
|
20
19
|
from teradatasqlalchemy.types import (BYTE, VARBYTE, BLOB)
|
|
20
|
+
from teradatasqlalchemy import (CHAR, CLOB)
|
|
21
|
+
from teradatasqlalchemy import (PERIOD_DATE, PERIOD_TIME, PERIOD_TIMESTAMP)
|
|
22
|
+
from teradatasqlalchemy import (INTERVAL_YEAR, INTERVAL_YEAR_TO_MONTH, INTERVAL_MONTH,
|
|
23
|
+
INTERVAL_DAY,INTERVAL_DAY_TO_HOUR, INTERVAL_DAY_TO_MINUTE,
|
|
24
|
+
INTERVAL_DAY_TO_SECOND, INTERVAL_HOUR,
|
|
25
|
+
INTERVAL_HOUR_TO_MINUTE, INTERVAL_HOUR_TO_SECOND,
|
|
26
|
+
INTERVAL_MINUTE, INTERVAL_MINUTE_TO_SECOND,
|
|
27
|
+
INTERVAL_SECOND)
|
|
21
28
|
from teradatasqlalchemy import (GEOMETRY, MBR, MBB)
|
|
22
29
|
|
|
23
30
|
|
|
@@ -53,6 +60,9 @@ class SQLConstants(Enum):
|
|
|
53
60
|
SQL_DELETE_ALL_ROWS = 29
|
|
54
61
|
SQL_DELETE_SPECIFIC_ROW = 30
|
|
55
62
|
SQL_EXEC_STORED_PROCEDURE = 31
|
|
63
|
+
SQL_SELECT_COLUMNNAMES_WITH_WHERE = 32
|
|
64
|
+
SQL_HELP_DATABASE = 33
|
|
65
|
+
SQL_HELP_DATALAKE = 34
|
|
56
66
|
CONSTRAINT = ["check_constraint", "primary_key_constraint",
|
|
57
67
|
"foreign_key_constraint", "unique_key_constraint"]
|
|
58
68
|
|
|
@@ -123,6 +133,14 @@ class TeradataTypes(Enum):
|
|
|
123
133
|
TD_DATE_TYPES = [DATE, sqlalchemy.sql.sqltypes.Date]
|
|
124
134
|
TD_DATE_CODES = ["DA"]
|
|
125
135
|
TD_NULL_TYPE = "NULLTYPE"
|
|
136
|
+
TD_ALL_TYPES = (BYTEINT, SMALLINT, INTEGER, BIGINT, DECIMAL, FLOAT, NUMBER,
|
|
137
|
+
TIMESTAMP, DATE, TIME, CHAR, VARCHAR, CLOB, BYTE, VARBYTE,
|
|
138
|
+
BLOB, PERIOD_DATE, PERIOD_TIME, PERIOD_TIMESTAMP,
|
|
139
|
+
INTERVAL_YEAR, INTERVAL_YEAR_TO_MONTH, INTERVAL_MONTH,
|
|
140
|
+
INTERVAL_DAY, INTERVAL_DAY_TO_HOUR, INTERVAL_DAY_TO_MINUTE,
|
|
141
|
+
INTERVAL_DAY_TO_SECOND, INTERVAL_HOUR,
|
|
142
|
+
INTERVAL_HOUR_TO_MINUTE, INTERVAL_HOUR_TO_SECOND,
|
|
143
|
+
INTERVAL_MINUTE, INTERVAL_MINUTE_TO_SECOND, INTERVAL_SECOND)
|
|
126
144
|
|
|
127
145
|
|
|
128
146
|
class TeradataTableKindConstants(Enum):
|
|
@@ -427,6 +445,8 @@ class TableOperatorConstants(Enum):
|
|
|
427
445
|
APPLY_TEMPLATE = "dataframe_apply.template"
|
|
428
446
|
# Template of the intermediate script that will be generated for UDF.
|
|
429
447
|
UDF_TEMPLATE = "dataframe_udf.template"
|
|
448
|
+
# Template of the intermediate script that will be generated for register.
|
|
449
|
+
REGISTER_TEMPLATE = "dataframe_register.template"
|
|
430
450
|
# In-DB execution mode.
|
|
431
451
|
INDB_EXEC = "IN-DB"
|
|
432
452
|
# Local execution mode.
|
|
@@ -443,6 +463,8 @@ class TableOperatorConstants(Enum):
|
|
|
443
463
|
APPLY_OP = "apply"
|
|
444
464
|
# udf operation.
|
|
445
465
|
UDF_OP = "udf"
|
|
466
|
+
# register operation.
|
|
467
|
+
REGISTER_OP = "register"
|
|
446
468
|
# Template of the script_executor that will be used to generate the temporary script_executor file.
|
|
447
469
|
SCRIPT_TEMPLATE = "script_executor.template"
|
|
448
470
|
# Log Type.
|
|
@@ -464,11 +486,18 @@ class TableOperatorConstants(Enum):
|
|
|
464
486
|
|
|
465
487
|
# Check if Python interpretor and add-ons are installed or not.
|
|
466
488
|
# Location of In-DB packages is indicated by configure.indb_install_location.
|
|
489
|
+
# Check for both python and pip versions.
|
|
467
490
|
CHECK_PYTHON_INSTALLED = """SELECT distinct * FROM SCRIPT(
|
|
468
491
|
ON (select 1) PARTITION BY ANY
|
|
469
|
-
SCRIPT_COMMAND('{}/bin/pip3 --version')
|
|
470
|
-
returns('
|
|
492
|
+
SCRIPT_COMMAND('echo $({0}/bin/pip3 --version) -- $({0}/bin/python3 --version)')
|
|
493
|
+
returns('pip VARCHAR(256)'))
|
|
471
494
|
"""
|
|
495
|
+
# Check which version of rpms are installed.
|
|
496
|
+
INDB_PYTHON_PATH = """SEL DISTINCT os_ver
|
|
497
|
+
FROM SCRIPT(
|
|
498
|
+
SCRIPT_COMMAND('grep CPE_NAME /etc/os-release')
|
|
499
|
+
RETURNS('os_ver VARCHAR(100)')
|
|
500
|
+
);"""
|
|
472
501
|
|
|
473
502
|
# Script Query to get Python packages and corresponding versions.
|
|
474
503
|
# Location of In-DB packages is indicated by configure.indb_install_location.
|
|
@@ -480,6 +509,9 @@ class TableOperatorConstants(Enum):
|
|
|
480
509
|
"delimiter(' ') " \
|
|
481
510
|
"returns('package VARCHAR({2}), " \
|
|
482
511
|
"version VARCHAR({2})'))"
|
|
512
|
+
|
|
513
|
+
SCRIPT_LIST_FILES_QUERY = "SELECT DISTINCT * FROM SCRIPT (SCRIPT_COMMAND " \
|
|
514
|
+
"('ls ./{}') RETURNS ('Files VARCHAR({})'))"
|
|
483
515
|
|
|
484
516
|
class ValibConstants(Enum):
|
|
485
517
|
# A dictionary that maps teradataml name of the exposed VALIB function name
|
|
@@ -778,7 +810,8 @@ class ValibConstants(Enum):
|
|
|
778
810
|
"subdivision_method": "subdivisionmethod",
|
|
779
811
|
"subdivision_threshold": "subdivisionthreshold",
|
|
780
812
|
"filter": "where",
|
|
781
|
-
"gen_sql_only": "gensqlonly"
|
|
813
|
+
"gen_sql_only": "gensqlonly",
|
|
814
|
+
"charset": "charset"
|
|
782
815
|
},
|
|
783
816
|
|
|
784
817
|
"DATAEXPLORER": {
|
|
@@ -795,7 +828,8 @@ class ValibConstants(Enum):
|
|
|
795
828
|
"stats_options": "statsoptions",
|
|
796
829
|
"distinct": "uniques",
|
|
797
830
|
"filter": "where",
|
|
798
|
-
"gen_sql": "gensql"
|
|
831
|
+
"gen_sql": "gensql",
|
|
832
|
+
"charset": "charset"
|
|
799
833
|
},
|
|
800
834
|
|
|
801
835
|
"FREQUENCY": {
|
|
@@ -809,7 +843,8 @@ class ValibConstants(Enum):
|
|
|
809
843
|
"style": "style",
|
|
810
844
|
"top_n": "topvalues",
|
|
811
845
|
"filter": "where",
|
|
812
|
-
"gen_sql_only": "gensqlonly"
|
|
846
|
+
"gen_sql_only": "gensqlonly",
|
|
847
|
+
"charset": "charset"
|
|
813
848
|
},
|
|
814
849
|
|
|
815
850
|
"HISTOGRAM": {
|
|
@@ -824,7 +859,8 @@ class ValibConstants(Enum):
|
|
|
824
859
|
"stats_columns": "statisticscolumns",
|
|
825
860
|
"hist_style": "style",
|
|
826
861
|
"filter": "where",
|
|
827
|
-
"gen_sql_only": "gensqlonly"
|
|
862
|
+
"gen_sql_only": "gensqlonly",
|
|
863
|
+
"charset": "charset"
|
|
828
864
|
},
|
|
829
865
|
|
|
830
866
|
"STATISTICS": {
|
|
@@ -835,7 +871,8 @@ class ValibConstants(Enum):
|
|
|
835
871
|
"statistical_method": "statisticalmethod",
|
|
836
872
|
"stats_options": "statsoptions",
|
|
837
873
|
"filter": "where",
|
|
838
|
-
"gen_sql_only": "gensqlonly"
|
|
874
|
+
"gen_sql_only": "gensqlonly",
|
|
875
|
+
"charset": "charset"
|
|
839
876
|
},
|
|
840
877
|
|
|
841
878
|
"TEXTFIELDANALYZER": {
|
|
@@ -843,7 +880,8 @@ class ValibConstants(Enum):
|
|
|
843
880
|
"exclude_columns": "columnstoexclude",
|
|
844
881
|
"analyze_numerics": "extendednumericanalysis",
|
|
845
882
|
"analyze_unicode": "extendedunicodeanalysis",
|
|
846
|
-
"gen_sql_only": "gensqlonly"
|
|
883
|
+
"gen_sql_only": "gensqlonly",
|
|
884
|
+
"charset": "charset"
|
|
847
885
|
},
|
|
848
886
|
|
|
849
887
|
"VALUES": {
|
|
@@ -852,7 +890,8 @@ class ValibConstants(Enum):
|
|
|
852
890
|
"group_columns": "groupby",
|
|
853
891
|
"distinct": "uniques",
|
|
854
892
|
"filter": "where",
|
|
855
|
-
"gen_sql_only": "gensqlonly"
|
|
893
|
+
"gen_sql_only": "gensqlonly",
|
|
894
|
+
"charset": "charset"
|
|
856
895
|
},
|
|
857
896
|
|
|
858
897
|
"ASSOCIATION": {
|
|
@@ -877,7 +916,8 @@ class ValibConstants(Enum):
|
|
|
877
916
|
"filter": "where",
|
|
878
917
|
"no_support_results": "dropsupporttables",
|
|
879
918
|
"support_result_prefix": "resulttableprefix",
|
|
880
|
-
"gen_sql_only": "gensqlonly"
|
|
919
|
+
"gen_sql_only": "gensqlonly",
|
|
920
|
+
"charset": "charset"
|
|
881
921
|
},
|
|
882
922
|
|
|
883
923
|
"KMEANS": {
|
|
@@ -887,7 +927,8 @@ class ValibConstants(Enum):
|
|
|
887
927
|
"continuation": "continuation",
|
|
888
928
|
"max_iter": "iterations",
|
|
889
929
|
"operator_database": "operatordatabase",
|
|
890
|
-
"threshold": "threshold"
|
|
930
|
+
"threshold": "threshold",
|
|
931
|
+
"charset": "charset"
|
|
891
932
|
},
|
|
892
933
|
|
|
893
934
|
"KMEANSSCORE": {
|
|
@@ -895,7 +936,8 @@ class ValibConstants(Enum):
|
|
|
895
936
|
"cluster_column": "clustername",
|
|
896
937
|
"fallback": "fallback",
|
|
897
938
|
"operator_database": "operatordatabase",
|
|
898
|
-
"accumulate": "retain"
|
|
939
|
+
"accumulate": "retain",
|
|
940
|
+
"charset": "charset"
|
|
899
941
|
},
|
|
900
942
|
|
|
901
943
|
"DECISIONTREE": {
|
|
@@ -907,7 +949,8 @@ class ValibConstants(Enum):
|
|
|
907
949
|
"max_depth": "max_depth",
|
|
908
950
|
"num_splits": "min_records",
|
|
909
951
|
"operator_database": "operatordatabase",
|
|
910
|
-
"pruning": "pruning"
|
|
952
|
+
"pruning": "pruning",
|
|
953
|
+
"charset": "charset"
|
|
911
954
|
},
|
|
912
955
|
|
|
913
956
|
"DECISIONTREESCORE": {
|
|
@@ -917,7 +960,8 @@ class ValibConstants(Enum):
|
|
|
917
960
|
"profile": "profiletables",
|
|
918
961
|
"accumulate": "retain",
|
|
919
962
|
"targeted_value": "targetedvalue",
|
|
920
|
-
"gen_sql_only": "gensqlonly"
|
|
963
|
+
"gen_sql_only": "gensqlonly",
|
|
964
|
+
"charset": "charset"
|
|
921
965
|
},
|
|
922
966
|
|
|
923
967
|
"MATRIX": {
|
|
@@ -927,7 +971,8 @@ class ValibConstants(Enum):
|
|
|
927
971
|
"matrix_output": "matrixoutput",
|
|
928
972
|
"type": "matrixtype",
|
|
929
973
|
"handle_nulls": "nullhandling",
|
|
930
|
-
"filter": "where"
|
|
974
|
+
"filter": "where",
|
|
975
|
+
"charset": "charset"
|
|
931
976
|
},
|
|
932
977
|
|
|
933
978
|
"LINEAR": {
|
|
@@ -949,7 +994,8 @@ class ValibConstants(Enum):
|
|
|
949
994
|
"stepwise": "stepwise",
|
|
950
995
|
"use_fstat": "usefstat",
|
|
951
996
|
"use_pvalue": "usepvalue",
|
|
952
|
-
"variance_prop_threshold": "varianceproportionthreshold"
|
|
997
|
+
"variance_prop_threshold": "varianceproportionthreshold",
|
|
998
|
+
"charset": "charset"
|
|
953
999
|
},
|
|
954
1000
|
|
|
955
1001
|
"LINEARSCORE": {
|
|
@@ -957,7 +1003,8 @@ class ValibConstants(Enum):
|
|
|
957
1003
|
"response_column": "predicted",
|
|
958
1004
|
"residual_column": "residual",
|
|
959
1005
|
"accumulate": "retain",
|
|
960
|
-
"gen_sql_only": "gensqlonly"
|
|
1006
|
+
"gen_sql_only": "gensqlonly",
|
|
1007
|
+
"charset": "charset"
|
|
961
1008
|
},
|
|
962
1009
|
|
|
963
1010
|
"LOGISTIC": {
|
|
@@ -987,7 +1034,8 @@ class ValibConstants(Enum):
|
|
|
987
1034
|
"end_threshold": "thresholdend",
|
|
988
1035
|
"increment_threshold": "thresholdincrement",
|
|
989
1036
|
"threshold_output": "thresholdtable",
|
|
990
|
-
"variance_prop_threshold": "varianceproportionthreshold"
|
|
1037
|
+
"variance_prop_threshold": "varianceproportionthreshold",
|
|
1038
|
+
"charset": "charset"
|
|
991
1039
|
},
|
|
992
1040
|
|
|
993
1041
|
"LOGISTICSCORE": {
|
|
@@ -999,7 +1047,8 @@ class ValibConstants(Enum):
|
|
|
999
1047
|
"start_threshold": "thresholdbegin",
|
|
1000
1048
|
"end_threshold": "thresholdend",
|
|
1001
1049
|
"increment_threshold": "thresholdincrement",
|
|
1002
|
-
"gen_sql_only": "gensqlonly"
|
|
1050
|
+
"gen_sql_only": "gensqlonly",
|
|
1051
|
+
"charset": "charset"
|
|
1003
1052
|
|
|
1004
1053
|
# The following 3 arguments three should not be present for LogRegPredict function
|
|
1005
1054
|
# where as when the function is LogRegEvaluator, at least one of these should be
|
|
@@ -1027,13 +1076,15 @@ class ValibConstants(Enum):
|
|
|
1027
1076
|
"rotation_type": "rotationtype",
|
|
1028
1077
|
"load_threshold": "thresholdloading",
|
|
1029
1078
|
"percent_threshold": "thresholdpercent",
|
|
1030
|
-
"variance_prop_threshold": "varianceproportionthreshold"
|
|
1079
|
+
"variance_prop_threshold": "varianceproportionthreshold",
|
|
1080
|
+
"charset": "charset"
|
|
1031
1081
|
},
|
|
1032
1082
|
|
|
1033
1083
|
"FACTORSCORE": {
|
|
1034
1084
|
"index_columns": "index",
|
|
1035
1085
|
"accumulate": "retain",
|
|
1036
|
-
"gen_sql_only": "gensqlonly"
|
|
1086
|
+
"gen_sql_only": "gensqlonly",
|
|
1087
|
+
"charset": "charset"
|
|
1037
1088
|
},
|
|
1038
1089
|
|
|
1039
1090
|
"PARAMETRICTEST": {
|
|
@@ -1052,7 +1103,8 @@ class ValibConstants(Enum):
|
|
|
1052
1103
|
"style": "teststyle",
|
|
1053
1104
|
"probability_threshold": "thresholdprobability",
|
|
1054
1105
|
"with_indicator": "withindicator",
|
|
1055
|
-
"gen_sql_only": "gensqlonly"
|
|
1106
|
+
"gen_sql_only": "gensqlonly",
|
|
1107
|
+
"charset": "charset"
|
|
1056
1108
|
},
|
|
1057
1109
|
|
|
1058
1110
|
"BINOMIALTEST": {
|
|
@@ -1067,7 +1119,8 @@ class ValibConstants(Enum):
|
|
|
1067
1119
|
"stats_database": "statsdatabase",
|
|
1068
1120
|
"style": "teststyle",
|
|
1069
1121
|
"probability_threshold": "thresholdprobability",
|
|
1070
|
-
"gen_sql_only": "gensqlonly"
|
|
1122
|
+
"gen_sql_only": "gensqlonly",
|
|
1123
|
+
"charset": "charset"
|
|
1071
1124
|
},
|
|
1072
1125
|
|
|
1073
1126
|
"KSTEST": {
|
|
@@ -1079,7 +1132,8 @@ class ValibConstants(Enum):
|
|
|
1079
1132
|
"stats_database": "statsdatabase",
|
|
1080
1133
|
"style": "teststyle",
|
|
1081
1134
|
"probability_threshold": "thresholdprobability",
|
|
1082
|
-
"gen_sql_only": "gensqlonly"
|
|
1135
|
+
"gen_sql_only": "gensqlonly",
|
|
1136
|
+
"charset": "charset"
|
|
1083
1137
|
},
|
|
1084
1138
|
|
|
1085
1139
|
"CHISQUARETEST": {
|
|
@@ -1093,7 +1147,8 @@ class ValibConstants(Enum):
|
|
|
1093
1147
|
"stats_database": "statsdatabase",
|
|
1094
1148
|
"style": "teststyle",
|
|
1095
1149
|
"probability_threshold": "thresholdprobability",
|
|
1096
|
-
"gen_sql_only": "gensqlonly"
|
|
1150
|
+
"gen_sql_only": "gensqlonly",
|
|
1151
|
+
"charset": "charset"
|
|
1097
1152
|
},
|
|
1098
1153
|
|
|
1099
1154
|
"RANKTEST": {
|
|
@@ -1112,7 +1167,8 @@ class ValibConstants(Enum):
|
|
|
1112
1167
|
"style": "teststyle",
|
|
1113
1168
|
"probability_threshold": "thresholdprobability",
|
|
1114
1169
|
"treatment_column": "treatmentcolumn",
|
|
1115
|
-
"gen_sql_only": "gensqlonly"
|
|
1170
|
+
"gen_sql_only": "gensqlonly",
|
|
1171
|
+
"charset": "charset"
|
|
1116
1172
|
},
|
|
1117
1173
|
|
|
1118
1174
|
"VARTRAN": {
|
|
@@ -1123,13 +1179,15 @@ class ValibConstants(Enum):
|
|
|
1123
1179
|
"allow_duplicates": "multiset",
|
|
1124
1180
|
"nopi": "noindex",
|
|
1125
1181
|
"filter": "whereclause",
|
|
1126
|
-
"gen_sql_only": "gensqlonly"
|
|
1182
|
+
"gen_sql_only": "gensqlonly",
|
|
1183
|
+
"charset": "charset"
|
|
1127
1184
|
},
|
|
1128
1185
|
|
|
1129
1186
|
"REPORT": {
|
|
1130
1187
|
"analysis_type": "analysistype",
|
|
1131
1188
|
"filter": "where",
|
|
1132
|
-
"gen_sql_only": "gensqlonly"
|
|
1189
|
+
"gen_sql_only": "gensqlonly",
|
|
1190
|
+
"charset": "charset"
|
|
1133
1191
|
}
|
|
1134
1192
|
}
|
|
1135
1193
|
|
|
@@ -1424,6 +1482,7 @@ class HTTPRequest(Enum):
|
|
|
1424
1482
|
POST = "post"
|
|
1425
1483
|
PUT = "put"
|
|
1426
1484
|
DELETE = "delete"
|
|
1485
|
+
PATCH = "patch"
|
|
1427
1486
|
|
|
1428
1487
|
|
|
1429
1488
|
class AsyncStatusColumns(Enum):
|
|
@@ -520,7 +520,8 @@ class GarbageCollector():
|
|
|
520
520
|
fileparts = file.split(GarbageCollector.__filenameseperator)
|
|
521
521
|
hostname = fileparts[1]
|
|
522
522
|
filepid = int(fileparts[2])
|
|
523
|
-
|
|
523
|
+
# Check for both host ip and hostname in case user passed hostname for creating connection.
|
|
524
|
+
if hostname == tdmlctx.context._get_host_ip() or hostname == tdmlctx.context._get_host():
|
|
524
525
|
if filepid == os.getpid() or not psutil.pid_exists(filepid):
|
|
525
526
|
tempfiles.append(filepath)
|
|
526
527
|
except (IndexError, ValueError):
|