teradataml 20.0.0.0-py3-none-any.whl → 20.0.0.1-py3-none-any.whl
This diff shows the changes between package versions publicly released to one of the supported registries; it is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +71 -0
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +51 -24
- teradataml/analytics/json_parser/utils.py +11 -17
- teradataml/automl/__init__.py +103 -48
- teradataml/automl/data_preparation.py +55 -37
- teradataml/automl/data_transformation.py +131 -69
- teradataml/automl/feature_engineering.py +117 -185
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +13 -25
- teradataml/automl/model_training.py +214 -75
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +11 -6
- teradataml/common/garbagecollector.py +5 -0
- teradataml/common/messagecodes.py +3 -1
- teradataml/common/messages.py +2 -1
- teradataml/common/utils.py +6 -0
- teradataml/context/context.py +49 -29
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/glm_example.json +28 -1
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +20 -1
- teradataml/data/scripts/sklearn/sklearn_fit.py +23 -27
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +20 -28
- teradataml/data/scripts/sklearn/sklearn_function.template +13 -18
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +18 -27
- teradataml/data/scripts/sklearn/sklearn_score.py +20 -29
- teradataml/data/scripts/sklearn/sklearn_transform.py +30 -38
- teradataml/data/teradataml_example.json +77 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +120 -61
- teradataml/dataframe/dataframe.py +102 -17
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +272 -89
- teradataml/dataframe/sql.py +84 -0
- teradataml/dbutils/dbutils.py +2 -2
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +102 -55
- teradataml/options/__init__.py +13 -4
- teradataml/options/configure.py +27 -6
- teradataml/scriptmgmt/UserEnv.py +19 -16
- teradataml/scriptmgmt/lls_utils.py +117 -14
- teradataml/table_operators/Script.py +2 -3
- teradataml/table_operators/TableOperator.py +58 -10
- teradataml/utils/validators.py +40 -2
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/METADATA +78 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/RECORD +108 -90
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/zip-safe +0 -0
teradataml/automl/data_preparation.py

@@ -28,9 +28,14 @@ from teradataml import OutlierFilterFit, OutlierFilterTransform
 from teradataml import RoundColumns, TeradataMlException
 from teradataml import ScaleFit, ScaleTransform
 from teradataml import TrainTestSplit, UtilFuncs, TeradataConstants
+from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.messages import Messages, MessageCodes
 from teradataml.utils.validators import _Validators
+from teradataml import INTEGER
 
+# Control Randomnes
+random.seed(42)
+np.random.seed(42)
 
 class _DataPreparation:
 
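
The constructor-level seeding removed in the `@@ -106,8 +111,6 @@` hunk below moves up here to module import time, so one deterministic random stream is shared by every instance instead of being reset per object. A minimal sketch of the behavioral difference (`_sample` is a hypothetical stand-in for the package's sampling step):

    import random
    import numpy as np

    # Module-level seeding (new layout): seeds are set once at import.
    random.seed(42)
    np.random.seed(42)

    def _sample(n):
        # Hypothetical stand-in for a sampling step.
        return [random.random() for _ in range(n)]

    # Successive calls continue one reproducible sequence across the run...
    a, b = _sample(3), _sample(3)
    assert a != b

    # ...whereas seeding inside __init__ (old layout) reset the stream on
    # every instantiation, so repeated objects drew identical values.
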

@@ -54,7 +59,7 @@
             Types: teradataml Dataframe
 
         target_column:
-            Required
+            Required Argument.
             Specifies the name of the target column in "data".
             Types: str
 

@@ -69,22 +74,22 @@
             Types: int
 
         excluded_columns:
-            Required
+            Required Argument.
             Specifies the columns should be excluded from any processing.
             Types: str or list of strings (str)
 
         custom_data:
-            Optional
+            Optional Argument.
             Specifies json object containing user customized input.
             Types: json object
 
         data_transform_dict:
-            Optional
+            Optional Argument.
             Specifies the parameters for data transformation.
             Types: dict
 
         task_type:
-            Required
+            Required Argument.
             Specifies the task type for AutoML, whether to apply regresion OR classification
             on the provived dataset.
             Default Value: "Regression"

@@ -106,8 +111,6 @@
         self._scale_method_cls = "RANGE"
         self.table_name_mapping = {}
 
-        random.seed(42)
-        np.random.seed(42)
         self.data_types = {key: value for key, value in self.data._column_names_and_types}
 
 

@@ -123,7 +126,7 @@
 
         PARAMETERS:
             auto:
-                Optional
+                Optional Argument.
                 Specifies whether to run AutoML in custom mode or auto mode.
                 When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
                 Default Value: True

@@ -163,6 +166,10 @@
         train = self._data_sampling(train)
         self.progress_bar.update()
 
+        # Sorting the data based on id to
+        # remove any shuffling done by sampling
+        train = train.sort_values(by='id')
+
         # Performing feature selection using lasso followed by scaling
         self._feature_selection_Lasso(train, test)
         self._scaling_features(feature_selection_mtd="lasso")
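
The new sort restores a canonical row order after sampling, since the sampler may hand rows back in arbitrary order. A pandas sketch of the idea (assuming, as the hunk suggests, that `train` exposes a `sort_values` interface and carries an `id` column):

    import pandas as pd

    train = pd.DataFrame({"id": range(6), "x": [3, 1, 4, 1, 5, 9]})

    # Sampling shuffles row order as a side effect.
    sampled = train.sample(n=4, random_state=42)

    # Re-sorting on the id column makes every downstream step
    # (feature selection, scaling) see the same row sequence.
    sampled = sampled.sort_values(by="id")
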

@@ -375,6 +382,8 @@
             "persist" : True
         }
         self.train_df = OutlierFilterTransform(**transform_params).result
+        # Adding transformed data containing table to garbage collector
+        GarbageCollector._add_to_garbagecollector(self.train_df._table_name)
 
     def _outlier_processing(self):
         """
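
Because the transform runs with "persist": True, its result is backed by a permanent table that would otherwise outlive the run. Registering it with teradataml's GarbageCollector marks it for cleanup. An illustrative sketch of the general pattern, not teradataml's internal implementation:

    import atexit

    _tables_to_drop = []

    def _add_to_garbagecollector(table_name):
        # Record a persisted intermediate table so it can be dropped later.
        _tables_to_drop.append(table_name)

    def _cleanup():
        for name in _tables_to_drop:
            print(f"DROP TABLE {name};")  # stand-in for executing the DROP

    atexit.register(_cleanup)
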

@@ -400,6 +409,9 @@
             target_columns=columns_to_drop_rows
             replacement_strategy = "DELETE"
             self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            self._display_msg(msg="Sample of training dataset after removing outlier rows:",
+                              data=self.train_df,
+                              progress_bar=self.progress_bar)
 
         # Imputing Median value in place of outliers
         if len(columns_to_impute) != 0:

@@ -409,6 +421,13 @@
             target_columns=columns_to_impute
             replacement_strategy = "MEDIAN"
             self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            self._display_msg(msg="Sample of training dataset after performing MEDIAN inplace:",
+                              data=self.train_df,
+                              progress_bar=self.progress_bar)
+
+        if len(columns_to_drop_rows) == 0 and len(columns_to_impute) == 0:
+            self._display_msg(msg='Analysis indicates not outlier in the dataset. No Action Taken.',
+                              progress_bar=self.progress_bar)
 
         end_time = time.time()
         self._display_msg("Time Taken by Outlier processing: {:.2f} sec ".format(end_time - start_time),

@@ -557,10 +576,6 @@
                           progress_bar=self.progress_bar,
                           show_data=True)
 
-        if self.is_classification_type():
-            train_df[self.target_column] = train_df[self.target_column].astype('int')
-            test_df[self.target_column] = test_df[self.target_column].astype('int')
-
         # Pushing the data in database
         self.copy_dataframe_to_sql(train_df, test_df, 'pca')
 

@@ -590,7 +605,7 @@
         # Required imports for RFE
         from sklearn.feature_selection import RFECV
         from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
-        from sklearn.model_selection import StratifiedKFold
+        from sklearn.model_selection import StratifiedKFold
 
         start_time = time.time()
         # Regression

@@ -606,9 +621,9 @@
         score = 'r2' if not self.is_classification_type() \
             else 'roc_auc' if self.data.drop_duplicate(self.target_column).size == 2 else 'f1_macro'
 
-        # Instantiate StratifiedKFold with shuffling for classification
+        # # Instantiate StratifiedKFold with shuffling for classification
         cv = folds if not self.is_classification_type() \
-            else StratifiedKFold(n_splits=folds, shuffle=
+            else StratifiedKFold(n_splits=folds, shuffle=False)
 
         # Define the RFE with cross-validation
         rfecv = RFECV(rf, cv=cv, scoring=score)
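
This hunk switches recursive feature elimination from shuffled to unshuffled folds, so the selected feature subset no longer varies between runs. A self-contained sketch of the same wiring on synthetic data (the dataset and estimator here are placeholders, not the package's own):

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import RFECV
    from sklearn.model_selection import StratifiedKFold

    X, y = make_classification(n_samples=200, n_features=10, random_state=42)

    # shuffle=False keeps fold membership fixed across runs, making the
    # selected feature mask reproducible without a random_state on the CV.
    cv = StratifiedKFold(n_splits=5, shuffle=False)
    rfecv = RFECV(RandomForestClassifier(random_state=42), cv=cv, scoring="f1_macro")
    rfecv.fit(X, y)
    print(rfecv.support_)
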

@@ -682,7 +697,8 @@
         from sklearn.model_selection import GridSearchCV
         from sklearn.linear_model import Lasso
         from sklearn.linear_model import LogisticRegression
-
+        from sklearn.model_selection import StratifiedKFold
+
         # Getting the value k in k-fold cross-validation
         num_folds = self._num_of_folds(train.shape[0])
 

@@ -696,15 +712,21 @@
                 scoring_metric = 'roc_auc'
             else:
                 scoring_metric = 'f1_macro'
-            estimator = LogisticRegression(
+            estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=42)
             parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
         else:
-            estimator = Lasso()
+            estimator = Lasso(random_state=42)
             parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
             scoring_metric = "r2"
 
+        if self.is_classification_type():
+            cv = StratifiedKFold(n_splits=5, shuffle=False)
+        else:
+            cv = num_folds
+
         # Applying hyperparameter tuning and optimizing score
-        hyperparameter_search = GridSearchCV(estimator, parameters, cv=
+        hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
+                                             scoring=scoring_metric, verbose=0)
 
         # Fitting the best result from hyperparameter
         hyperparameter_search.fit(train_features, train_target)
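
The Lasso feature-selection path now seeds both estimators and hands GridSearchCV an explicit cv object (deterministic StratifiedKFold for classification, a plain fold count otherwise), so the chosen hyperparameters are stable from run to run. A runnable sketch of the same wiring on synthetic data (data, grid, and names here are illustrative):

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV, StratifiedKFold

    X, y = make_classification(n_samples=300, n_features=8, random_state=42)

    estimator = LogisticRegression(solver="saga", penalty="l2",
                                   random_state=42, max_iter=500)
    parameters = {"C": [0.001, 0.01, 0.1, 1, 10]}

    # Deterministic folds plus a seeded estimator pin down the search result.
    cv = StratifiedKFold(n_splits=5, shuffle=False)
    search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
                          scoring="roc_auc", verbose=0)
    search.fit(X, y)
    print(search.best_params_)
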

@@ -775,8 +797,12 @@
         self.table_name_mapping['{}_test'.format(prefix)] = test_table_name
 
         # Pushing data into database
-
-
+        if self.is_classification_type():
+            copy_to_sql(df=train, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+            copy_to_sql(df=test, table_name=test_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+        else:
+            copy_to_sql(df=train, table_name=train_table_name, if_exists="replace")
+            copy_to_sql(df=test, table_name=test_table_name, if_exists="replace")
 
 
 
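
Together with the deletions in the next two hunks, this replaces client-side casting of the target column (the old `_bigint_to_int` pandas round trip) with a type override applied while the data is written: `copy_to_sql` accepts a `types` mapping, so the label lands in Teradata as INTEGER instead of the inferred BIGINT. A usage sketch (DataFrame, table, and column names are hypothetical):

    from teradataml import copy_to_sql, INTEGER

    # 'train_pdf' holds the prepared split and 'target' is its label column.
    copy_to_sql(df=train_pdf, table_name="automl_train",
                if_exists="replace", types={"target": INTEGER})
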

@@ -901,9 +927,6 @@
         else:
             self._display_msg(msg="No columns to scale.",
                               progress_bar=self.progress_bar)
-
-        if self.is_classification_type():
-            train, test = self._bigint_to_int(train, test)
 
         self.copy_dataframe_to_sql(train, test, feature_selection_mtd)
 

@@ -911,15 +934,6 @@
         self._display_msg(msg="Total time taken by feature scaling: {:.2f} sec".format( end_time - start_time),
                           progress_bar=self.progress_bar,
                           show_data=True)
-
-    def _bigint_to_int(self, train, test):
-        tr = train.to_pandas()
-        tr[self.target_column] = tr[self.target_column].astype('int')
-
-        ts = test.to_pandas()
-        ts[self.target_column] = ts[self.target_column].astype('int')
-
-        return tr, ts
 
     def _set_custom_scaling_method(self):
         """

@@ -987,7 +1001,11 @@
                       "precision_digit" : 4,
                       "accumulate" : accumulate_columns,
                       "persist" : True}
-
-
-
-
+
+        transform_output = RoundColumns(**fit_params).result
+        # Adding transformed data containing table to garbage collector
+        GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+        cols = transform_output.columns
+        df = transform_output.to_pandas().reset_index()
+        df = df[cols]
+        return df
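
The rewritten tail persists the RoundColumns result, registers the backing table for cleanup, and then materializes it locally. The `reset_index()` plus column reselection handles the fact that converting a teradataml DataFrame to pandas moves the index column out of the regular columns. A pandas-only sketch of that last step (values are made up):

    import pandas as pd

    # Stand-in for transform_output.to_pandas(): the id column became the index.
    result = pd.DataFrame({"id": [2, 1], "val": [0.12, 0.98]}).set_index("id")

    cols = ["id", "val"]          # column order captured before conversion
    df = result.reset_index()     # bring the index back as a column
    df = df[cols]                 # restore the original column ordering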