teradataml 17.20.0.6__py3-none-any.whl → 20.0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +238 -1
- teradataml/__init__.py +13 -3
- teradataml/_version.py +1 -1
- teradataml/analytics/Transformations.py +4 -4
- teradataml/analytics/__init__.py +0 -2
- teradataml/analytics/analytic_function_executor.py +3 -0
- teradataml/analytics/json_parser/utils.py +13 -12
- teradataml/analytics/sqle/DecisionTreePredict.py +15 -30
- teradataml/analytics/sqle/NaiveBayesPredict.py +11 -20
- teradataml/analytics/sqle/__init__.py +0 -13
- teradataml/analytics/utils.py +1 -0
- teradataml/analytics/valib.py +3 -0
- teradataml/automl/__init__.py +1628 -0
- teradataml/automl/custom_json_utils.py +1270 -0
- teradataml/automl/data_preparation.py +993 -0
- teradataml/automl/data_transformation.py +727 -0
- teradataml/automl/feature_engineering.py +1648 -0
- teradataml/automl/feature_exploration.py +547 -0
- teradataml/automl/model_evaluation.py +163 -0
- teradataml/automl/model_training.py +887 -0
- teradataml/catalog/__init__.py +0 -2
- teradataml/catalog/byom.py +49 -6
- teradataml/catalog/function_argument_mapper.py +0 -2
- teradataml/catalog/model_cataloging_utils.py +2 -1021
- teradataml/common/aed_utils.py +6 -2
- teradataml/common/constants.py +50 -58
- teradataml/common/deprecations.py +160 -0
- teradataml/common/garbagecollector.py +61 -104
- teradataml/common/messagecodes.py +27 -36
- teradataml/common/messages.py +11 -15
- teradataml/common/utils.py +205 -287
- teradataml/common/wrapper_utils.py +1 -110
- teradataml/context/context.py +150 -78
- teradataml/data/bank_churn.csv +10001 -0
- teradataml/data/bmi.csv +501 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeFit.py +3 -3
- teradataml/data/docs/sqle/docs_17_10/BincodeTransform.py +6 -5
- teradataml/data/docs/sqle/docs_17_10/Fit.py +1 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +1 -1
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +1 -1
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesTransform.py +2 -2
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeTransform.py +2 -1
- teradataml/data/docs/sqle/docs_17_10/ScaleTransform.py +1 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeTransform.py +1 -1
- teradataml/data/docs/sqle/docs_17_10/Transform.py +2 -1
- teradataml/data/docs/sqle/docs_17_20/BincodeFit.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/BincodeTransform.py +6 -5
- teradataml/data/docs/sqle/docs_17_20/Fit.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/GLM.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/GLMPredictPerSegment.py +9 -10
- teradataml/data/docs/sqle/docs_17_20/KMeansPredict.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierPredict.py +16 -15
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierTrainer.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineTransform.py +8 -8
- teradataml/data/docs/sqle/docs_17_20/OneClassSVMPredict.py +21 -20
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterTransform.py +8 -3
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesTransform.py +6 -5
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionTransform.py +6 -6
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeTransform.py +2 -1
- teradataml/data/docs/sqle/docs_17_20/SVM.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +16 -16
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +1 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeTransform.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/TDDecisionForestPredict.py +4 -4
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +19 -19
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingTransform.py +5 -4
- teradataml/data/docs/sqle/docs_17_20/Transform.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +9 -9
- teradataml/data/fish.csv +160 -0
- teradataml/data/glass_types.csv +215 -0
- teradataml/data/insurance.csv +1 -1
- teradataml/data/iris_data.csv +151 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeTransform.json +1 -0
- teradataml/data/load_example_data.py +3 -0
- teradataml/data/multi_model_classification.csv +401 -0
- teradataml/data/multi_model_regression.csv +401 -0
- teradataml/data/openml_example.json +63 -0
- teradataml/data/scripts/deploy_script.py +65 -0
- teradataml/data/scripts/mapper.R +20 -0
- teradataml/data/scripts/sklearn/__init__.py +0 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +175 -0
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +135 -0
- teradataml/data/scripts/sklearn/sklearn_function.template +113 -0
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +158 -0
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +152 -0
- teradataml/data/scripts/sklearn/sklearn_score.py +128 -0
- teradataml/data/scripts/sklearn/sklearn_transform.py +179 -0
- teradataml/data/templates/open_source_ml.json +9 -0
- teradataml/data/teradataml_example.json +73 -1
- teradataml/data/test_classification.csv +101 -0
- teradataml/data/test_prediction.csv +101 -0
- teradataml/data/test_regression.csv +101 -0
- teradataml/data/train_multiclass.csv +101 -0
- teradataml/data/train_regression.csv +101 -0
- teradataml/data/train_regression_multiple_labels.csv +101 -0
- teradataml/data/wine_data.csv +1600 -0
- teradataml/dataframe/copy_to.py +79 -13
- teradataml/dataframe/data_transfer.py +8 -0
- teradataml/dataframe/dataframe.py +910 -311
- teradataml/dataframe/dataframe_utils.py +102 -5
- teradataml/dataframe/fastload.py +11 -3
- teradataml/dataframe/setop.py +15 -2
- teradataml/dataframe/sql.py +3735 -77
- teradataml/dataframe/sql_function_parameters.py +56 -5
- teradataml/dataframe/vantage_function_types.py +45 -1
- teradataml/dataframe/window.py +30 -29
- teradataml/dbutils/dbutils.py +18 -1
- teradataml/geospatial/geodataframe.py +18 -7
- teradataml/geospatial/geodataframecolumn.py +5 -0
- teradataml/hyperparameter_tuner/optimizer.py +910 -120
- teradataml/hyperparameter_tuner/utils.py +131 -37
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/__init__.py +1 -0
- teradataml/opensource/sklearn/__init__.py +1 -0
- teradataml/opensource/sklearn/_class.py +255 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +1668 -0
- teradataml/opensource/sklearn/_wrapper_utils.py +268 -0
- teradataml/opensource/sklearn/constants.py +54 -0
- teradataml/options/__init__.py +3 -6
- teradataml/options/configure.py +21 -20
- teradataml/scriptmgmt/UserEnv.py +61 -5
- teradataml/scriptmgmt/lls_utils.py +135 -53
- teradataml/table_operators/Apply.py +38 -6
- teradataml/table_operators/Script.py +45 -308
- teradataml/table_operators/TableOperator.py +182 -591
- teradataml/table_operators/__init__.py +0 -1
- teradataml/table_operators/table_operator_util.py +32 -40
- teradataml/utils/validators.py +127 -3
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/METADATA +243 -3
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/RECORD +147 -391
- teradataml/analytics/mle/AdaBoost.py +0 -651
- teradataml/analytics/mle/AdaBoostPredict.py +0 -564
- teradataml/analytics/mle/Antiselect.py +0 -342
- teradataml/analytics/mle/Arima.py +0 -641
- teradataml/analytics/mle/ArimaPredict.py +0 -477
- teradataml/analytics/mle/Attribution.py +0 -1070
- teradataml/analytics/mle/Betweenness.py +0 -658
- teradataml/analytics/mle/Burst.py +0 -711
- teradataml/analytics/mle/CCM.py +0 -600
- teradataml/analytics/mle/CCMPrepare.py +0 -324
- teradataml/analytics/mle/CFilter.py +0 -460
- teradataml/analytics/mle/ChangePointDetection.py +0 -572
- teradataml/analytics/mle/ChangePointDetectionRT.py +0 -477
- teradataml/analytics/mle/Closeness.py +0 -737
- teradataml/analytics/mle/ConfusionMatrix.py +0 -420
- teradataml/analytics/mle/Correlation.py +0 -477
- teradataml/analytics/mle/Correlation2.py +0 -573
- teradataml/analytics/mle/CoxHazardRatio.py +0 -679
- teradataml/analytics/mle/CoxPH.py +0 -556
- teradataml/analytics/mle/CoxSurvival.py +0 -478
- teradataml/analytics/mle/CumulativeMovAvg.py +0 -363
- teradataml/analytics/mle/DTW.py +0 -623
- teradataml/analytics/mle/DWT.py +0 -564
- teradataml/analytics/mle/DWT2D.py +0 -599
- teradataml/analytics/mle/DecisionForest.py +0 -716
- teradataml/analytics/mle/DecisionForestEvaluator.py +0 -363
- teradataml/analytics/mle/DecisionForestPredict.py +0 -561
- teradataml/analytics/mle/DecisionTree.py +0 -830
- teradataml/analytics/mle/DecisionTreePredict.py +0 -528
- teradataml/analytics/mle/ExponentialMovAvg.py +0 -418
- teradataml/analytics/mle/FMeasure.py +0 -402
- teradataml/analytics/mle/FPGrowth.py +0 -734
- teradataml/analytics/mle/FrequentPaths.py +0 -695
- teradataml/analytics/mle/GLM.py +0 -558
- teradataml/analytics/mle/GLML1L2.py +0 -547
- teradataml/analytics/mle/GLML1L2Predict.py +0 -519
- teradataml/analytics/mle/GLMPredict.py +0 -529
- teradataml/analytics/mle/HMMDecoder.py +0 -945
- teradataml/analytics/mle/HMMEvaluator.py +0 -901
- teradataml/analytics/mle/HMMSupervised.py +0 -521
- teradataml/analytics/mle/HMMUnsupervised.py +0 -572
- teradataml/analytics/mle/Histogram.py +0 -561
- teradataml/analytics/mle/IDWT.py +0 -476
- teradataml/analytics/mle/IDWT2D.py +0 -493
- teradataml/analytics/mle/IdentityMatch.py +0 -763
- teradataml/analytics/mle/Interpolator.py +0 -918
- teradataml/analytics/mle/KMeans.py +0 -485
- teradataml/analytics/mle/KNN.py +0 -627
- teradataml/analytics/mle/KNNRecommender.py +0 -488
- teradataml/analytics/mle/KNNRecommenderPredict.py +0 -581
- teradataml/analytics/mle/LAR.py +0 -439
- teradataml/analytics/mle/LARPredict.py +0 -478
- teradataml/analytics/mle/LDA.py +0 -548
- teradataml/analytics/mle/LDAInference.py +0 -492
- teradataml/analytics/mle/LDATopicSummary.py +0 -464
- teradataml/analytics/mle/LevenshteinDistance.py +0 -450
- teradataml/analytics/mle/LinReg.py +0 -433
- teradataml/analytics/mle/LinRegPredict.py +0 -438
- teradataml/analytics/mle/MinHash.py +0 -544
- teradataml/analytics/mle/Modularity.py +0 -587
- teradataml/analytics/mle/NEREvaluator.py +0 -410
- teradataml/analytics/mle/NERExtractor.py +0 -595
- teradataml/analytics/mle/NERTrainer.py +0 -458
- teradataml/analytics/mle/NGrams.py +0 -570
- teradataml/analytics/mle/NPath.py +0 -634
- teradataml/analytics/mle/NTree.py +0 -549
- teradataml/analytics/mle/NaiveBayes.py +0 -462
- teradataml/analytics/mle/NaiveBayesPredict.py +0 -513
- teradataml/analytics/mle/NaiveBayesTextClassifier.py +0 -607
- teradataml/analytics/mle/NaiveBayesTextClassifier2.py +0 -531
- teradataml/analytics/mle/NaiveBayesTextClassifierPredict.py +0 -799
- teradataml/analytics/mle/NamedEntityFinder.py +0 -529
- teradataml/analytics/mle/NamedEntityFinderEvaluator.py +0 -414
- teradataml/analytics/mle/NamedEntityFinderTrainer.py +0 -396
- teradataml/analytics/mle/POSTagger.py +0 -417
- teradataml/analytics/mle/Pack.py +0 -411
- teradataml/analytics/mle/PageRank.py +0 -535
- teradataml/analytics/mle/PathAnalyzer.py +0 -426
- teradataml/analytics/mle/PathGenerator.py +0 -367
- teradataml/analytics/mle/PathStart.py +0 -464
- teradataml/analytics/mle/PathSummarizer.py +0 -470
- teradataml/analytics/mle/Pivot.py +0 -471
- teradataml/analytics/mle/ROC.py +0 -425
- teradataml/analytics/mle/RandomSample.py +0 -637
- teradataml/analytics/mle/RandomWalkSample.py +0 -490
- teradataml/analytics/mle/SAX.py +0 -779
- teradataml/analytics/mle/SVMDense.py +0 -677
- teradataml/analytics/mle/SVMDensePredict.py +0 -536
- teradataml/analytics/mle/SVMDenseSummary.py +0 -437
- teradataml/analytics/mle/SVMSparse.py +0 -557
- teradataml/analytics/mle/SVMSparsePredict.py +0 -553
- teradataml/analytics/mle/SVMSparseSummary.py +0 -435
- teradataml/analytics/mle/Sampling.py +0 -549
- teradataml/analytics/mle/Scale.py +0 -565
- teradataml/analytics/mle/ScaleByPartition.py +0 -496
- teradataml/analytics/mle/ScaleMap.py +0 -378
- teradataml/analytics/mle/ScaleSummary.py +0 -320
- teradataml/analytics/mle/SentenceExtractor.py +0 -363
- teradataml/analytics/mle/SentimentEvaluator.py +0 -432
- teradataml/analytics/mle/SentimentExtractor.py +0 -578
- teradataml/analytics/mle/SentimentTrainer.py +0 -405
- teradataml/analytics/mle/SeriesSplitter.py +0 -641
- teradataml/analytics/mle/Sessionize.py +0 -475
- teradataml/analytics/mle/SimpleMovAvg.py +0 -397
- teradataml/analytics/mle/StringSimilarity.py +0 -425
- teradataml/analytics/mle/TF.py +0 -389
- teradataml/analytics/mle/TFIDF.py +0 -504
- teradataml/analytics/mle/TextChunker.py +0 -414
- teradataml/analytics/mle/TextClassifier.py +0 -399
- teradataml/analytics/mle/TextClassifierEvaluator.py +0 -413
- teradataml/analytics/mle/TextClassifierTrainer.py +0 -565
- teradataml/analytics/mle/TextMorph.py +0 -494
- teradataml/analytics/mle/TextParser.py +0 -623
- teradataml/analytics/mle/TextTagger.py +0 -530
- teradataml/analytics/mle/TextTokenizer.py +0 -502
- teradataml/analytics/mle/UnivariateStatistics.py +0 -488
- teradataml/analytics/mle/Unpack.py +0 -526
- teradataml/analytics/mle/Unpivot.py +0 -438
- teradataml/analytics/mle/VarMax.py +0 -776
- teradataml/analytics/mle/VectorDistance.py +0 -762
- teradataml/analytics/mle/WeightedMovAvg.py +0 -400
- teradataml/analytics/mle/XGBoost.py +0 -842
- teradataml/analytics/mle/XGBoostPredict.py +0 -627
- teradataml/analytics/mle/__init__.py +0 -123
- teradataml/analytics/mle/json/adaboost_mle.json +0 -135
- teradataml/analytics/mle/json/adaboostpredict_mle.json +0 -85
- teradataml/analytics/mle/json/antiselect_mle.json +0 -34
- teradataml/analytics/mle/json/antiselect_mle_mle.json +0 -34
- teradataml/analytics/mle/json/arima_mle.json +0 -172
- teradataml/analytics/mle/json/arimapredict_mle.json +0 -52
- teradataml/analytics/mle/json/attribution_mle_mle.json +0 -143
- teradataml/analytics/mle/json/betweenness_mle.json +0 -97
- teradataml/analytics/mle/json/burst_mle.json +0 -140
- teradataml/analytics/mle/json/ccm_mle.json +0 -124
- teradataml/analytics/mle/json/ccmprepare_mle.json +0 -14
- teradataml/analytics/mle/json/cfilter_mle.json +0 -93
- teradataml/analytics/mle/json/changepointdetection_mle.json +0 -92
- teradataml/analytics/mle/json/changepointdetectionrt_mle.json +0 -78
- teradataml/analytics/mle/json/closeness_mle.json +0 -104
- teradataml/analytics/mle/json/confusionmatrix_mle.json +0 -79
- teradataml/analytics/mle/json/correlation_mle.json +0 -86
- teradataml/analytics/mle/json/correlationreduce_mle.json +0 -49
- teradataml/analytics/mle/json/coxhazardratio_mle.json +0 -89
- teradataml/analytics/mle/json/coxph_mle.json +0 -98
- teradataml/analytics/mle/json/coxsurvival_mle.json +0 -79
- teradataml/analytics/mle/json/cumulativemovavg_mle.json +0 -34
- teradataml/analytics/mle/json/decisionforest_mle.json +0 -167
- teradataml/analytics/mle/json/decisionforestevaluator_mle.json +0 -33
- teradataml/analytics/mle/json/decisionforestpredict_mle_mle.json +0 -74
- teradataml/analytics/mle/json/decisiontree_mle.json +0 -194
- teradataml/analytics/mle/json/decisiontreepredict_mle_mle.json +0 -86
- teradataml/analytics/mle/json/dtw_mle.json +0 -97
- teradataml/analytics/mle/json/dwt2d_mle.json +0 -116
- teradataml/analytics/mle/json/dwt_mle.json +0 -101
- teradataml/analytics/mle/json/exponentialmovavg_mle.json +0 -55
- teradataml/analytics/mle/json/fmeasure_mle.json +0 -58
- teradataml/analytics/mle/json/fpgrowth_mle.json +0 -159
- teradataml/analytics/mle/json/frequentpaths_mle.json +0 -129
- teradataml/analytics/mle/json/glm_mle.json +0 -111
- teradataml/analytics/mle/json/glml1l2_mle.json +0 -106
- teradataml/analytics/mle/json/glml1l2predict_mle.json +0 -57
- teradataml/analytics/mle/json/glmpredict_mle_mle.json +0 -74
- teradataml/analytics/mle/json/histogram_mle.json +0 -100
- teradataml/analytics/mle/json/hmmdecoder_mle.json +0 -192
- teradataml/analytics/mle/json/hmmevaluator_mle.json +0 -206
- teradataml/analytics/mle/json/hmmsupervised_mle.json +0 -91
- teradataml/analytics/mle/json/hmmunsupervised_mle.json +0 -114
- teradataml/analytics/mle/json/identitymatch_mle.json +0 -88
- teradataml/analytics/mle/json/idwt2d_mle.json +0 -73
- teradataml/analytics/mle/json/idwt_mle.json +0 -66
- teradataml/analytics/mle/json/interpolator_mle.json +0 -151
- teradataml/analytics/mle/json/kmeans_mle.json +0 -97
- teradataml/analytics/mle/json/knn_mle.json +0 -141
- teradataml/analytics/mle/json/knnrecommender_mle.json +0 -111
- teradataml/analytics/mle/json/knnrecommenderpredict_mle.json +0 -75
- teradataml/analytics/mle/json/lar_mle.json +0 -78
- teradataml/analytics/mle/json/larpredict_mle.json +0 -69
- teradataml/analytics/mle/json/lda_mle.json +0 -130
- teradataml/analytics/mle/json/ldainference_mle.json +0 -78
- teradataml/analytics/mle/json/ldatopicsummary_mle.json +0 -64
- teradataml/analytics/mle/json/levenshteindistance_mle.json +0 -92
- teradataml/analytics/mle/json/linreg_mle.json +0 -42
- teradataml/analytics/mle/json/linregpredict_mle.json +0 -56
- teradataml/analytics/mle/json/minhash_mle.json +0 -113
- teradataml/analytics/mle/json/modularity_mle.json +0 -91
- teradataml/analytics/mle/json/naivebayespredict_mle_mle.json +0 -85
- teradataml/analytics/mle/json/naivebayesreduce_mle.json +0 -52
- teradataml/analytics/mle/json/naivebayestextclassifierpredict_mle_mle.json +0 -147
- teradataml/analytics/mle/json/naivebayestextclassifiertrainer2_mle.json +0 -108
- teradataml/analytics/mle/json/naivebayestextclassifiertrainer_mle.json +0 -102
- teradataml/analytics/mle/json/namedentityfinder_mle.json +0 -84
- teradataml/analytics/mle/json/namedentityfinderevaluatorreduce_mle.json +0 -43
- teradataml/analytics/mle/json/namedentityfindertrainer_mle.json +0 -64
- teradataml/analytics/mle/json/nerevaluator_mle.json +0 -54
- teradataml/analytics/mle/json/nerextractor_mle.json +0 -87
- teradataml/analytics/mle/json/nertrainer_mle.json +0 -89
- teradataml/analytics/mle/json/ngrams_mle.json +0 -137
- teradataml/analytics/mle/json/ngramsplitter_mle_mle.json +0 -137
- teradataml/analytics/mle/json/npath@coprocessor_mle.json +0 -73
- teradataml/analytics/mle/json/ntree@coprocessor_mle.json +0 -123
- teradataml/analytics/mle/json/pack_mle.json +0 -58
- teradataml/analytics/mle/json/pack_mle_mle.json +0 -58
- teradataml/analytics/mle/json/pagerank_mle.json +0 -81
- teradataml/analytics/mle/json/pathanalyzer_mle.json +0 -63
- teradataml/analytics/mle/json/pathgenerator_mle.json +0 -40
- teradataml/analytics/mle/json/pathstart_mle.json +0 -62
- teradataml/analytics/mle/json/pathsummarizer_mle.json +0 -72
- teradataml/analytics/mle/json/pivoting_mle.json +0 -71
- teradataml/analytics/mle/json/postagger_mle.json +0 -51
- teradataml/analytics/mle/json/randomsample_mle.json +0 -131
- teradataml/analytics/mle/json/randomwalksample_mle.json +0 -85
- teradataml/analytics/mle/json/roc_mle.json +0 -73
- teradataml/analytics/mle/json/sampling_mle.json +0 -75
- teradataml/analytics/mle/json/sax_mle.json +0 -154
- teradataml/analytics/mle/json/scale_mle.json +0 -93
- teradataml/analytics/mle/json/scalebypartition_mle.json +0 -89
- teradataml/analytics/mle/json/scalemap_mle.json +0 -44
- teradataml/analytics/mle/json/scalesummary_mle.json +0 -14
- teradataml/analytics/mle/json/sentenceextractor_mle.json +0 -41
- teradataml/analytics/mle/json/sentimentevaluator_mle.json +0 -43
- teradataml/analytics/mle/json/sentimentextractor_mle.json +0 -100
- teradataml/analytics/mle/json/sentimenttrainer_mle.json +0 -68
- teradataml/analytics/mle/json/seriessplitter_mle.json +0 -133
- teradataml/analytics/mle/json/sessionize_mle_mle.json +0 -62
- teradataml/analytics/mle/json/simplemovavg_mle.json +0 -48
- teradataml/analytics/mle/json/stringsimilarity_mle.json +0 -50
- teradataml/analytics/mle/json/stringsimilarity_mle_mle.json +0 -50
- teradataml/analytics/mle/json/svmdense_mle.json +0 -165
- teradataml/analytics/mle/json/svmdensepredict_mle.json +0 -95
- teradataml/analytics/mle/json/svmdensesummary_mle.json +0 -58
- teradataml/analytics/mle/json/svmsparse_mle.json +0 -148
- teradataml/analytics/mle/json/svmsparsepredict_mle_mle.json +0 -103
- teradataml/analytics/mle/json/svmsparsesummary_mle.json +0 -57
- teradataml/analytics/mle/json/textchunker_mle.json +0 -40
- teradataml/analytics/mle/json/textclassifier_mle.json +0 -51
- teradataml/analytics/mle/json/textclassifierevaluator_mle.json +0 -43
- teradataml/analytics/mle/json/textclassifiertrainer_mle.json +0 -103
- teradataml/analytics/mle/json/textmorph_mle.json +0 -63
- teradataml/analytics/mle/json/textparser_mle.json +0 -166
- teradataml/analytics/mle/json/texttagger_mle.json +0 -81
- teradataml/analytics/mle/json/texttokenizer_mle.json +0 -91
- teradataml/analytics/mle/json/tf_mle.json +0 -33
- teradataml/analytics/mle/json/tfidf_mle.json +0 -34
- teradataml/analytics/mle/json/univariatestatistics_mle.json +0 -81
- teradataml/analytics/mle/json/unpack_mle.json +0 -91
- teradataml/analytics/mle/json/unpack_mle_mle.json +0 -91
- teradataml/analytics/mle/json/unpivoting_mle.json +0 -63
- teradataml/analytics/mle/json/varmax_mle.json +0 -176
- teradataml/analytics/mle/json/vectordistance_mle.json +0 -179
- teradataml/analytics/mle/json/weightedmovavg_mle.json +0 -48
- teradataml/analytics/mle/json/xgboost_mle.json +0 -178
- teradataml/analytics/mle/json/xgboostpredict_mle.json +0 -104
- teradataml/analytics/sqle/Antiselect.py +0 -321
- teradataml/analytics/sqle/Attribution.py +0 -603
- teradataml/analytics/sqle/DecisionForestPredict.py +0 -408
- teradataml/analytics/sqle/GLMPredict.py +0 -430
- teradataml/analytics/sqle/MovingAverage.py +0 -543
- teradataml/analytics/sqle/NGramSplitter.py +0 -548
- teradataml/analytics/sqle/NPath.py +0 -632
- teradataml/analytics/sqle/NaiveBayesTextClassifierPredict.py +0 -515
- teradataml/analytics/sqle/Pack.py +0 -388
- teradataml/analytics/sqle/SVMSparsePredict.py +0 -464
- teradataml/analytics/sqle/Sessionize.py +0 -390
- teradataml/analytics/sqle/StringSimilarity.py +0 -400
- teradataml/analytics/sqle/Unpack.py +0 -503
- teradataml/analytics/sqle/json/antiselect_sqle.json +0 -21
- teradataml/analytics/sqle/json/attribution_sqle.json +0 -92
- teradataml/analytics/sqle/json/decisionforestpredict_sqle.json +0 -48
- teradataml/analytics/sqle/json/glmpredict_sqle.json +0 -48
- teradataml/analytics/sqle/json/h2opredict_sqle.json +0 -63
- teradataml/analytics/sqle/json/movingaverage_sqle.json +0 -58
- teradataml/analytics/sqle/json/naivebayestextclassifierpredict_sqle.json +0 -76
- teradataml/analytics/sqle/json/ngramsplitter_sqle.json +0 -126
- teradataml/analytics/sqle/json/npath_sqle.json +0 -67
- teradataml/analytics/sqle/json/pack_sqle.json +0 -47
- teradataml/analytics/sqle/json/pmmlpredict_sqle.json +0 -55
- teradataml/analytics/sqle/json/sessionize_sqle.json +0 -43
- teradataml/analytics/sqle/json/stringsimilarity_sqle.json +0 -39
- teradataml/analytics/sqle/json/svmsparsepredict_sqle.json +0 -74
- teradataml/analytics/sqle/json/unpack_sqle.json +0 -80
- teradataml/catalog/model_cataloging.py +0 -980
- teradataml/config/mlengine_alias_definitions_v1.0 +0 -118
- teradataml/config/mlengine_alias_definitions_v1.1 +0 -127
- teradataml/config/mlengine_alias_definitions_v1.3 +0 -129
- teradataml/table_operators/sandbox_container_util.py +0 -643
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/WHEEL +0 -0
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/top_level.txt +0 -0
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/zip-safe +0 -0
|
@@ -0,0 +1,1628 @@
|
|
|
1
|
+
# ##################################################################
|
|
2
|
+
#
|
|
3
|
+
# Copyright 2024 Teradata. All rights reserved.
|
|
4
|
+
# TERADATA CONFIDENTIAL AND TRADE SECRET
|
|
5
|
+
#
|
|
6
|
+
# Primary Owner: Sweta Shaw
|
|
7
|
+
# Email Id: Sweta.Shaw@Teradata.com
|
|
8
|
+
#
|
|
9
|
+
# Secondary Owner: Akhil Bisht
|
|
10
|
+
# Email Id: AKHIL.BISHT@Teradata.com
|
|
11
|
+
#
|
|
12
|
+
# Version: 1.1
|
|
13
|
+
# Function Version: 1.0
|
|
14
|
+
# ##################################################################
|
|
15
|
+
|
|
16
|
+
# Python libraries
|
|
17
|
+
import json
|
|
18
|
+
import numpy as np
|
|
19
|
+
from sklearn.metrics import confusion_matrix
|
|
20
|
+
import time
|
|
21
|
+
|
|
22
|
+
# Teradata libraries
|
|
23
|
+
from teradataml.dataframe.copy_to import copy_to_sql
|
|
24
|
+
from teradataml import ColumnExpression
|
|
25
|
+
from teradataml.dataframe.dataframe import DataFrame
|
|
26
|
+
from teradataml.utils.validators import _Validators
|
|
27
|
+
from teradataml import ROC
|
|
28
|
+
from teradataml.common.utils import UtilFuncs
|
|
29
|
+
from teradataml.utils.dtypes import _Dtypes
|
|
30
|
+
from teradataml.common.utils import UtilFuncs
|
|
31
|
+
from teradataml import TeradataMlException
|
|
32
|
+
from teradataml.common.messages import Messages, MessageCodes
|
|
33
|
+
|
|
34
|
+
# AutoML Internal libraries
|
|
35
|
+
from teradataml.automl.data_preparation import _DataPreparation
|
|
36
|
+
from teradataml.automl.feature_engineering import _FeatureEngineering
|
|
37
|
+
from teradataml.automl.feature_exploration import _FeatureExplore, _is_terminal
|
|
38
|
+
from teradataml.automl.model_evaluation import _ModelEvaluator
|
|
39
|
+
from teradataml.automl.model_training import _ModelTraining
|
|
40
|
+
from teradataml.automl.data_transformation import _DataTransformation
|
|
41
|
+
from teradataml.automl.custom_json_utils import _GenerateCustomJson
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class AutoML:
|
|
45
|
+
|
|
46
|
+
def __init__(self,
             task_type = "Default",
             include = None,
             exclude = None,
             verbose = 0,
             max_runtime_secs = None,
             stopping_metric = None,
             stopping_tolerance = None,
             custom_config_file = None):
    """
    DESCRIPTION:
        AutoML (Automated Machine Learning) is an approach that automates the process
        of building, training, and validating machine learning models. It involves
        various algorithms to automate various aspects of the machine learning workflow,
        such as data preparation, feature engineering, model selection, hyperparameter
        tuning, and model deployment. It aims to simplify the process of building
        machine learning models, by automating some of the more time-consuming
        and labor-intensive tasks involved in the process.

        AutoML is designed to handle both regression and classification (binary and
        multiclass) tasks. User can specify the task type whether to apply
        regression OR classification algorithm on the provided dataset. By default, AutoML
        decides the task type.

        AutoML by default, trains using all model algorithms applicable for the
        task type problem. For example, "glm" and "svm" do not support multi-class
        classification problem. Thus, only 3 models are available to train in case
        of multi-class classification problem, by default. While for regression and
        binary classification problem, all 5 models i.e., "glm", "svm", "knn",
        "decision_forest", "xgboost" are available to train by default.

        AutoML provides functionality to use specific model algorithms for training.
        User can provide either include or exclude model. In case of include,
        only specified models are trained while for exclude, all models except
        specified model are trained.

        AutoML also provides an option to customize the processes within feature
        engineering, data preparation and model training phases. User can customize
        the processes by passing the JSON file path in case of custom run. It also
        supports early stopping of model training based on stopping metrics and
        maximum running time.

    PARAMETERS:
        task_type:
            Optional Argument.
            Specifies the task type for AutoML, whether to apply regression OR classification
            on the provided dataset. If user wants AutoML to decide the task type automatically,
            then it should be set to "Default".
            Default Value: "Default"
            Permitted Values: "Regression", "Classification", "Default"
            Types: str

        include:
            Optional Argument.
            Specifies the model algorithms to be used for model training phase.
            By default, all 5 models are used for training for regression and binary
            classification problem, while only 3 models are used for multi-class.
            Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
            Types: str OR list of str

        exclude:
            Optional Argument.
            Specifies the model algorithms to be excluded from model training phase.
            No model is excluded by default.
            Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
            Types: str OR list of str

        verbose:
            Optional Argument.
            Specifies the detailed execution steps based on verbose level.
            Default Value: 0
            Permitted Values:
                * 0: prints the progress bar and leaderboard
                * 1: prints the execution steps of AutoML.
                * 2: prints the intermediate data between the execution of each step of AutoML.
            Types: int

        max_runtime_secs:
            Optional Argument.
            Specifies the time limit in seconds for model training.
            Types: int OR float

        stopping_metric:
            Required, when "stopping_tolerance" is set, otherwise optional.
            Specifies the stopping metrics for stopping tolerance in model training.
            Permitted Values:
                * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
                                              "RMSE", "RMSLE"
                * For task_type "Classification": 'MICRO-F1','MACRO-F1',
                                                  'MICRO-RECALL','MACRO-RECALL',
                                                  'MICRO-PRECISION', 'MACRO-PRECISION',
                                                  'WEIGHTED-PRECISION','WEIGHTED-RECALL',
                                                  'WEIGHTED-F1', 'ACCURACY'
            Types: str

        stopping_tolerance:
            Required, when "stopping_metric" is set, otherwise optional.
            Specifies the stopping tolerance for stopping metrics in model training.
            Types: float OR int

        custom_config_file:
            Optional Argument.
            Specifies the path of JSON file in case of custom run.
            Types: str

    RETURNS:
        Instance of AutoML.

    RAISES:
        TeradataMlException, TypeError, ValueError

    EXAMPLES:
        # Notes:
        #     1. Get the connection to Vantage to execute the function.
        #     2. One must import the required functions mentioned in
        #        the example from teradataml.
        #     3. Function raises error if not supported on the Vantage
        #        user is connected to.

        # Load the example data.
        >>> load_example_data("GLMPredict", ["admissions_test", "admissions_train"])
        >>> load_example_data("decisionforestpredict", ["housing_train", "housing_test"])
        >>> load_example_data("teradataml", "iris_input")

        # Create teradataml DataFrames.
        >>> admissions_train = DataFrame.from_table("admissions_train")
        >>> admissions_test = DataFrame.from_table("admissions_test")
        >>> housing_train = DataFrame.from_table("housing_train")
        >>> housing_test = DataFrame.from_table("housing_test")
        >>> iris_input = DataFrame.from_table("iris_input")

        # Example 1: Run AutoML for classification problem.
        # Scenario: Predict whether a student will be admitted to a university
        #           based on different factors. Run AutoML to get the best
        #           performing model out of available models.

        # Create an instance of AutoML.
        >>> automl_obj = AutoML(task_type="Classification")

        # Fit the data.
        >>> automl_obj.fit(admissions_train, "admitted")

        # Run predict with best performing model.
        >>> prediction = automl_obj.predict()
        >>> prediction

        # Run predict for new test data with best performing model.
        >>> prediction = automl_obj.predict(admissions_test)
        >>> prediction

        # Run predict for new test data with second best performing model.
        >>> prediction = automl_obj.predict(admissions_test, rank=2)
        >>> prediction

        # Display leaderboard.
        >>> automl_obj.leaderboard()

        # Display best performing model.
        >>> automl_obj.leader()

        # Example 2 : Run AutoML for regression problem.
        # Scenario : Predict the price of house based on different factors.
        #            Run AutoML to get the best performing model using custom
        #            configuration file to customize different processes of
        #            AutoML Run. Use include to specify "xgboost" and
        #            "decision_forest" models to be used for training.

        # Generate custom JSON file
        >>> AutoML.generate_custom_config("custom_housing")

        # Create instance of AutoML.
        >>> automl_obj = AutoML(task_type="Regression",
        >>>                     verbose=1,
        >>>                     include=["decision_forest", "xgboost"],
        >>>                     custom_config_file="custom_housing.json")
        # Fit the data.
        >>> automl_obj.fit(housing_train, "price")

        # Run predict with best performing model.
        >>> prediction = automl_obj.predict()
        >>> prediction

        # Run predict for new test data with best performing model.
        >>> prediction = automl_obj.predict(housing_test)
        >>> prediction

        # Run predict for new test data with second best performing model.
        >>> prediction = automl_obj.predict(housing_test, rank=2)
        >>> prediction

        # Display leaderboard.
        >>> automl_obj.leaderboard()

        # Display best performing model.
        >>> automl_obj.leader()

        # Example 3 : Run AutoML for multiclass classification problem.
        # Scenario : Predict the species of iris flower based on different
        #            factors. Use custom configuration file to customize
        #            different processes of AutoML Run to get the best
        #            performing model out of available models.

        # Generate custom JSON file
        >>> AutoML.generate_custom_config()

        # Create instance of AutoML.
        >>> automl_obj = AutoML(verbose=2,
        >>>                     exclude="xgboost",
        >>>                     custom_config_file="custom.json")
        # Fit the data.
        >>> automl_obj.fit(iris_input, iris_input.species)

        # Run predict with best performing model.
        >>> prediction = automl_obj.predict()
        >>> prediction

        # Run predict with second best performing model.
        >>> prediction = automl_obj.predict(rank=2)
        >>> prediction

        # Display leaderboard.
        >>> automl_obj.leaderboard()

        # Display best performing model.
        >>> automl_obj.leader()

        # Example 4 : Run AutoML for regression problem with early stopping metric and tolerance.
        # Scenario : Predict the price of house based on different factors.
        #            Use custom configuration file to customize different
        #            processes of AutoML Run. Define performance threshold
        #            to acquire for the available models, and terminate training
        #            upon meeting the stipulated performance criteria.

        # Generate custom JSON file
        >>> AutoML.generate_custom_config("custom_housing")

        # Create instance of AutoML.
        >>> automl_obj = AutoML(verbose=2,
        >>>                     exclude="xgboost",
        >>>                     stopping_metric="R2",
        >>>                     stopping_tolerance=0.7,
        >>>                     custom_config_file="custom_housing.json")
        # Fit the data.
        >>> automl_obj.fit(housing_train, "price")

        # Run predict with best performing model.
        >>> prediction = automl_obj.predict()
        >>> prediction

        # Display leaderboard.
        >>> automl_obj.leaderboard()

        # Example 5 : Run AutoML for classification problem with maximum runtime.
        # Scenario : Predict the species of iris flower based on different factors.
        #            Run AutoML to get the best performing model in specified time.

        # Create instance of AutoML.
        >>> automl_obj = AutoML(verbose=2,
        >>>                     exclude="xgboost",
        >>>                     max_runtime_secs=500)
        # Fit the data.
        >>> automl_obj.fit(iris_input, iris_input.species)

        # Run predict with best performing model.
        >>> prediction = automl_obj.predict()
        >>> prediction

        # Run predict with second best performing model.
        >>> prediction = automl_obj.predict(rank=2)
        >>> prediction

        # Display leaderboard.
        >>> automl_obj.leaderboard()

        # Display best performing model.
        >>> automl_obj.leader()
    """
    # Values accepted for "include"/"exclude" and for "stopping_metric".
    model_choices = ("glm", "svm", "knn", "decision_forest", "xgboost")
    metric_choices = ("R2", 'MAE', 'MSE', 'MSLE', 'RMSE', 'RMSLE',
                      'MICRO-F1', 'MACRO-F1',
                      'MICRO-RECALL', 'MACRO-RECALL',
                      'MICRO-PRECISION', 'MACRO-PRECISION',
                      'WEIGHTED-PRECISION', 'WEIGHTED-RECALL',
                      'WEIGHTED-F1', 'ACCURACY')

    # One row per argument, in the layout _Validators expects.
    # NOTE(review): row layout looks like [name, value, is-optional, types,
    # empty-check, permitted values] - confirm against _Validators.
    arg_info_matrix = [
        ["task_type", task_type, True, str, True, ["Regression", "Classification", "Default"]],
        ["include", include, True, (str, list), True, list(model_choices)],
        ["exclude", exclude, True, (str, list), True, list(model_choices)],
        ["verbose", verbose, True, int, True, [0, 1, 2]],
        ["max_runtime_secs", max_runtime_secs, True, (int, float)],
        ["stopping_metric", stopping_metric, True, str, True, list(metric_choices)],
        ["stopping_tolerance", stopping_tolerance, True, (float, int)],
        ["custom_config_file", custom_config_file, True, str, True],
    ]

    # Validate argument types and permitted values.
    _Validators._validate_function_arguments(arg_info_matrix)

    # "include" and "exclude" cannot be combined.
    if not (include is None and exclude is None):
        _Validators._validate_mutually_exclusive_arguments(include, "include", exclude, "exclude")

    # "stopping_metric" and "stopping_tolerance" must be supplied together.
    _Validators._validate_mutually_inclusive_arguments(stopping_metric, "stopping_metric",
                                                       stopping_tolerance, "stopping_tolerance")

    # A custom configuration file switches the run from automatic to custom mode.
    self.auto = True
    loaded_config = None
    if custom_config_file:
        # The file must exist, carry a ".json" extension and be non-empty.
        _Validators._validate_file_exists(custom_config_file)
        _Validators._validate_file_extension(custom_config_file, "json")
        _Validators._check_empty_file(custom_config_file)
        self.auto = False
        with open(custom_config_file, 'r') as config_handle:
            loaded_config = json.load(config_handle)

    # Run configuration supplied by the user.
    self.custom_data = loaded_config
    self.task_type = task_type
    self.include_model = include
    self.exclude_model = exclude
    self.verbose = verbose
    self.max_runtime_secs = max_runtime_secs
    self.stopping_metric = stopping_metric
    self.stopping_tolerance = stopping_tolerance

    # State that is filled in by fit().
    self.data = None
    self.target_column = None
    self._is_fit_called = False

    # Default candidate models; fit() narrows this using include/exclude.
    self.model_list = ['decision_forest', 'xgboost', 'knn', 'svm', 'glm']
    # Evaluated lazily because fit() may replace "Default" with the detected task type.
    self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
|
|
381
|
+
|
|
382
|
+
def fit(self,
        data,
        target_column):
    """
    DESCRIPTION:
        Function triggers the AutoML run. It is designed to handle both
        regression and classification tasks depending on the specified "task_type".

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input teradataml DataFrame.
            Types: teradataml Dataframe

        target_column:
            Required Argument.
            Specifies target column of dataset.
            Types: str or ColumnExpression

    RETURNS:
        None

    RAISES:
        TeradataMlException, TypeError, ValueError

    EXAMPLES:
        # Create an instance of the AutoML called "automl_obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        # Perform fit() operation on the "automl_obj".

        # Example 1: Passing column expression for target column.
        >>> automl_obj.fit(data = housing_train, target_column = housing_train.price)

        # Example 2: Passing name of target column.
        >>> automl_obj.fit(data = housing_train, target_column = "price")
    """
    # A ColumnExpression is reduced to its column name so that the rest of
    # the method only deals with strings.
    if isinstance(target_column, ColumnExpression):
        target_column = target_column.name

    # Appending fit arguments to list for validation
    arg_info_fit_matrix = []
    arg_info_fit_matrix.append(["data", data, False, (DataFrame), True])
    arg_info_fit_matrix.append(["target_column", target_column, False, (str), True])

    # Validate argument types
    _Validators._validate_function_arguments(arg_info_fit_matrix)

    # Initializing class variables
    self.data = data
    self.target_column = target_column

    # Restrict training to the requested models. Names are lower-cased
    # BEFORE de-duplication so that "GLM" and "glm" collapse into a single
    # entry; dict.fromkeys keeps the order in which the user listed them.
    if self.include_model:
        self.include_model = UtilFuncs._as_list(self.include_model)
        self.model_list = list(dict.fromkeys(model.lower() for model in self.include_model))

    # Drop the excluded models. Both sides are compared in lower case so a
    # mixed-case name such as "XGBoost" is really removed, and the remaining
    # models keep their existing order.
    if self.exclude_model:
        self.exclude_model = UtilFuncs._as_list(self.exclude_model)
        excluded = {model.lower() for model in self.exclude_model}
        self.model_list = [model.lower() for model in self.model_list
                           if model.lower() not in excluded]

    # Checking if target column is present in data
    _Validators._validate_dataframe_has_argument_columns(self.target_column, "target_column", self.data, "df")

    # Handling default task type
    if self.task_type.casefold() == "default":
        # if target column is having distinct values less than or equal to 20,
        # then it will be mapped to classification problem else regression problem
        if self.data.drop_duplicate(self.target_column).size <= 20:
            print("\nTask type is set to Classification as target column "
                  "is having distinct values less than or equal to 20.")
            self.task_type = "Classification"
        else:
            print("\nTask type is set to Regression as target column is "
                  "having distinct values greater than 20.")
            self.task_type = "Regression"

    is_classification = self.is_classification_type()

    # The stopping metric must belong to the family of the resolved task type.
    if self.stopping_metric is not None:
        if is_classification:
            permitted_values = ["MICRO-F1", "MACRO-F1",
                                "MICRO-RECALL", "MACRO-RECALL",
                                "MICRO-PRECISION", "MACRO-PRECISION",
                                "WEIGHTED-PRECISION", "WEIGHTED-RECALL",
                                "WEIGHTED-F1", "ACCURACY"]
        else:
            permitted_values = ["R2", 'MAE', 'MSE', 'MSLE','RMSE', 'RMSLE']
        _Validators._validate_permitted_values(self.stopping_metric, permitted_values, "stopping_metric")

    # A non-classification run requires a numeric target column.
    if not is_classification:
        _Validators._validate_column_type(self.data, self.target_column, 'target_column',
                                          expected_types=UtilFuncs()._get_numeric_datatypes())

    # Displaying received custom input
    if self.custom_data:
        print("\n Received below input for customization : ")
        print(json.dumps(self.custom_data, indent=4))

    # Pick the task implementation; classification unless the task type is
    # explicitly regression.
    if self.task_type.casefold() == "regression":
        task_cls = _Regression
        cls_method = "_regression"
    else:
        task_cls = _Classification
        cls_method = "_classification"

    # Running AutoML
    clf = task_cls(self.data, self.target_column, self.custom_data)

    self.model_info, self.leader_board, self.target_count, self.target_label, \
        self.data_transformation_params, self.table_name_mapping = getattr(clf, cls_method)(
            model_list = self.model_list,
            auto = self.auto,
            verbose = self.verbose,
            max_runtime_secs = self.max_runtime_secs,
            stopping_metric = self.stopping_metric,
            stopping_tolerance = self.stopping_tolerance
        )

    # Model Evaluation Phase
    self.m_evaluator = _ModelEvaluator(self.model_info,
                                       self.target_column,
                                       self.task_type)

    # Mark the object as fitted only once everything predict(), leaderboard()
    # and leader() rely on exists. Setting the flag earlier let a failed fit()
    # pass their "fit must be called first" guard and fail later with an
    # AttributeError.
    self._is_fit_called = True
|
|
515
|
+
|
|
516
|
+
def predict(self,
            data = None,
            rank = 1):
    """
    DESCRIPTION:
        Function generates prediction on either default test data or any other data
        using model rank in leaderboard and displays performance metrics
        of the specified model.

        If test data contains target column, then it displays both prediction
        and performance metrics, otherwise displays only prediction.

    PARAMETERS:
        data:
            Optional Argument.
            Specifies the dataset on which prediction and performance
            metrics need to be generated using model rank in leaderboard.
            When "data" is not specified default test data is used. Default
            test data is the dataset generated at the time of training.
            Types: teradataml DataFrame

        rank:
            Optional Argument.
            Specifies the rank of the model in the leaderboard to be used for prediction.
            Must be 1 or greater; 1 selects the best performing model.
            Default Value: 1
            Types: int

    RETURNS:
        Prediction result of the selected model.
        NOTE(review): this method calls ".to_pandas()" on the returned object,
        so it appears to be a teradataml DataFrame rather than a pandas
        DataFrame - confirm.

    RAISES:
        TeradataMlException, TypeError, ValueError

    EXAMPLES:
        # Create an instance of the AutoML called "automl_obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        # Perform fit() operation on the "automl_obj".
        # Perform predict() operation on the "automl_obj".

        # Example 1: Run predict with best performing model.
        >>> prediction = automl_obj.predict()
        >>> prediction

        # Example 2: Run predict with second best performing model.
        >>> prediction = automl_obj.predict(rank=2)
        >>> prediction

        # Example 3: Run predict for new test data with best performing model.
        >>> prediction = automl_obj.predict(admissions_test)
        >>> prediction

        # Example 4: Run predict for new test data with second best performing model.
        >>> prediction = automl_obj.predict(admissions_test, rank=2)
        >>> prediction
    """
    if not self._is_fit_called:
        err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
                                   "'predict' method",
                                   "'fit' method must be called before"
                                   " running predict.")
        raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)

    # Appending predict arguments to list for validation.
    arg_info_pred_matrix = []
    arg_info_pred_matrix.append(["data", data, True, (DataFrame), True])
    arg_info_pred_matrix.append(["rank", rank, True, (int), True])

    # Validate argument types
    _Validators._validate_function_arguments(arg_info_pred_matrix)

    # Ranks are 1-based. Without this check rank=0 would become index -1
    # below and address a model the caller never asked for.
    if rank < 1:
        raise ValueError("Value of 'rank' must be greater than or equal to 1, "
                         "received {}.".format(rank))

    # Setting test data indicator to default value, i.e., False.
    self.test_data_ind = False
    # Setting target column indicator to default value, i.e., False.
    self.target_column_ind = False
    # Model Evaluation using rank-1 [rank starts from 0 in leaderboard]
    rank = rank-1

    # Only produced when the evaluated data carries the target column.
    metrics = None

    # Checking if there is test data provided or not.
    # If no, then model will generate prediction on default test data.
    # If yes, then at first data transformation will happen then prediction will be generated.
    if data is None:
        metrics, pred = self.m_evaluator.model_evaluation(rank = rank,
                                                          table_name_mapping=self.table_name_mapping)
    else:
        # Setting test data indicator to True
        self.test_data_ind = True
        # Setting indicator to True if target column exists
        if self.target_column in data.columns:
            self.target_column_ind = True

        # Data Transformation Phase
        data_transform_instance = _DataTransformation(data = data,
                                                      data_transformation_params = \
                                                          self.data_transformation_params,
                                                      auto = self.auto,
                                                      verbose = self.verbose,
                                                      target_column_ind = self.target_column_ind,
                                                      table_name_mapping=self.table_name_mapping)

        self.table_name_mapping = data_transform_instance.data_transformation()

        # Checking for target column presence in passed test data.
        # If present, then both prediction and evaluation metrics will be generated.
        # If not present, then only prediction will be generated.
        if self.target_column_ind:
            metrics, pred = self.m_evaluator.model_evaluation(rank = rank,
                                                              test_data_ind = \
                                                                  self.test_data_ind,
                                                              target_column_ind = \
                                                                  self.target_column_ind,
                                                              table_name_mapping=self.table_name_mapping)
        else:
            pred = self.m_evaluator.model_evaluation(rank = rank,
                                                     test_data_ind = \
                                                         self.test_data_ind,
                                                     table_name_mapping=self.table_name_mapping)

    # Checking if problem type is classification and target label is present.
    if self.is_classification_type() and self.target_label is not None:
        # Displaying target column labels
        tar_dct = {}
        print('Target Column Mapping:')
        # Iterating rows
        for row in self.target_label.result.itertuples():
            # Retrieving the category names of encoded target column
            # row[1] contains the original name of category
            # row[2] contains the encoded value
            if row[1] != 'TD_CATEGORY_COUNT':
                tar_dct[row[1]] = row[2]

        for key, value in tar_dct.items():
            print(f"{key}: {value}")

    print("\n Prediction : ")
    print(pred.result)

    # Performance metrics and classification diagnostics both need the
    # observed target values, so they are shown only when there is no user
    # test data or when the user test data contains the target column.
    if not self.test_data_ind or self.target_column_ind:
        print("\n Performance Metrics : ")
        print(metrics.result)

        # Displaying confusion matrix and ROC-AUC for classification problem
        if self.is_classification_type():
            prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
            # "display" is looked up only outside a terminal session.
            show = print if _is_terminal() else display

            # Displaying ROC-AUC for binary classification
            if self.target_count == 2:
                # NOTE(review): the predicted value column is passed as
                # "probability_column" - confirm this is intended.
                fit_params = {
                    "probability_column" : prediction_column,
                    "observation_column" : self.target_column,
                    "positive_class" : "1",
                    "data" : pred.result
                }
                # Fitting ROC
                roc_out = ROC(**fit_params)
                print("\n ROC-AUC : ")
                show(roc_out.result)
                show(roc_out.output_data)

            # Displaying confusion matrix for binary and multiclass classification
            prediction_df = pred.result.to_pandas()
            print("\n Confusion Matrix : ")
            show(confusion_matrix(prediction_df[self.target_column],
                                  prediction_df[prediction_column]))

    # Returning prediction
    return pred.result
|
|
684
|
+
|
|
685
|
+
def leaderboard(self):
    """
    DESCRIPTION:
        Function displays leaderboard.

    RETURNS:
        Pandas DataFrame with Leaderboard information.

    RAISES:
        TeradataMlException.

    EXAMPLES:
        # Create an instance of the AutoML called "automl_obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        # Perform fit() operation on the "automl_obj".
        # Generate leaderboard using leaderboard() method on "automl_obj".
        >>> automl_obj.leaderboard()
    """
    # Happy path first: the leaderboard stored by fit() is handed back as is.
    if self._is_fit_called:
        return self.leader_board

    # fit() has not been run yet, so there is nothing to report.
    failure_text = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
                                        "'leaderboard' method",
                                        "'fit' method must be called before"
                                        " generating leaderboard.")
    raise TeradataMlException(failure_text, MessageCodes.EXECUTION_FAILED)
|
|
711
|
+
|
|
712
|
+
def leader(self):
    """
    DESCRIPTION:
        Function displays best performing model.

    RETURNS:
        None

    RAISES:
        TeradataMlException.

    EXAMPLES:
        # Create an instance of the AutoML called "automl_obj"
        # by referring "AutoML() or AutoRegressor() or AutoClassifier()" method.
        # Perform fit() operation on the "automl_obj".
        # Generate leaderboard using leaderboard() method on "automl_obj".
        # Display best performing model using leader() method on "automl_obj".
        >>> automl_obj.leader()
    """
    # Nothing to show until fit() has produced a leaderboard.
    if not self._is_fit_called:
        failure_text = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
                                            "'leader' method",
                                            "'fit' method must be called before"
                                            " generating leader.")
        raise TeradataMlException(failure_text, MessageCodes.EXECUTION_FAILED)

    board = self.leader_board
    # Plain print in a terminal, display() otherwise; "display" is only
    # looked up when the session is not a terminal.
    show = print if _is_terminal() else display
    # Keep only the row(s) ranked first.
    show(board[board['Rank'] == 1])
|
|
743
|
+
|
|
744
|
+
@staticmethod
def generate_custom_config(file_name = "custom"):
    """
    DESCRIPTION:
        Function generates custom JSON file containing user customized input under current
        working directory which can be used for AutoML execution.

    PARAMETERS:
        file_name:
            Optional Argument.
            Specifies the name of the file to be generated. Do not pass the file name
            with extension. Extension '.json' is automatically added to specified file name.
            Default Value: "custom"
            Types: str

    RETURNS:
        None

    EXAMPLES:
        # Import either of AutoML or AutoClassifier or AutoRegressor from teradataml.
        # As per requirement, generate json file using generate_custom_config() method.

        # Generate a default file named "custom.json" file using either of below options.
        >>> AutoML.generate_custom_config()
        or
        >>> AutoClassifier.generate_custom_config()
        or
        >>> AutoRegressor.generate_custom_config()
        # The above code will generate "custom.json" file under the current working directory.

        # Generate different file name using "file_name" argument.
        >>> AutoML.generate_custom_config("titanic_custom")
        or
        >>> AutoClassifier.generate_custom_config("titanic_custom")
        or
        >>> AutoRegressor.generate_custom_config("housing_custom")
        # The above code will generate "titanic_custom.json" (or "housing_custom.json"
        # for the last call) file under the current working directory.

    """
    # Build the custom configuration through the JSON generator helper.
    custom_settings = _GenerateCustomJson()._generate_custom_json()

    # Serialize before touching the file system, so a serialization
    # failure never leaves an empty or truncated file behind.
    serialized = json.dumps(custom_settings, indent=4)

    # '.json' is always appended to the supplied name.
    json_file = f"{file_name}.json"
    with open(json_file, 'w') as out:
        out.write(serialized)

    print(f"\n'{json_file}' file is generated successfully under the current working directory.")
|
|
794
|
+
|
|
795
|
+
|
|
796
|
+
class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _ModelTraining):

    def __init__(self,
                 data,
                 target_column,
                 custom_data = None):
        """
        DESCRIPTION:
            Function initializes the data, target column for Regression.

        PARAMETERS:
            data:
                Required Argument.
                Specifies the input teradataml Dataframe.
                Types: teradataml Dataframe

            target_column:
                Required Argument.
                Specifies the name of the target column in "data".
                Types: str

            custom_data:
                Optional Argument.
                Specifies json object containing user customized input.
                Types: json object
        """
        self.custom_data = custom_data
        self.target_column = target_column
        self.data = data

    def _regression(self,
                    model_list = None,
                    auto = False,
                    verbose = 0,
                    max_runtime_secs = None,
                    stopping_metric = None,
                    stopping_tolerance = None):
        """
        DESCRIPTION:
            Internal Function runs Regression. It executes feature exploration,
            feature engineering, data preparation and model training in that order.

        PARAMETERS:
            model_list:
                Optional Argument.
                Specifies the models forwarded to the feature engineering
                and model training phases.
                Default Value: None

            auto:
                Optional Argument.
                Specifies whether to run AutoML in custom mode or auto mode.
                When set to False, runs in custom mode. Otherwise, runs in auto mode.
                Default Value: False
                Types: bool

            verbose:
                Optional Argument.
                Specifies the detailed execution steps based on verbose level.
                Default Value: 0
                Permitted Values:
                    * 0: prints the progress bar and leaderboard
                    * 1: prints the execution steps of AutoML.
                    * 2: prints the intermediate data between the execution of each step of AutoML.
                Types: int

            max_runtime_secs:
                Optional Argument.
                Specifies the time limit in seconds for model training.
                The time spent in feature engineering and data preparation
                is deducted from it before training starts.
                Types: int

            stopping_metric:
                Required, when "stopping_tolerance" is set, otherwise optional.
                Specifies the stopping metrics for stopping tolerance in model training.
                Types: str

            stopping_tolerance:
                Required, when "stopping_metric" is set, otherwise optional.
                Specifies the stopping tolerance for stopping metrics in model training.
                Types: float

        RETURNS:
            a tuple containing, in order: model information, leaderboard,
            target count, target label, data transformation parameters and
            the table name mapping held on the instance.
        """
        # ---- Feature Exploration Phase ----
        _FeatureExplore.__init__(self,
                                 data=self.data,
                                 target_column=self.target_column,
                                 verbose=verbose)
        # Exploration output is only produced for verbose levels above 0.
        if verbose > 0:
            self._exploration()

        # ---- Feature Engineering Phase ----
        _FeatureEngineering.__init__(self,
                                     data=self.data,
                                     target_column=self.target_column,
                                     model_list=model_list,
                                     verbose=verbose,
                                     custom_data=self.custom_data)
        # Clock used to charge the pre-training phases against the budget.
        start_time = time.time()
        engineering_result = self.feature_engineering(auto)
        # The first element (the data) is not used here; later phases
        # read "self.data" instead.
        _, excluded_columns, target_label, transform_params = engineering_result

        # ---- Data Preparation Phase ----
        _DataPreparation.__init__(self,
                                  data=self.data,
                                  target_column=self.target_column,
                                  verbose=verbose,
                                  excluded_columns=excluded_columns,
                                  custom_data=self.custom_data,
                                  data_transform_dict=transform_params)
        features, transform_params = self.data_preparation(auto)

        # Deduct the time already consumed; when the budget is exhausted
        # (negative remainder) training still gets 60 seconds.
        if max_runtime_secs is not None:
            max_runtime_secs = max_runtime_secs - (time.time() - start_time)
            if max_runtime_secs < 0:
                max_runtime_secs = 60

        # ---- Model Training Phase ----
        _ModelTraining.__init__(self,
                                data=self.data,
                                target_column=self.target_column,
                                model_list=model_list,
                                verbose=verbose,
                                features=features,
                                task_type="Regression",
                                custom_data=self.custom_data)
        training_result = self.model_training(auto=auto,
                                              max_runtime_secs=max_runtime_secs,
                                              stopping_metric=stopping_metric,
                                              stopping_tolerance=stopping_tolerance)
        models_info, leaderboard, target_count = training_result

        return (models_info, leaderboard, target_count, target_label,
                transform_params, self.table_name_mapping)
|
|
925
|
+
|
|
926
|
+
class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _ModelTraining):

    def __init__(self,
                 data,
                 target_column,
                 custom_data = None):
        """
        DESCRIPTION:
            Function initializes the data, target column for Classification.

        PARAMETERS:
            data:
                Required Argument.
                Specifies the input teradataml Dataframe.
                Types: teradataml Dataframe

            target_column:
                Required Argument.
                Specifies the name of the target column in "data".
                Types: str

            custom_data:
                Optional Argument.
                Specifies json object containing user customized input.
                Types: json object
        """
        self.data = data
        self.target_column = target_column
        self.custom_data = custom_data

    def _classification(self,
                        model_list = None,
                        auto = False,
                        verbose = 0,
                        max_runtime_secs = None,
                        stopping_metric = None,
                        stopping_tolerance = None):
        """
        DESCRIPTION:
            Internal Function runs Classification. It executes feature exploration,
            feature engineering, data preparation and model training in that order.

        PARAMETERS:
            model_list:
                Optional Argument.
                Specifies the models forwarded to the feature engineering
                and model training phases.
                Default Value: None

            auto:
                Optional Argument.
                Specifies whether to run AutoML in custom mode or auto mode.
                When set to False, runs in custom mode. Otherwise, runs in auto mode.
                Default Value: False
                Types: bool

            verbose:
                Optional Argument.
                Specifies the detailed execution steps based on verbose level.
                Default Value: 0
                Permitted Values:
                    * 0: prints the progress bar and leaderboard
                    * 1: prints the execution steps of AutoML.
                    * 2: prints the intermediate data between the execution of each step of AutoML.
                Types: int

            max_runtime_secs:
                Optional Argument.
                Specifies the time limit in seconds for model training.
                The time spent in feature engineering and data preparation
                is deducted from it before training starts.
                Types: int

            stopping_metric:
                Required, when "stopping_tolerance" is set, otherwise optional.
                Specifies the stopping metrics for stopping tolerance in model training.
                Types: str

            stopping_tolerance:
                Required, when "stopping_metric" is set, otherwise optional.
                Specifies the stopping tolerance for stopping metrics in model training.
                Types: float

        RETURNS:
            a tuple containing, in order: model information, leaderboard,
            target count, target label, data transformation parameters and
            the table name mapping held on the instance.
        """
        # Feature Exploration Phase
        _FeatureExplore.__init__(self,
                                 data = self.data,
                                 target_column = self.target_column,
                                 verbose=verbose)
        if verbose > 0:
            self._exploration()

        # Feature Engineering Phase
        _FeatureEngineering.__init__(self,
                                     data = self.data,
                                     target_column = self.target_column,
                                     model_list = model_list,
                                     verbose = verbose,
                                     task_type = "Classification",
                                     custom_data = self.custom_data)
        # Start time, used to charge the pre-training phases against the budget.
        start_time = time.time()
        # The first returned element (the data) is not used here; later phases
        # read "self.data" instead.
        _, excluded_columns, target_label, data_transformation_params = self.feature_engineering(auto)

        # Data Preparation Phase
        _DataPreparation.__init__(self,
                                  data = self.data,
                                  target_column = self.target_column,
                                  verbose = verbose,
                                  excluded_columns = excluded_columns,
                                  custom_data = self.custom_data,
                                  data_transform_dict = data_transformation_params,
                                  task_type = "Classification")
        features, data_transformation_params = self.data_preparation(auto)

        # Calculating max_runtime_secs for model training by subtracting the
        # time taken for feature engineering and data preparation. When the
        # budget is already exhausted (negative remainder), 60 seconds are granted.
        if max_runtime_secs is not None:
            max_runtime_secs = max_runtime_secs - (time.time() - start_time)
            if max_runtime_secs < 0:
                max_runtime_secs = 60

        # Model training
        _ModelTraining.__init__(self,
                                data = self.data,
                                target_column = self.target_column,
                                model_list = model_list,
                                verbose = verbose,
                                features = features,
                                task_type = "Classification",
                                custom_data = self.custom_data)
        models_info, leaderboard, target_count = self.model_training(auto = auto,
                                                                     max_runtime_secs = max_runtime_secs,
                                                                     stopping_metric = stopping_metric,
                                                                     stopping_tolerance = stopping_tolerance)

        return (models_info, leaderboard, target_count, target_label,
                data_transformation_params, self.table_name_mapping)

    def _target_column_details(self):
        """
        DESCRIPTION:
            Internal function displays the distribution of the target column/
            response column as a count plot. Nothing is drawn when the
            visualization libraries are unavailable or when running in a terminal.
        """
        # Plot only if data visualization libraries are available and
        # the session is not a terminal.
        if self._check_visualization_libraries() and not _is_terminal():
            import matplotlib.pyplot as plt
            import seaborn as sns
            self._display_msg(msg='\nTarget Column Distribution:',
                              show_data=True)
            plt.figure(figsize=(6, 6))
            # Plotting a count plot for target column
            sns.countplot(data=self.data.select([self.target_column]).to_pandas(), x=self.target_column)
            plt.show()

    def _check_data_imbalance(self,
                              data=None):
        """
        DESCRIPTION:
            Internal function calculates and checks the imbalance in dataset.
            The dataset is treated as imbalanced when the ratio of the smallest
            class count to the largest class count is below 0.4.

        PARAMETERS:
            data:
                Required Argument.
                Specifies the input DataFrame. Although the signature defaults
                to None, a DataFrame must be passed since the target column is
                read from it.
                Types: DataFrame supporting "data[column].value_counts().values"
                       (NOTE(review): looks like a pandas DataFrame - confirm
                       against callers.)

        RETURNS:
            bool, True if imbalance dataset detected, Otherwise False.
        """
        self._display_msg(msg="\nChecking imbalance data ...",
                          progress_bar=self.progress_bar)
        # Calculate the distribution of classes in the target column
        class_dist = data[self.target_column].value_counts().values

        # Smallest and largest number of data points among the classes
        min_ct = np.min(class_dist)
        max_ct = np.max(class_dist)

        # Calculate the imbalance ratio (minimum count to maximum count)
        imb_ratio = min_ct / max_ct

        # Check if the imbalance ratio is less than the threshold of 0.4
        if imb_ratio < 0.4:
            self._display_msg(msg="Imbalance Found.",
                              progress_bar=self.progress_bar)
            return True

        self._display_msg(msg="Imbalance Not Found.",
                          progress_bar=self.progress_bar)
        return False

    def _set_custom_sampling(self):
        """
        DESCRIPTION:
            Function to handle customized data sampling for imbalance dataset.
            Reads "DataImbalanceIndicator" and "DataImbalanceMethod" from the
            custom data and, when the method is 'SMOTE' or 'NearMiss', stores it
            as the data sampling method. Otherwise the existing setting is kept
            and an informational message is displayed.
        """
        # Fetching user input for data sampling
        data_imbalance_input = self.custom_data.get("DataImbalanceIndicator", False)
        if not data_imbalance_input:
            self._display_msg(inline_msg="No information provided for performing customized imbalanced dataset sampling. AutoML will Proceed with default option.",
                              progress_bar=self.progress_bar)
            return

        # Extracting method for performing data sampling
        handling_method = self.custom_data.get("DataImbalanceMethod", None)
        if handling_method in ('SMOTE', 'NearMiss'):
            self._data_sampling_method = handling_method
        else:
            self._display_msg(inline_msg="Provided method for data imbalance is not supported. AutoML will Proceed with default option.",
                              progress_bar=self.progress_bar)

    def _data_sampling(self,
                       data):
        """
        DESCRIPTION:
            Function to handle data imbalance in dataset using sampling techniques
            in case of classification. Uses SMOTE when the configured sampling
            method is 'SMOTE', otherwise NearMiss.

        PARAMETERS:
            data:
                Required Argument.
                Specifies the input DataFrame. It must contain the target
                column and an 'id' column; the 'id' column is regenerated
                after resampling.
                Types: pandas Dataframe.

        RETURNS:
            DataFrame after handling data imbalance. The original "data" is
            returned unchanged if the sampler fails.

        RAISES:
            ImportError, when the "imblearn" package is not installed.
        """
        self._display_msg(msg="\nStarting data imbalance handling ...",
                          progress_bar=self.progress_bar,
                          show_data=True)

        # Importing required libraries
        from imblearn.over_sampling import SMOTE
        from imblearn.under_sampling import NearMiss

        st = time.time()
        self._display_msg(msg=f"\nBalancing the data using {self._data_sampling_method}...",
                          progress_bar=self.progress_bar,
                          show_data=True)
        # Performing data sampling
        try:
            # Fetching the minimum target column label count and
            # accordingly setting the number of neighbors for the sampler,
            # so that it never asks for more neighbors than the smallest
            # class can provide.
            min_label_count = min(data[self.target_column].value_counts())
            if self._data_sampling_method == 'SMOTE':
                n_neighbors = min(5, min_label_count - 1)
                sampling_method = SMOTE(k_neighbors=n_neighbors, random_state=5)
            else:
                n_neighbors = min(3, min_label_count)
                sampling_method = NearMiss(version=1, n_neighbors=n_neighbors)

            # Fitting on dataset
            xt, yt = sampling_method.fit_resample(data.drop(columns=[self.target_column]),
                                                  data[self.target_column])

            # Merging the balanced dataset with target column
            balanced_df = (xt.reset_index().merge(yt.reset_index(), on="index"))
            # Replace the old 'id' values with a fresh sequential 'id'.
            balanced_df.drop(columns=['index', 'id'], inplace=True)
            balanced_df = balanced_df.reset_index().rename(columns={'index': 'id'})

            et = time.time()
            self._display_msg(msg=f"Handled imbalanced dataset using {self._data_sampling_method}: {et - st:.2f} sec",
                              progress_bar=self.progress_bar,
                              show_data=True)
        except Exception:
            # Best-effort step: any sampler error falls back to the original
            # data. "Exception" (not a bare except) is caught so that
            # KeyboardInterrupt/SystemExit still propagate.
            self._display_msg(msg=f"Balancing using {self._data_sampling_method} Failed!!",
                              progress_bar=self.progress_bar,
                              show_data=True)
            # Returning original data if the data sampler fails
            return data

        self._display_msg(msg="Completed data imbalance handling.",
                          progress_bar=self.progress_bar,
                          show_data=True)
        # Returning balanced dataframe
        return balanced_df
|
|
1199
|
+
|
|
1200
|
+
class AutoRegressor(AutoML):

    def __init__(self,
                 include = None,
                 exclude = None,
                 verbose=0,
                 max_runtime_secs=None,
                 stopping_metric=None,
                 stopping_tolerance=None,
                 custom_config_file=None
                 ):
        """
        DESCRIPTION:
            AutoRegressor is a special purpose AutoML feature to run regression specific tasks.

        PARAMETERS:
            include:
                Optional Argument.
                Specifies the model algorithms to be used for model training phase.
                By default, all 5 models are used for training for regression and binary
                classification problem, while only 3 models are used for multi-class.
                Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
                Types: str OR list of str

            exclude:
                Optional Argument.
                Specifies the model algorithms to be excluded from model training phase.
                No model is excluded by default.
                Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
                Types: str OR list of str

            verbose:
                Optional Argument.
                Specifies the detailed execution steps based on verbose level.
                Default Value: 0
                Permitted Values:
                    * 0: prints the progress bar and leaderboard
                    * 1: prints the execution steps of AutoML.
                    * 2: prints the intermediate data between the execution of each step of AutoML.
                Types: int

            max_runtime_secs:
                Optional Argument.
                Specifies the time limit in seconds for model training.
                Types: int

            stopping_metric:
                Required, when "stopping_tolerance" is set, otherwise optional.
                Specifies the stopping metrics for stopping tolerance in model training.
                Permitted Values:
                    * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
                                                  "RMSE", "RMSLE"
                    * For task_type "Classification": 'MICRO-F1','MACRO-F1',
                                                      'MICRO-RECALL','MACRO-RECALL',
                                                      'MICRO-PRECISION', 'MACRO-PRECISION',
                                                      'WEIGHTED-PRECISION','WEIGHTED-RECALL',
                                                      'WEIGHTED-F1', 'ACCURACY'
                Types: str

            stopping_tolerance:
                Required, when "stopping_metric" is set, otherwise optional.
                Specifies the stopping tolerance for stopping metrics in model training.
                Types: float

            custom_config_file:
                Optional Argument.
                Specifies the path of JSON file in case of custom run.
                Types: str

        RETURNS:
            Instance of AutoRegressor.

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLES:
            # Notes:
            #     1. Get the connection to Vantage to execute the function.
            #     2. One must import the required functions mentioned in
            #        the example from teradataml.
            #     3. Function will raise error if not supported on the Vantage
            #        user is connected to.

            # Load the example data.
            >>> load_example_data("decisionforestpredict", ["housing_train", "housing_test"])

            # Create teradataml DataFrame object.
            >>> housing_train = DataFrame.from_table("housing_train")

            # Example 1 : Run AutoRegressor using default options.
            # Scenario : Predict the price of house based on different factors.

            # Create instance of AutoRegressor.
            >>> automl_obj = AutoRegressor()

            # Fit the data.
            >>> automl_obj.fit(housing_train, "price")

            # Predict using best performing model.
            >>> prediction = automl_obj.predict()
            >>> prediction

            # Run predict for new test data with best performing model.
            >>> prediction = automl_obj.predict(housing_test)
            >>> prediction

            # Run predict for new test data with second best performing model.
            >>> prediction = automl_obj.predict(housing_test, rank=2)
            >>> prediction

            # Display leaderboard.
            >>> automl_obj.leaderboard()

            # Display best performing model.
            >>> automl_obj.leader()

            # Example 2 : Run AutoRegressor for regression problem with early stopping metric and tolerance.
            # Scenario : Predict the price of house based on different factors.
            #            Use custom configuration file to customize different
            #            processes of AutoML Run. Define performance threshold
            #            to acquire for the available models, and terminate training
            #            upon meeting the stipulated performance criteria.

            # Generate custom configuration file.
            >>> AutoRegressor.generate_custom_config("custom_housing")

            # Create instance of AutoRegressor.
            >>> automl_obj = AutoRegressor(verbose=2,
            ...                            exclude="xgboost",
            ...                            stopping_metric="R2",
            ...                            stopping_tolerance=0.7,
            ...                            custom_config_file="custom_housing.json")
            # Fit the data.
            >>> automl_obj.fit(housing_train, "price")

            # Run predict with best performing model.
            >>> prediction = automl_obj.predict()
            >>> prediction

            # Display leaderboard.
            >>> automl_obj.leaderboard()

            # Example 3 : Run AutoRegressor for regression problem with maximum runtime.
            # Scenario : Predict the price of house based on different factors.
            #            Run AutoML to get the best performing model in specified time.

            # Create instance of AutoRegressor.
            >>> automl_obj = AutoRegressor(verbose=2,
            ...                            exclude="xgboost",
            ...                            max_runtime_secs=500)
            # Fit the data.
            >>> automl_obj.fit(housing_train, "price")

            # Run predict with best performing model.
            >>> prediction = automl_obj.predict()
            >>> prediction

            # Run predict with second best performing model.
            >>> prediction = automl_obj.predict(rank=2)
            >>> prediction

            # Display leaderboard.
            >>> automl_obj.leaderboard()

            # Display best performing model.
            >>> automl_obj.leader()
        """
        # This subclass always runs the regression task.
        self.task_type = "Regression"

        # Keep the user supplied options on the instance.
        self.include = include
        self.exclude = exclude
        self.verbose = verbose
        self.max_runtime_secs = max_runtime_secs
        self.stopping_metric = stopping_metric
        self.stopping_tolerance = stopping_tolerance
        self.custom_config_file = custom_config_file

        # Hand the stored options to the AutoML initializer.
        super().__init__(task_type=self.task_type,
                         include=self.include,
                         exclude=self.exclude,
                         verbose=self.verbose,
                         max_runtime_secs=self.max_runtime_secs,
                         stopping_metric=self.stopping_metric,
                         stopping_tolerance=self.stopping_tolerance,
                         custom_config_file=self.custom_config_file)
|
|
1384
|
+
class AutoClassifier(AutoML):
    """AutoML entry point that fixes the task type to "Classification"."""

    def __init__(self,
                 include=None,
                 exclude=None,
                 verbose=0,
                 max_runtime_secs=None,
                 stopping_metric=None,
                 stopping_tolerance=None,
                 custom_config_file=None
                 ):
        """
        DESCRIPTION:
            AutoClassifier is a special purpose AutoML feature to run classification specific tasks.

        PARAMETERS:
            include:
                Optional Argument.
                Specifies the model algorithms to be used for model training phase.
                By default, all 5 models are used for training for regression and binary
                classification problem, while only 3 models are used for multi-class.
                Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
                Types: str OR list of str

            exclude:
                Optional Argument.
                Specifies the model algorithms to be excluded from model training phase.
                No model is excluded by default.
                Permitted Values: "glm", "svm", "knn", "decision_forest", "xgboost"
                Types: str OR list of str

            verbose:
                Optional Argument.
                Specifies the detailed execution steps based on verbose level.
                Default Value: 0
                Permitted Values:
                    * 0: prints the progress bar and leaderboard
                    * 1: prints the execution steps of AutoML.
                    * 2: prints the intermediate data between the execution of each step of AutoML.
                Types: int

            max_runtime_secs:
                Optional Argument.
                Specifies the time limit in seconds for model training.
                Types: int

            stopping_metric:
                Required, when "stopping_tolerance" is set, otherwise optional.
                Specifies the stopping metric for stopping tolerance in model training.
                Permitted Values:
                    * For task_type "Regression": "R2", "MAE", "MSE", "MSLE",
                                                  "RMSE", "RMSLE"
                    * For task_type "Classification": 'MICRO-F1','MACRO-F1',
                                                      'MICRO-RECALL','MACRO-RECALL',
                                                      'MICRO-PRECISION', 'MACRO-PRECISION',
                                                      'WEIGHTED-PRECISION','WEIGHTED-RECALL',
                                                      'WEIGHTED-F1', 'ACCURACY'
                Types: str

            stopping_tolerance:
                Required, when "stopping_metric" is set, otherwise optional.
                Specifies the stopping tolerance for stopping metric in model training.
                Types: float

            custom_config_file:
                Optional Argument.
                Specifies the path of json file in case of custom run.
                Types: str

        RETURNS:
            Instance of AutoClassifier.

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLES:
            # Notes:
            #     1. Get the connection to Vantage to execute the function.
            #     2. One must import the required functions mentioned in
            #        the example from teradataml.
            #     3. Function will raise error if not supported on the Vantage
            #        user is connected to.

            # Load the example data.
            >>> load_example_data("teradataml", ["titanic", "iris_input"])
            >>> load_example_data("GLMPredict", ["admissions_test", "admissions_train"])

            # Create teradataml DataFrame object.
            >>> admissions_train = DataFrame.from_table("admissions_train")
            >>> titanic = DataFrame.from_table("titanic")
            >>> iris_input = DataFrame.from_table("iris_input")
            >>> admissions_test = DataFrame.from_table("admissions_test")

            # Example 1 : Run AutoClassifier for binary classification problem
            # Scenario : Predict whether a student will be admitted to a university
            #            based on different factors. Run AutoML to get the best performing model
            #            out of available models.

            # Create instance of AutoClassifier.
            >>> automl_obj = AutoClassifier()

            # Fit the data.
            >>> automl_obj.fit(admissions_train, "admitted")

            # Predict using best performing model.
            >>> prediction = automl_obj.predict()
            >>> prediction

            # Run predict for new test data with best performing model.
            >>> prediction = automl_obj.predict(admissions_test)
            >>> prediction

            # Run predict for new test data with second best performing model.
            >>> prediction = automl_obj.predict(admissions_test, rank=2)
            >>> prediction

            # Display leaderboard.
            >>> automl_obj.leaderboard()

            # Display best performing model.
            >>> automl_obj.leader()

            # Example 2 : Run AutoClassifier for binary classification.
            # Scenario : Predict whether passenger aboard the RMS Titanic survived
            #            or not based on different factors. Run AutoML to get the
            #            best performing model out of available models. Use custom
            #            configuration file to customize different processes of
            #            AutoML Run.

            # Generate custom configuration file.
            >>> AutoClassifier.generate_custom_config("custom_titanic")

            # Create instance of AutoClassifier.
            >>> automl_obj = AutoClassifier(verbose=2,
            ...                             custom_config_file="custom_titanic.json")
            # Fit the data.
            >>> automl_obj.fit(titanic, titanic.survived)

            # Run predict with best performing model.
            >>> prediction = automl_obj.predict()
            >>> prediction

            # Run predict with second best performing model.
            >>> prediction = automl_obj.predict(rank=2)
            >>> prediction

            # Display leaderboard.
            >>> automl_obj.leaderboard()

            # Display best performing model.
            >>> automl_obj.leader()

            # Example 3 : Run AutoClassifier for multiclass classification problem.
            # Scenario : Predict the species of iris flower based on different factors.
            #            Run AutoML to get the best performing model out of available
            #            models. Use custom configuration file to customize different
            #            processes of AutoML Run.

            # Generate custom configuration file.
            >>> AutoClassifier.generate_custom_config("custom_iris")

            # Create instance of AutoClassifier.
            >>> automl_obj = AutoClassifier(verbose=1,
            ...                             custom_config_file="custom_iris.json")
            # Fit the data.
            >>> automl_obj.fit(iris_input, "species")

            # Predict using best performing model.
            >>> prediction = automl_obj.predict()
            >>> prediction

            # Display leaderboard.
            >>> automl_obj.leaderboard()

            # Display best performing model.
            >>> automl_obj.leader()

            # Example 4 : Run AutoClassifier for classification problem with stopping metric and tolerance.
            # Scenario : Predict whether passenger aboard the RMS Titanic survived
            #            or not based on different factors. Use custom configuration
            #            file to customize different processes of AutoML Run. Define
            #            performance threshold to acquire for the available models, and
            #            terminate training upon meeting the stipulated performance criteria.

            # Generate custom configuration file.
            >>> AutoClassifier.generate_custom_config("custom_titanic")

            # Create instance of AutoClassifier.
            >>> automl_obj = AutoClassifier(verbose=2,
            ...                             exclude="xgboost",
            ...                             stopping_metric="MICRO-F1",
            ...                             stopping_tolerance=0.7,
            ...                             custom_config_file="custom_titanic.json")
            # Fit the data.
            >>> automl_obj.fit(titanic, titanic.survived)

            # Run predict with best performing model.
            >>> prediction = automl_obj.predict()
            >>> prediction

            # Display leaderboard.
            >>> automl_obj.leaderboard()

            # Example 5 : Run AutoClassifier for classification problem with maximum runtime.
            # Scenario : Predict the species of iris flower based on different factors.
            #            Run AutoML to get the best performing model in specified time.

            # Create instance of AutoClassifier.
            >>> automl_obj = AutoClassifier(verbose=2,
            ...                             exclude="xgboost",
            ...                             max_runtime_secs=500)
            # Fit the data.
            >>> automl_obj.fit(iris_input, iris_input.species)

            # Run predict with best performing model.
            >>> prediction = automl_obj.predict()
            >>> prediction

            # Run predict with second best performing model.
            >>> prediction = automl_obj.predict(rank=2)
            >>> prediction

            # Display leaderboard.
            >>> automl_obj.leaderboard()

            # Display best performing model.
            >>> automl_obj.leader()
        """
        # Record the constructor arguments on the instance; the same values
        # are forwarded unchanged to AutoML.__init__ below.
        self.verbose = verbose
        self.max_runtime_secs = max_runtime_secs
        self.stopping_metric = stopping_metric
        self.stopping_tolerance = stopping_tolerance
        self.custom_config_file = custom_config_file
        # The task type is fixed for this class and is not caller-configurable.
        self.task_type = "Classification"
        self.include = include
        self.exclude = exclude

        super().__init__(task_type=self.task_type,
                         include=self.include,
                         exclude=self.exclude,
                         verbose=self.verbose,
                         max_runtime_secs=self.max_runtime_secs,
                         stopping_metric=self.stopping_metric,
                         stopping_tolerance=self.stopping_tolerance,
                         custom_config_file=self.custom_config_file)
|