teradataml 17.20.0.6__py3-none-any.whl → 20.0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +238 -1
- teradataml/__init__.py +13 -3
- teradataml/_version.py +1 -1
- teradataml/analytics/Transformations.py +4 -4
- teradataml/analytics/__init__.py +0 -2
- teradataml/analytics/analytic_function_executor.py +3 -0
- teradataml/analytics/json_parser/utils.py +13 -12
- teradataml/analytics/sqle/DecisionTreePredict.py +15 -30
- teradataml/analytics/sqle/NaiveBayesPredict.py +11 -20
- teradataml/analytics/sqle/__init__.py +0 -13
- teradataml/analytics/utils.py +1 -0
- teradataml/analytics/valib.py +3 -0
- teradataml/automl/__init__.py +1628 -0
- teradataml/automl/custom_json_utils.py +1270 -0
- teradataml/automl/data_preparation.py +993 -0
- teradataml/automl/data_transformation.py +727 -0
- teradataml/automl/feature_engineering.py +1648 -0
- teradataml/automl/feature_exploration.py +547 -0
- teradataml/automl/model_evaluation.py +163 -0
- teradataml/automl/model_training.py +887 -0
- teradataml/catalog/__init__.py +0 -2
- teradataml/catalog/byom.py +49 -6
- teradataml/catalog/function_argument_mapper.py +0 -2
- teradataml/catalog/model_cataloging_utils.py +2 -1021
- teradataml/common/aed_utils.py +6 -2
- teradataml/common/constants.py +50 -58
- teradataml/common/deprecations.py +160 -0
- teradataml/common/garbagecollector.py +61 -104
- teradataml/common/messagecodes.py +27 -36
- teradataml/common/messages.py +11 -15
- teradataml/common/utils.py +205 -287
- teradataml/common/wrapper_utils.py +1 -110
- teradataml/context/context.py +150 -78
- teradataml/data/bank_churn.csv +10001 -0
- teradataml/data/bmi.csv +501 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeFit.py +3 -3
- teradataml/data/docs/sqle/docs_17_10/BincodeTransform.py +6 -5
- teradataml/data/docs/sqle/docs_17_10/Fit.py +1 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +1 -1
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +1 -1
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesTransform.py +2 -2
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeTransform.py +2 -1
- teradataml/data/docs/sqle/docs_17_10/ScaleTransform.py +1 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeTransform.py +1 -1
- teradataml/data/docs/sqle/docs_17_10/Transform.py +2 -1
- teradataml/data/docs/sqle/docs_17_20/BincodeFit.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/BincodeTransform.py +6 -5
- teradataml/data/docs/sqle/docs_17_20/Fit.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/GLM.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/GLMPredictPerSegment.py +9 -10
- teradataml/data/docs/sqle/docs_17_20/KMeansPredict.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierPredict.py +16 -15
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierTrainer.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineTransform.py +8 -8
- teradataml/data/docs/sqle/docs_17_20/OneClassSVMPredict.py +21 -20
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterTransform.py +8 -3
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesTransform.py +6 -5
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionTransform.py +6 -6
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeTransform.py +2 -1
- teradataml/data/docs/sqle/docs_17_20/SVM.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +16 -16
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +1 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeTransform.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/TDDecisionForestPredict.py +4 -4
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +19 -19
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingTransform.py +5 -4
- teradataml/data/docs/sqle/docs_17_20/Transform.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +9 -9
- teradataml/data/fish.csv +160 -0
- teradataml/data/glass_types.csv +215 -0
- teradataml/data/insurance.csv +1 -1
- teradataml/data/iris_data.csv +151 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeTransform.json +1 -0
- teradataml/data/load_example_data.py +3 -0
- teradataml/data/multi_model_classification.csv +401 -0
- teradataml/data/multi_model_regression.csv +401 -0
- teradataml/data/openml_example.json +63 -0
- teradataml/data/scripts/deploy_script.py +65 -0
- teradataml/data/scripts/mapper.R +20 -0
- teradataml/data/scripts/sklearn/__init__.py +0 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +175 -0
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +135 -0
- teradataml/data/scripts/sklearn/sklearn_function.template +113 -0
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +158 -0
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +152 -0
- teradataml/data/scripts/sklearn/sklearn_score.py +128 -0
- teradataml/data/scripts/sklearn/sklearn_transform.py +179 -0
- teradataml/data/templates/open_source_ml.json +9 -0
- teradataml/data/teradataml_example.json +73 -1
- teradataml/data/test_classification.csv +101 -0
- teradataml/data/test_prediction.csv +101 -0
- teradataml/data/test_regression.csv +101 -0
- teradataml/data/train_multiclass.csv +101 -0
- teradataml/data/train_regression.csv +101 -0
- teradataml/data/train_regression_multiple_labels.csv +101 -0
- teradataml/data/wine_data.csv +1600 -0
- teradataml/dataframe/copy_to.py +79 -13
- teradataml/dataframe/data_transfer.py +8 -0
- teradataml/dataframe/dataframe.py +910 -311
- teradataml/dataframe/dataframe_utils.py +102 -5
- teradataml/dataframe/fastload.py +11 -3
- teradataml/dataframe/setop.py +15 -2
- teradataml/dataframe/sql.py +3735 -77
- teradataml/dataframe/sql_function_parameters.py +56 -5
- teradataml/dataframe/vantage_function_types.py +45 -1
- teradataml/dataframe/window.py +30 -29
- teradataml/dbutils/dbutils.py +18 -1
- teradataml/geospatial/geodataframe.py +18 -7
- teradataml/geospatial/geodataframecolumn.py +5 -0
- teradataml/hyperparameter_tuner/optimizer.py +910 -120
- teradataml/hyperparameter_tuner/utils.py +131 -37
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/__init__.py +1 -0
- teradataml/opensource/sklearn/__init__.py +1 -0
- teradataml/opensource/sklearn/_class.py +255 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +1668 -0
- teradataml/opensource/sklearn/_wrapper_utils.py +268 -0
- teradataml/opensource/sklearn/constants.py +54 -0
- teradataml/options/__init__.py +3 -6
- teradataml/options/configure.py +21 -20
- teradataml/scriptmgmt/UserEnv.py +61 -5
- teradataml/scriptmgmt/lls_utils.py +135 -53
- teradataml/table_operators/Apply.py +38 -6
- teradataml/table_operators/Script.py +45 -308
- teradataml/table_operators/TableOperator.py +182 -591
- teradataml/table_operators/__init__.py +0 -1
- teradataml/table_operators/table_operator_util.py +32 -40
- teradataml/utils/validators.py +127 -3
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/METADATA +243 -3
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/RECORD +147 -391
- teradataml/analytics/mle/AdaBoost.py +0 -651
- teradataml/analytics/mle/AdaBoostPredict.py +0 -564
- teradataml/analytics/mle/Antiselect.py +0 -342
- teradataml/analytics/mle/Arima.py +0 -641
- teradataml/analytics/mle/ArimaPredict.py +0 -477
- teradataml/analytics/mle/Attribution.py +0 -1070
- teradataml/analytics/mle/Betweenness.py +0 -658
- teradataml/analytics/mle/Burst.py +0 -711
- teradataml/analytics/mle/CCM.py +0 -600
- teradataml/analytics/mle/CCMPrepare.py +0 -324
- teradataml/analytics/mle/CFilter.py +0 -460
- teradataml/analytics/mle/ChangePointDetection.py +0 -572
- teradataml/analytics/mle/ChangePointDetectionRT.py +0 -477
- teradataml/analytics/mle/Closeness.py +0 -737
- teradataml/analytics/mle/ConfusionMatrix.py +0 -420
- teradataml/analytics/mle/Correlation.py +0 -477
- teradataml/analytics/mle/Correlation2.py +0 -573
- teradataml/analytics/mle/CoxHazardRatio.py +0 -679
- teradataml/analytics/mle/CoxPH.py +0 -556
- teradataml/analytics/mle/CoxSurvival.py +0 -478
- teradataml/analytics/mle/CumulativeMovAvg.py +0 -363
- teradataml/analytics/mle/DTW.py +0 -623
- teradataml/analytics/mle/DWT.py +0 -564
- teradataml/analytics/mle/DWT2D.py +0 -599
- teradataml/analytics/mle/DecisionForest.py +0 -716
- teradataml/analytics/mle/DecisionForestEvaluator.py +0 -363
- teradataml/analytics/mle/DecisionForestPredict.py +0 -561
- teradataml/analytics/mle/DecisionTree.py +0 -830
- teradataml/analytics/mle/DecisionTreePredict.py +0 -528
- teradataml/analytics/mle/ExponentialMovAvg.py +0 -418
- teradataml/analytics/mle/FMeasure.py +0 -402
- teradataml/analytics/mle/FPGrowth.py +0 -734
- teradataml/analytics/mle/FrequentPaths.py +0 -695
- teradataml/analytics/mle/GLM.py +0 -558
- teradataml/analytics/mle/GLML1L2.py +0 -547
- teradataml/analytics/mle/GLML1L2Predict.py +0 -519
- teradataml/analytics/mle/GLMPredict.py +0 -529
- teradataml/analytics/mle/HMMDecoder.py +0 -945
- teradataml/analytics/mle/HMMEvaluator.py +0 -901
- teradataml/analytics/mle/HMMSupervised.py +0 -521
- teradataml/analytics/mle/HMMUnsupervised.py +0 -572
- teradataml/analytics/mle/Histogram.py +0 -561
- teradataml/analytics/mle/IDWT.py +0 -476
- teradataml/analytics/mle/IDWT2D.py +0 -493
- teradataml/analytics/mle/IdentityMatch.py +0 -763
- teradataml/analytics/mle/Interpolator.py +0 -918
- teradataml/analytics/mle/KMeans.py +0 -485
- teradataml/analytics/mle/KNN.py +0 -627
- teradataml/analytics/mle/KNNRecommender.py +0 -488
- teradataml/analytics/mle/KNNRecommenderPredict.py +0 -581
- teradataml/analytics/mle/LAR.py +0 -439
- teradataml/analytics/mle/LARPredict.py +0 -478
- teradataml/analytics/mle/LDA.py +0 -548
- teradataml/analytics/mle/LDAInference.py +0 -492
- teradataml/analytics/mle/LDATopicSummary.py +0 -464
- teradataml/analytics/mle/LevenshteinDistance.py +0 -450
- teradataml/analytics/mle/LinReg.py +0 -433
- teradataml/analytics/mle/LinRegPredict.py +0 -438
- teradataml/analytics/mle/MinHash.py +0 -544
- teradataml/analytics/mle/Modularity.py +0 -587
- teradataml/analytics/mle/NEREvaluator.py +0 -410
- teradataml/analytics/mle/NERExtractor.py +0 -595
- teradataml/analytics/mle/NERTrainer.py +0 -458
- teradataml/analytics/mle/NGrams.py +0 -570
- teradataml/analytics/mle/NPath.py +0 -634
- teradataml/analytics/mle/NTree.py +0 -549
- teradataml/analytics/mle/NaiveBayes.py +0 -462
- teradataml/analytics/mle/NaiveBayesPredict.py +0 -513
- teradataml/analytics/mle/NaiveBayesTextClassifier.py +0 -607
- teradataml/analytics/mle/NaiveBayesTextClassifier2.py +0 -531
- teradataml/analytics/mle/NaiveBayesTextClassifierPredict.py +0 -799
- teradataml/analytics/mle/NamedEntityFinder.py +0 -529
- teradataml/analytics/mle/NamedEntityFinderEvaluator.py +0 -414
- teradataml/analytics/mle/NamedEntityFinderTrainer.py +0 -396
- teradataml/analytics/mle/POSTagger.py +0 -417
- teradataml/analytics/mle/Pack.py +0 -411
- teradataml/analytics/mle/PageRank.py +0 -535
- teradataml/analytics/mle/PathAnalyzer.py +0 -426
- teradataml/analytics/mle/PathGenerator.py +0 -367
- teradataml/analytics/mle/PathStart.py +0 -464
- teradataml/analytics/mle/PathSummarizer.py +0 -470
- teradataml/analytics/mle/Pivot.py +0 -471
- teradataml/analytics/mle/ROC.py +0 -425
- teradataml/analytics/mle/RandomSample.py +0 -637
- teradataml/analytics/mle/RandomWalkSample.py +0 -490
- teradataml/analytics/mle/SAX.py +0 -779
- teradataml/analytics/mle/SVMDense.py +0 -677
- teradataml/analytics/mle/SVMDensePredict.py +0 -536
- teradataml/analytics/mle/SVMDenseSummary.py +0 -437
- teradataml/analytics/mle/SVMSparse.py +0 -557
- teradataml/analytics/mle/SVMSparsePredict.py +0 -553
- teradataml/analytics/mle/SVMSparseSummary.py +0 -435
- teradataml/analytics/mle/Sampling.py +0 -549
- teradataml/analytics/mle/Scale.py +0 -565
- teradataml/analytics/mle/ScaleByPartition.py +0 -496
- teradataml/analytics/mle/ScaleMap.py +0 -378
- teradataml/analytics/mle/ScaleSummary.py +0 -320
- teradataml/analytics/mle/SentenceExtractor.py +0 -363
- teradataml/analytics/mle/SentimentEvaluator.py +0 -432
- teradataml/analytics/mle/SentimentExtractor.py +0 -578
- teradataml/analytics/mle/SentimentTrainer.py +0 -405
- teradataml/analytics/mle/SeriesSplitter.py +0 -641
- teradataml/analytics/mle/Sessionize.py +0 -475
- teradataml/analytics/mle/SimpleMovAvg.py +0 -397
- teradataml/analytics/mle/StringSimilarity.py +0 -425
- teradataml/analytics/mle/TF.py +0 -389
- teradataml/analytics/mle/TFIDF.py +0 -504
- teradataml/analytics/mle/TextChunker.py +0 -414
- teradataml/analytics/mle/TextClassifier.py +0 -399
- teradataml/analytics/mle/TextClassifierEvaluator.py +0 -413
- teradataml/analytics/mle/TextClassifierTrainer.py +0 -565
- teradataml/analytics/mle/TextMorph.py +0 -494
- teradataml/analytics/mle/TextParser.py +0 -623
- teradataml/analytics/mle/TextTagger.py +0 -530
- teradataml/analytics/mle/TextTokenizer.py +0 -502
- teradataml/analytics/mle/UnivariateStatistics.py +0 -488
- teradataml/analytics/mle/Unpack.py +0 -526
- teradataml/analytics/mle/Unpivot.py +0 -438
- teradataml/analytics/mle/VarMax.py +0 -776
- teradataml/analytics/mle/VectorDistance.py +0 -762
- teradataml/analytics/mle/WeightedMovAvg.py +0 -400
- teradataml/analytics/mle/XGBoost.py +0 -842
- teradataml/analytics/mle/XGBoostPredict.py +0 -627
- teradataml/analytics/mle/__init__.py +0 -123
- teradataml/analytics/mle/json/adaboost_mle.json +0 -135
- teradataml/analytics/mle/json/adaboostpredict_mle.json +0 -85
- teradataml/analytics/mle/json/antiselect_mle.json +0 -34
- teradataml/analytics/mle/json/antiselect_mle_mle.json +0 -34
- teradataml/analytics/mle/json/arima_mle.json +0 -172
- teradataml/analytics/mle/json/arimapredict_mle.json +0 -52
- teradataml/analytics/mle/json/attribution_mle_mle.json +0 -143
- teradataml/analytics/mle/json/betweenness_mle.json +0 -97
- teradataml/analytics/mle/json/burst_mle.json +0 -140
- teradataml/analytics/mle/json/ccm_mle.json +0 -124
- teradataml/analytics/mle/json/ccmprepare_mle.json +0 -14
- teradataml/analytics/mle/json/cfilter_mle.json +0 -93
- teradataml/analytics/mle/json/changepointdetection_mle.json +0 -92
- teradataml/analytics/mle/json/changepointdetectionrt_mle.json +0 -78
- teradataml/analytics/mle/json/closeness_mle.json +0 -104
- teradataml/analytics/mle/json/confusionmatrix_mle.json +0 -79
- teradataml/analytics/mle/json/correlation_mle.json +0 -86
- teradataml/analytics/mle/json/correlationreduce_mle.json +0 -49
- teradataml/analytics/mle/json/coxhazardratio_mle.json +0 -89
- teradataml/analytics/mle/json/coxph_mle.json +0 -98
- teradataml/analytics/mle/json/coxsurvival_mle.json +0 -79
- teradataml/analytics/mle/json/cumulativemovavg_mle.json +0 -34
- teradataml/analytics/mle/json/decisionforest_mle.json +0 -167
- teradataml/analytics/mle/json/decisionforestevaluator_mle.json +0 -33
- teradataml/analytics/mle/json/decisionforestpredict_mle_mle.json +0 -74
- teradataml/analytics/mle/json/decisiontree_mle.json +0 -194
- teradataml/analytics/mle/json/decisiontreepredict_mle_mle.json +0 -86
- teradataml/analytics/mle/json/dtw_mle.json +0 -97
- teradataml/analytics/mle/json/dwt2d_mle.json +0 -116
- teradataml/analytics/mle/json/dwt_mle.json +0 -101
- teradataml/analytics/mle/json/exponentialmovavg_mle.json +0 -55
- teradataml/analytics/mle/json/fmeasure_mle.json +0 -58
- teradataml/analytics/mle/json/fpgrowth_mle.json +0 -159
- teradataml/analytics/mle/json/frequentpaths_mle.json +0 -129
- teradataml/analytics/mle/json/glm_mle.json +0 -111
- teradataml/analytics/mle/json/glml1l2_mle.json +0 -106
- teradataml/analytics/mle/json/glml1l2predict_mle.json +0 -57
- teradataml/analytics/mle/json/glmpredict_mle_mle.json +0 -74
- teradataml/analytics/mle/json/histogram_mle.json +0 -100
- teradataml/analytics/mle/json/hmmdecoder_mle.json +0 -192
- teradataml/analytics/mle/json/hmmevaluator_mle.json +0 -206
- teradataml/analytics/mle/json/hmmsupervised_mle.json +0 -91
- teradataml/analytics/mle/json/hmmunsupervised_mle.json +0 -114
- teradataml/analytics/mle/json/identitymatch_mle.json +0 -88
- teradataml/analytics/mle/json/idwt2d_mle.json +0 -73
- teradataml/analytics/mle/json/idwt_mle.json +0 -66
- teradataml/analytics/mle/json/interpolator_mle.json +0 -151
- teradataml/analytics/mle/json/kmeans_mle.json +0 -97
- teradataml/analytics/mle/json/knn_mle.json +0 -141
- teradataml/analytics/mle/json/knnrecommender_mle.json +0 -111
- teradataml/analytics/mle/json/knnrecommenderpredict_mle.json +0 -75
- teradataml/analytics/mle/json/lar_mle.json +0 -78
- teradataml/analytics/mle/json/larpredict_mle.json +0 -69
- teradataml/analytics/mle/json/lda_mle.json +0 -130
- teradataml/analytics/mle/json/ldainference_mle.json +0 -78
- teradataml/analytics/mle/json/ldatopicsummary_mle.json +0 -64
- teradataml/analytics/mle/json/levenshteindistance_mle.json +0 -92
- teradataml/analytics/mle/json/linreg_mle.json +0 -42
- teradataml/analytics/mle/json/linregpredict_mle.json +0 -56
- teradataml/analytics/mle/json/minhash_mle.json +0 -113
- teradataml/analytics/mle/json/modularity_mle.json +0 -91
- teradataml/analytics/mle/json/naivebayespredict_mle_mle.json +0 -85
- teradataml/analytics/mle/json/naivebayesreduce_mle.json +0 -52
- teradataml/analytics/mle/json/naivebayestextclassifierpredict_mle_mle.json +0 -147
- teradataml/analytics/mle/json/naivebayestextclassifiertrainer2_mle.json +0 -108
- teradataml/analytics/mle/json/naivebayestextclassifiertrainer_mle.json +0 -102
- teradataml/analytics/mle/json/namedentityfinder_mle.json +0 -84
- teradataml/analytics/mle/json/namedentityfinderevaluatorreduce_mle.json +0 -43
- teradataml/analytics/mle/json/namedentityfindertrainer_mle.json +0 -64
- teradataml/analytics/mle/json/nerevaluator_mle.json +0 -54
- teradataml/analytics/mle/json/nerextractor_mle.json +0 -87
- teradataml/analytics/mle/json/nertrainer_mle.json +0 -89
- teradataml/analytics/mle/json/ngrams_mle.json +0 -137
- teradataml/analytics/mle/json/ngramsplitter_mle_mle.json +0 -137
- teradataml/analytics/mle/json/npath@coprocessor_mle.json +0 -73
- teradataml/analytics/mle/json/ntree@coprocessor_mle.json +0 -123
- teradataml/analytics/mle/json/pack_mle.json +0 -58
- teradataml/analytics/mle/json/pack_mle_mle.json +0 -58
- teradataml/analytics/mle/json/pagerank_mle.json +0 -81
- teradataml/analytics/mle/json/pathanalyzer_mle.json +0 -63
- teradataml/analytics/mle/json/pathgenerator_mle.json +0 -40
- teradataml/analytics/mle/json/pathstart_mle.json +0 -62
- teradataml/analytics/mle/json/pathsummarizer_mle.json +0 -72
- teradataml/analytics/mle/json/pivoting_mle.json +0 -71
- teradataml/analytics/mle/json/postagger_mle.json +0 -51
- teradataml/analytics/mle/json/randomsample_mle.json +0 -131
- teradataml/analytics/mle/json/randomwalksample_mle.json +0 -85
- teradataml/analytics/mle/json/roc_mle.json +0 -73
- teradataml/analytics/mle/json/sampling_mle.json +0 -75
- teradataml/analytics/mle/json/sax_mle.json +0 -154
- teradataml/analytics/mle/json/scale_mle.json +0 -93
- teradataml/analytics/mle/json/scalebypartition_mle.json +0 -89
- teradataml/analytics/mle/json/scalemap_mle.json +0 -44
- teradataml/analytics/mle/json/scalesummary_mle.json +0 -14
- teradataml/analytics/mle/json/sentenceextractor_mle.json +0 -41
- teradataml/analytics/mle/json/sentimentevaluator_mle.json +0 -43
- teradataml/analytics/mle/json/sentimentextractor_mle.json +0 -100
- teradataml/analytics/mle/json/sentimenttrainer_mle.json +0 -68
- teradataml/analytics/mle/json/seriessplitter_mle.json +0 -133
- teradataml/analytics/mle/json/sessionize_mle_mle.json +0 -62
- teradataml/analytics/mle/json/simplemovavg_mle.json +0 -48
- teradataml/analytics/mle/json/stringsimilarity_mle.json +0 -50
- teradataml/analytics/mle/json/stringsimilarity_mle_mle.json +0 -50
- teradataml/analytics/mle/json/svmdense_mle.json +0 -165
- teradataml/analytics/mle/json/svmdensepredict_mle.json +0 -95
- teradataml/analytics/mle/json/svmdensesummary_mle.json +0 -58
- teradataml/analytics/mle/json/svmsparse_mle.json +0 -148
- teradataml/analytics/mle/json/svmsparsepredict_mle_mle.json +0 -103
- teradataml/analytics/mle/json/svmsparsesummary_mle.json +0 -57
- teradataml/analytics/mle/json/textchunker_mle.json +0 -40
- teradataml/analytics/mle/json/textclassifier_mle.json +0 -51
- teradataml/analytics/mle/json/textclassifierevaluator_mle.json +0 -43
- teradataml/analytics/mle/json/textclassifiertrainer_mle.json +0 -103
- teradataml/analytics/mle/json/textmorph_mle.json +0 -63
- teradataml/analytics/mle/json/textparser_mle.json +0 -166
- teradataml/analytics/mle/json/texttagger_mle.json +0 -81
- teradataml/analytics/mle/json/texttokenizer_mle.json +0 -91
- teradataml/analytics/mle/json/tf_mle.json +0 -33
- teradataml/analytics/mle/json/tfidf_mle.json +0 -34
- teradataml/analytics/mle/json/univariatestatistics_mle.json +0 -81
- teradataml/analytics/mle/json/unpack_mle.json +0 -91
- teradataml/analytics/mle/json/unpack_mle_mle.json +0 -91
- teradataml/analytics/mle/json/unpivoting_mle.json +0 -63
- teradataml/analytics/mle/json/varmax_mle.json +0 -176
- teradataml/analytics/mle/json/vectordistance_mle.json +0 -179
- teradataml/analytics/mle/json/weightedmovavg_mle.json +0 -48
- teradataml/analytics/mle/json/xgboost_mle.json +0 -178
- teradataml/analytics/mle/json/xgboostpredict_mle.json +0 -104
- teradataml/analytics/sqle/Antiselect.py +0 -321
- teradataml/analytics/sqle/Attribution.py +0 -603
- teradataml/analytics/sqle/DecisionForestPredict.py +0 -408
- teradataml/analytics/sqle/GLMPredict.py +0 -430
- teradataml/analytics/sqle/MovingAverage.py +0 -543
- teradataml/analytics/sqle/NGramSplitter.py +0 -548
- teradataml/analytics/sqle/NPath.py +0 -632
- teradataml/analytics/sqle/NaiveBayesTextClassifierPredict.py +0 -515
- teradataml/analytics/sqle/Pack.py +0 -388
- teradataml/analytics/sqle/SVMSparsePredict.py +0 -464
- teradataml/analytics/sqle/Sessionize.py +0 -390
- teradataml/analytics/sqle/StringSimilarity.py +0 -400
- teradataml/analytics/sqle/Unpack.py +0 -503
- teradataml/analytics/sqle/json/antiselect_sqle.json +0 -21
- teradataml/analytics/sqle/json/attribution_sqle.json +0 -92
- teradataml/analytics/sqle/json/decisionforestpredict_sqle.json +0 -48
- teradataml/analytics/sqle/json/glmpredict_sqle.json +0 -48
- teradataml/analytics/sqle/json/h2opredict_sqle.json +0 -63
- teradataml/analytics/sqle/json/movingaverage_sqle.json +0 -58
- teradataml/analytics/sqle/json/naivebayestextclassifierpredict_sqle.json +0 -76
- teradataml/analytics/sqle/json/ngramsplitter_sqle.json +0 -126
- teradataml/analytics/sqle/json/npath_sqle.json +0 -67
- teradataml/analytics/sqle/json/pack_sqle.json +0 -47
- teradataml/analytics/sqle/json/pmmlpredict_sqle.json +0 -55
- teradataml/analytics/sqle/json/sessionize_sqle.json +0 -43
- teradataml/analytics/sqle/json/stringsimilarity_sqle.json +0 -39
- teradataml/analytics/sqle/json/svmsparsepredict_sqle.json +0 -74
- teradataml/analytics/sqle/json/unpack_sqle.json +0 -80
- teradataml/catalog/model_cataloging.py +0 -980
- teradataml/config/mlengine_alias_definitions_v1.0 +0 -118
- teradataml/config/mlengine_alias_definitions_v1.1 +0 -127
- teradataml/config/mlengine_alias_definitions_v1.3 +0 -129
- teradataml/table_operators/sandbox_container_util.py +0 -643
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/WHEEL +0 -0
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/top_level.txt +0 -0
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/zip-safe +0 -0
|
@@ -17,7 +17,9 @@ import numpy as np
|
|
|
17
17
|
import pandas as pd
|
|
18
18
|
import random
|
|
19
19
|
import time
|
|
20
|
+
import threading
|
|
20
21
|
from itertools import product
|
|
22
|
+
from collections import defaultdict
|
|
21
23
|
from teradataml import DataFrame, valib, TeradataMlException
|
|
22
24
|
from teradataml.common.messages import Messages, MessageCodes
|
|
23
25
|
from teradataml.hyperparameter_tuner.utils import _ProgressBar
|
|
@@ -171,7 +173,9 @@ class _BaseSearch:
|
|
|
171
173
|
self.__progress_bar = None
|
|
172
174
|
# '__model_err_records' holds error messages of failed model.
|
|
173
175
|
self.__model_err_records = dict()
|
|
174
|
-
|
|
176
|
+
# '__parallel_stop_event' is used to stop threads in parallel execution.
|
|
177
|
+
self.__parallel_stop_event = None
|
|
178
|
+
|
|
175
179
|
# Get the function name.
|
|
176
180
|
self.__func_name = func._tdml_valib_name if "_VALIB" in str(func.__class__) \
|
|
177
181
|
else func.__name__
|
|
@@ -227,6 +231,9 @@ class _BaseSearch:
|
|
|
227
231
|
if self.__func_comparator[self.__evaluation_metric] \
|
|
228
232
|
else self.__best_score_ <= self.__early_stop
|
|
229
233
|
|
|
234
|
+
# '_is_time_stoppable' function is to check whether HPT execution reached self.__timeout value.
|
|
235
|
+
self._is_time_stoppable = lambda : True if time.time() - self.__start_time >= self.__timeout else False
|
|
236
|
+
|
|
230
237
|
# Special case comparator for "MPE" metrics.
|
|
231
238
|
# When "curr_score" argument is 'None' then lambda function checks
|
|
232
239
|
# for '_is_early_stoppable'. Otherwise, it checks for '_is_best_metrics'.
|
|
@@ -876,10 +883,6 @@ class _BaseSearch:
|
|
|
876
883
|
self.__sampled_df_mapper[_data_id] = [{train_data_arg:_train_data},
|
|
877
884
|
{test_data_arg:_test_data}]
|
|
878
885
|
|
|
879
|
-
# Update model trainer function parameter grid.
|
|
880
|
-
self.__update_model_parameters()
|
|
881
|
-
|
|
882
|
-
|
|
883
886
|
def __update_model_parameters(self):
|
|
884
887
|
"""
|
|
885
888
|
DESCRIPTION:
|
|
@@ -924,16 +927,13 @@ class _BaseSearch:
|
|
|
924
927
|
'data_id': 'DF_1'}
|
|
925
928
|
]
|
|
926
929
|
"""
|
|
927
|
-
|
|
928
930
|
# Get data identifiers.
|
|
929
|
-
_model_ids = self.__sampled_df_mapper.keys()
|
|
930
|
-
|
|
931
|
+
_model_ids = self.__sampled_df_mapper.keys()
|
|
931
932
|
# Update '_parameter_grid' with data identifiers by performing
|
|
932
933
|
# cartesian product.
|
|
933
934
|
self._parameter_grid = [{"param":param[0] , self.__DATA_ID:param[1]} for \
|
|
934
935
|
param in product(self._parameter_grid, _model_ids)]
|
|
935
|
-
|
|
936
|
-
|
|
936
|
+
|
|
937
937
|
def __validate_model_trainer_input_data_argument(self, data, is_optional_arg=True):
|
|
938
938
|
"""
|
|
939
939
|
DESCRIPTION:
|
|
@@ -1006,6 +1006,7 @@ class _BaseSearch:
|
|
|
1006
1006
|
stratify_column=None,
|
|
1007
1007
|
sample_id_column=None,
|
|
1008
1008
|
sample_seed=None,
|
|
1009
|
+
max_time=None,
|
|
1009
1010
|
**kwargs):
|
|
1010
1011
|
"""
|
|
1011
1012
|
DESCRIPTION:
|
|
@@ -1146,6 +1147,12 @@ class _BaseSearch:
|
|
|
1146
1147
|
* Mandatory when "sample_seed" argument is present.
|
|
1147
1148
|
Types: str
|
|
1148
1149
|
|
|
1150
|
+
max_time:
|
|
1151
|
+
Optional Argument.
|
|
1152
|
+
Specifies the maximum time for the completion of Hyperparameter tuning execution.
|
|
1153
|
+
Default Value: None
|
|
1154
|
+
Types: int or float
|
|
1155
|
+
|
|
1149
1156
|
kwargs:
|
|
1150
1157
|
Optional Argument.
|
|
1151
1158
|
Specifies the keyword arguments. Accepts additional arguments
|
|
@@ -1225,24 +1232,6 @@ class _BaseSearch:
|
|
|
1225
1232
|
# Set the flag to notify fit method is called.
|
|
1226
1233
|
self.__is_fit_called = True
|
|
1227
1234
|
|
|
1228
|
-
if self.__is_trainable:
|
|
1229
|
-
# "data" argument is a required argument for model trainer function
|
|
1230
|
-
# when data argument is not passed with hyperparameters. On other side,
|
|
1231
|
-
# "data" argument will be optional argument when data argument
|
|
1232
|
-
# is passed with hyperparameters.
|
|
1233
|
-
_is_optional_arg = self.__model_trainer_input_data is not None
|
|
1234
|
-
# validate the model trainer function 'data' argument.
|
|
1235
|
-
self.__validate_model_trainer_input_data_argument(data, _is_optional_arg)
|
|
1236
|
-
|
|
1237
|
-
if not data is None:
|
|
1238
|
-
# '__model_trainer_input_data' is assigned with "data" argument,
|
|
1239
|
-
# when user passes data argument in fit() method.
|
|
1240
|
-
# Note: if user attempts to pass data argument in both "params"
|
|
1241
|
-
# argument as hyperparameters or "data" argument in fit()
|
|
1242
|
-
# method, then latest "data" argument value is considered
|
|
1243
|
-
# for model training.
|
|
1244
|
-
self.__model_trainer_input_data = data
|
|
1245
|
-
|
|
1246
1235
|
# Validate "early_stop".
|
|
1247
1236
|
arg_info_matrix = []
|
|
1248
1237
|
arg_info_matrix.append(["early_stop", early_stop, True, (int, float)])
|
|
@@ -1251,24 +1240,29 @@ class _BaseSearch:
|
|
|
1251
1240
|
arg_info_matrix.append(["wait", wait, True, (bool)])
|
|
1252
1241
|
arg_info_matrix.append(["evaluation_metric", evaluation_metric, True,
|
|
1253
1242
|
(str), True, list(self.__func_comparator)])
|
|
1243
|
+
arg_info_matrix.append(["verbose", verbose, True, (int), True, [0,1,2]])
|
|
1244
|
+
arg_info_matrix.append(["max_time", max_time, True, (int, float)])
|
|
1254
1245
|
|
|
1255
1246
|
_Validators._validate_function_arguments(arg_info_matrix)
|
|
1256
1247
|
|
|
1248
|
+
# set timeout value.
|
|
1249
|
+
self.__timeout = max_time
|
|
1250
|
+
|
|
1251
|
+
self._setting_model_trainer_data(data)
|
|
1252
|
+
|
|
1257
1253
|
# Set the evaluation metrics.
|
|
1258
1254
|
if evaluation_metric is not None:
|
|
1259
1255
|
self.__evaluation_metric = evaluation_metric.upper()
|
|
1260
1256
|
self.__early_stop = early_stop
|
|
1261
|
-
|
|
1262
1257
|
if self.__is_trainable and self.__is_evaluatable and self.__is_sqle_function:
|
|
1258
|
+
|
|
1263
1259
|
# When "evaluation_metric" is 'MPE' then use the spl comparators.
|
|
1264
1260
|
if self.__evaluation_metric == "MPE":
|
|
1265
1261
|
self._is_best_metrics = self._is_early_stoppable = self._spl_abs_comparator
|
|
1266
1262
|
|
|
1267
1263
|
if not isinstance(self.__model_trainer_input_data, dict):
|
|
1268
|
-
# Label the data with unique IDs.
|
|
1269
|
-
_labeled_data = self._add_data_label()
|
|
1270
1264
|
# Sample all the labeled data for model training and testing.
|
|
1271
|
-
self.__perform_train_test_sampling(_labeled_data, frac, stratify_column,
|
|
1265
|
+
self.__perform_train_test_sampling(self._labeled_data, frac, stratify_column,
|
|
1272
1266
|
sample_id_column, sample_seed)
|
|
1273
1267
|
|
|
1274
1268
|
elif isinstance(self.__model_trainer_input_data, dict):
|
|
@@ -1276,6 +1270,8 @@ class _BaseSearch:
|
|
|
1276
1270
|
self.__perform_train_test_sampling(self.__model_trainer_input_data, frac,
|
|
1277
1271
|
stratify_column, sample_id_column,
|
|
1278
1272
|
sample_seed)
|
|
1273
|
+
# Update model trainer function parameter grid.
|
|
1274
|
+
self.__update_model_parameters()
|
|
1279
1275
|
|
|
1280
1276
|
self.__eval_params = kwargs if self.__is_evaluatable else None
|
|
1281
1277
|
|
|
@@ -1287,11 +1283,13 @@ class _BaseSearch:
|
|
|
1287
1283
|
self.__sampled_df_mapper = self._add_data_label("data")
|
|
1288
1284
|
# Update model trainer function parameter grid.
|
|
1289
1285
|
self.__update_model_parameters()
|
|
1290
|
-
|
|
1286
|
+
|
|
1291
1287
|
# Initialize logging.
|
|
1292
1288
|
if verbose > 0:
|
|
1293
1289
|
self.__progress_bar = _ProgressBar(jobs=len(self._parameter_grid), verbose=verbose)
|
|
1294
1290
|
if not run_parallel:
|
|
1291
|
+
# Setting start time of Sequential execution.
|
|
1292
|
+
self.__start_time = time.time() if self.__timeout is not None else None
|
|
1295
1293
|
# TODO: Factorize the code once parallel execution part is completed in ELE-6154 JIRA.
|
|
1296
1294
|
# Execute all parameters from populated parameter grid for both trainable
|
|
1297
1295
|
# and non trainable function.
|
|
@@ -1302,8 +1300,8 @@ class _BaseSearch:
|
|
|
1302
1300
|
# trainer function.
|
|
1303
1301
|
if self.__early_stop is not None and self.__is_evaluatable:
|
|
1304
1302
|
if self.__is_finite and self._is_early_stoppable():
|
|
1305
|
-
# Terminate HPT execution when the trained model attains the
|
|
1306
|
-
#
|
|
1303
|
+
# Terminate HPT execution when the trained model attains the
|
|
1304
|
+
# given "early_stop" value.
|
|
1307
1305
|
break
|
|
1308
1306
|
elif not self.__is_finite:
|
|
1309
1307
|
# Raise error because non-finite values cannot be compared
|
|
@@ -1316,6 +1314,10 @@ class _BaseSearch:
|
|
|
1316
1314
|
" when '{metric}' metric results inconsistent value.".format(
|
|
1317
1315
|
metric=self.__evaluation_metric))
|
|
1318
1316
|
raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
|
|
1317
|
+
if self.__timeout is not None and self._is_time_stoppable():
|
|
1318
|
+
# Terminate HPT execution when the execution time exceeds the
|
|
1319
|
+
# given time limit.
|
|
1320
|
+
break
|
|
1319
1321
|
|
|
1320
1322
|
else:
|
|
1321
1323
|
# TODO: Added support for early_stop feature along with concurrency in ELE-6154 JIRA.
|
|
@@ -1328,9 +1330,13 @@ class _BaseSearch:
|
|
|
1328
1330
|
_temp_params["model_param"] = param
|
|
1329
1331
|
_temp_params.update(kwargs)
|
|
1330
1332
|
async_exec_params.append(_temp_params)
|
|
1331
|
-
|
|
1333
|
+
|
|
1334
|
+
# Initialize the stopping event
|
|
1335
|
+
self.__parallel_stop_event = threading.Event()
|
|
1332
1336
|
# let's initialize "_AsyncDBExecutor".
|
|
1333
1337
|
self._async_executor = _AsyncDBExecutor(wait=wait)
|
|
1338
|
+
# Setting start time of Parallel execution.
|
|
1339
|
+
self.__start_time = time.time() if self.__timeout is not None else None
|
|
1334
1340
|
# Trigger parallel thread execution.
|
|
1335
1341
|
self._async_executor.submit(self._execute_fit, *async_exec_params)
|
|
1336
1342
|
|
|
@@ -1377,15 +1383,24 @@ class _BaseSearch:
|
|
|
1377
1383
|
EXAMPLES:
|
|
1378
1384
|
>>> self.__model_trainer_routine(param=param, iter=iter, **kwargs)
|
|
1379
1385
|
"""
|
|
1386
|
+
|
|
1380
1387
|
# Define model name used for model metadata.
|
|
1381
1388
|
model_name = self._generate_model_name(iter)
|
|
1382
1389
|
# Get the unique data identifier present in "model_param".
|
|
1383
1390
|
_data_id = model_param[self.__DATA_ID]
|
|
1384
|
-
# Retrieve the train and test data using data identifier.
|
|
1385
|
-
_train_data, _test_data = self.__sampled_df_mapper[_data_id]
|
|
1386
1391
|
# 'param' variable holds model training parameters and train dataframe.
|
|
1387
1392
|
# Get the model training parameters.
|
|
1388
1393
|
param = model_param["param"]
|
|
1394
|
+
|
|
1395
|
+
# Check the stop_event set or not
|
|
1396
|
+
if self.__parallel_stop_event is not None and self.__parallel_stop_event.is_set():
|
|
1397
|
+
# Update the model metadata for Skip execution.
|
|
1398
|
+
self.__update_model_metadata(model_name, param, "SKIP", 0, _data_id)
|
|
1399
|
+
return
|
|
1400
|
+
|
|
1401
|
+
# Retrieve the train and test data using data identifier.
|
|
1402
|
+
_train_data, _test_data = self.__sampled_df_mapper[_data_id]
|
|
1403
|
+
|
|
1389
1404
|
# Update model training argument with train DataFrame.
|
|
1390
1405
|
param.update(_train_data)
|
|
1391
1406
|
# Update the test DataFrame for model evaluation.
|
|
@@ -1418,6 +1433,7 @@ class _BaseSearch:
|
|
|
1418
1433
|
# Default evaluation metric is set to "MAE" for Regression models.
|
|
1419
1434
|
if self.__evaluation_metric is None:
|
|
1420
1435
|
self.__evaluation_metric = "MAE"
|
|
1436
|
+
|
|
1421
1437
|
else:
|
|
1422
1438
|
# ClassificationEvaluator results are stored under "output_data"
|
|
1423
1439
|
# attribute. "output_data" dataframe 'column 1' contains metrics
|
|
@@ -1431,11 +1447,21 @@ class _BaseSearch:
|
|
|
1431
1447
|
# classification models.
|
|
1432
1448
|
if self.__evaluation_metric is None:
|
|
1433
1449
|
self.__evaluation_metric = "ACCURACY"
|
|
1450
|
+
|
|
1434
1451
|
# Update the model metadata for successful model training.
|
|
1435
|
-
|
|
1436
1452
|
self.__update_model_metadata(model_name, param, "PASS",
|
|
1437
1453
|
training_time, _data_id,
|
|
1438
1454
|
columns, eval_values)
|
|
1455
|
+
|
|
1456
|
+
# Check whether self.__parallel_stop_event is None or not
|
|
1457
|
+
if self.__parallel_stop_event is not None:
|
|
1458
|
+
# SET the self.__parallel_stop_event
|
|
1459
|
+
# When trained model evaluation metric value exceeds self.__early_stop
|
|
1460
|
+
# or When execution time exceeds self.__timeout
|
|
1461
|
+
if (self.__early_stop is not None and self._is_early_stoppable())\
|
|
1462
|
+
or (self.__timeout is not None and self._is_time_stoppable()):
|
|
1463
|
+
self.__parallel_stop_event.set()
|
|
1464
|
+
|
|
1439
1465
|
except Exception as _err_msg:
|
|
1440
1466
|
# Record error message with corresponding "model_name".
|
|
1441
1467
|
self.__model_err_records[model_name] = str(_err_msg)
|
|
@@ -1513,7 +1539,11 @@ class _BaseSearch:
|
|
|
1513
1539
|
else:
|
|
1514
1540
|
# Initialize param for non-model trainer functions.
|
|
1515
1541
|
param = model_param
|
|
1516
|
-
|
|
1542
|
+
# Check the stop_event set or not
|
|
1543
|
+
if self.__parallel_stop_event is not None and self.__parallel_stop_event.is_set():
|
|
1544
|
+
# Update the model metadata for Skip execution.
|
|
1545
|
+
self.__update_model_metadata(model_name, param, "SKIP", 0, _data_id)
|
|
1546
|
+
return
|
|
1517
1547
|
try:
|
|
1518
1548
|
# Record starting time of model training.
|
|
1519
1549
|
start_time = time.perf_counter()
|
|
@@ -1541,6 +1571,13 @@ class _BaseSearch:
|
|
|
1541
1571
|
# Update the model metadata for failed execution.
|
|
1542
1572
|
self.__update_model_metadata(model_name, param, "FAIL", training_time, _data_id)
|
|
1543
1573
|
pass
|
|
1574
|
+
|
|
1575
|
+
if self.__parallel_stop_event is not None:
|
|
1576
|
+
# SET the self.__parallel_stop_event
|
|
1577
|
+
# When execution time exceeds self.__timeout
|
|
1578
|
+
if self.__timeout is not None and self._is_time_stoppable():
|
|
1579
|
+
self.__parallel_stop_event.set()
|
|
1580
|
+
|
|
1544
1581
|
|
|
1545
1582
|
|
|
1546
1583
|
def __update_model_metadata(self, model_name,
|
|
@@ -1573,6 +1610,7 @@ class _BaseSearch:
|
|
|
1573
1610
|
Permitted Values:
|
|
1574
1611
|
* PASS: Function result present in the vantage.
|
|
1575
1612
|
* FAIL: Function execution failed for the chosen parameters.
|
|
1613
|
+
* SKIP: Function execution skipped for the chosen parameters.
|
|
1576
1614
|
Types: str
|
|
1577
1615
|
|
|
1578
1616
|
data_id:
|
|
@@ -1622,7 +1660,6 @@ class _BaseSearch:
|
|
|
1622
1660
|
model_metadata = {"MODEL_ID" : model_name,
|
|
1623
1661
|
"PARAMETERS" : param,
|
|
1624
1662
|
"STATUS" : status}
|
|
1625
|
-
|
|
1626
1663
|
if self.__is_trainable:
|
|
1627
1664
|
# Update "data_id" for model trainer functions.
|
|
1628
1665
|
model_metadata[self.__DATA_ID.upper()] = data_id
|
|
@@ -1664,7 +1701,7 @@ class _BaseSearch:
|
|
|
1664
1701
|
# training best model.
|
|
1665
1702
|
self.__best_data_id = data_id
|
|
1666
1703
|
|
|
1667
|
-
if not self.__progress_bar is None:
|
|
1704
|
+
if not self.__progress_bar is None and status != 'SKIP':
|
|
1668
1705
|
# Update progress bar when logging is required.
|
|
1669
1706
|
self.__progress_bar.update(msg=_msg)
|
|
1670
1707
|
# Update "__model_eval_records" with the formatted metadata.
|
|
@@ -2057,6 +2094,160 @@ class _BaseSearch:
|
|
|
2057
2094
|
|
|
2058
2095
|
# Return list of dictionary containing all possible combinations.
|
|
2059
2096
|
return [dict(param) for param in product(*param_pairs)]
|
|
2097
|
+
|
|
2098
|
+
def _data_mapping(self):
    """
    DESCRIPTION:
        Internal function to rebuild the parameter grid as the Cartesian
        product of the populated parameter grid and every pairing of a
        labeled data identifier with an "input_columns" candidate whose
        columns are all present in that data.

    PARAMETERS:
        None

    RETURNS:
        None
    """
    # Take "input_columns" out of the parameter space; it is re-attached
    # to every grid entry below.
    candidate_column_sets = self.__params.pop("input_columns")

    # Pair each labeled data with every candidate column set that the data
    # fully contains.
    usable_pairs = [
        {'data_id': label, 'input_columns': column_set}
        for label, frame in self._labeled_data.items()
        for column_set in candidate_column_sets
        if all(name in frame.columns for name in column_set)
    ]

    # Populate the parameter grid from the remaining parameters.
    self._parameter_grid = self.__populate_parameter_grid()

    # Combine every grid entry with every usable (data, columns) pair.
    self._parameter_grid = [
        {'param': {**grid_entry, 'input_columns': pair['input_columns']},
         self.__DATA_ID: pair['data_id']}
        for grid_entry, pair in product(self._parameter_grid, usable_pairs)
    ]
|
|
2137
|
+
|
|
2138
|
+
|
|
2139
|
+
def _setting_model_trainer_data(self,
                                data=None):
    """
    DESCRIPTION:
        Internal function to validate and store the input data used for
        model training, and to label that data for trainable, evaluatable
        SQLE functions.

    PARAMETERS:
        data:
            Optional Argument.
            Specifies the input data used for model training.
            Note:
                * "data" is required for a model trainer function when no
                  data was passed with the hyperparameters.
                * "data" is optional when data was already passed with
                  the hyperparameters.
            Types: teradataml DataFrame

    RETURNS:
        None

    Example:
        >>> print(self.__model_trainer_input_data)
        (   id  admitted       gpa  stats  programming  masters
         0  19         0  0.051643    0.0          0.0      1.0
         1   6         1  0.765258    0.5          0.0      1.0
         2  15         1  1.000000    0.0          0.0      1.0
         3  32         0  0.746479    0.0          0.5      1.0
         4  12         1  0.835681    1.0          1.0      0.0
         5  40         0  0.976526    1.0          0.5      1.0
         6   7         1  0.215962    1.0          1.0      1.0
         7  36         0  0.530516    0.0          1.0      0.0
         8  28         1  0.967136    0.0          0.0      0.0
         9  17         1  0.920188    0.0          0.0      0.0,
            id  admitted       gpa  stats  programming  masters
         0   4         1  0.765258    0.5          1.0      1.0
         1   6         1  0.765258    0.5          0.0      1.0
         2   7         1  0.215962    1.0          1.0      1.0
         3   8         1  0.812207    0.5          0.0      0.0
         4  10         1  0.863850    0.0          0.0      0.0
         5  11         1  0.591549    0.0          0.0      0.0
         6   9         1  0.915493    0.0          0.0      0.0
         7   5         0  0.737089    1.0          1.0      0.0
         8   3         1  0.859155    1.0          0.5      0.0
         9   2         0  0.887324    0.5          0.5      1.0,
            id  admitted       gpa  stats  programming  masters
         0  23         1  0.807512    0.0          1.0      1.0
         1  25         1  0.981221    0.0          0.0      0.0
         2  26         1  0.798122    0.0          0.0      1.0
         3  27         0  0.981221    0.0          0.0      1.0
         4  29         0  1.000000    1.0          0.5      1.0
         5  30         0  0.901408    0.0          1.0      1.0
         6  28         1  0.967136    0.0          0.0      0.0
         7  24         1  0.000000    0.0          1.0      0.0
         8  22         0  0.746479    1.0          0.5      1.0
         9  21         1  0.938967    1.0          0.5      0.0)

        >>> print(self._labeled_data)
        {'DF_0':    id  admitted       gpa  stats  programming  masters
         0  26         1  0.798122    0.0          0.0      1.0
         1  40         0  0.976526    1.0          0.5      1.0
         2   7         1  0.215962    1.0          1.0      1.0
         3  19         0  0.051643    0.0          0.0      1.0
         4  15         1  1.000000    0.0          0.0      1.0
         5  32         0  0.746479    0.0          0.5      1.0
         6  38         1  0.366197    0.0          0.5      1.0
         7  12         1  0.835681    1.0          1.0      0.0
         8   6         1  0.765258    0.5          0.0      1.0
         9  36         0  0.530516    0.0          1.0      0.0,
         'DF_1':    id  admitted       gpa  stats  programming  masters
         0   4         1  0.765258    0.5          1.0      1.0
         1   6         1  0.765258    0.5          0.0      1.0
         2   7         1  0.215962    1.0          1.0      1.0
         3   8         1  0.812207    0.5          0.0      0.0
         4  10         1  0.863850    0.0          0.0      0.0
         5  11         1  0.591549    0.0          0.0      0.0
         6   9         1  0.915493    0.0          0.0      0.0
         7   5         0  0.737089    1.0          1.0      0.0
         8   3         1  0.859155    1.0          0.5      0.0
         9   2         0  0.887324    0.5          0.5      1.0,
         'DF_2':    id  admitted       gpa  stats  programming  masters
         0  23         1  0.807512    0.0          1.0      1.0
         1  25         1  0.981221    0.0          0.0      0.0
         2  26         1  0.798122    0.0          0.0      1.0
         3  27         0  0.981221    0.0          0.0      1.0
         4  29         0  1.000000    1.0          0.5      1.0
         5  30         0  0.901408    0.0          1.0      1.0
         6  28         1  0.967136    0.0          0.0      0.0
         7  24         1  0.000000    0.0          1.0      0.0
         8  22         0  0.746479    1.0          0.5      1.0
         9  21         1  0.938967    1.0          0.5      0.0}
    """
    if self.__is_trainable:
        # The "data" argument may be omitted only when data has already
        # been supplied with the hyperparameters; validate accordingly.
        data_already_supplied = self.__model_trainer_input_data is not None
        self.__validate_model_trainer_input_data_argument(data,
                                                          data_already_supplied)

        # Data passed to fit() takes precedence over data passed earlier
        # through "params": the latest value is used for model training.
        if data is not None:
            self.__model_trainer_input_data = data

    # Only trainable, evaluatable SQLE functions get their data labeled.
    if not (self.__is_trainable
            and self.__is_evaluatable
            and self.__is_sqle_function):
        return

    self._labeled_data = self._add_data_label()
|
|
2060
2251
|
|
|
2061
2252
|
|
|
2062
2253
|
class GridSearch(_BaseSearch):
|
|
@@ -2659,9 +2850,8 @@ class GridSearch(_BaseSearch):
|
|
|
2659
2850
|
|
|
2660
2851
|
"""
|
|
2661
2852
|
|
|
2662
|
-
self.__params = params
|
|
2853
|
+
self.__params = params.copy()
|
|
2663
2854
|
super().__init__(func=func, params=self.__params)
|
|
2664
|
-
|
|
2665
2855
|
# Populate parameter grid from provided parameter space.
|
|
2666
2856
|
self.__populate_params_grid()
|
|
2667
2857
|
|
|
@@ -2688,87 +2878,381 @@ class GridSearch(_BaseSearch):
|
|
|
2688
2878
|
# Since GridSearch works on all parameter combinations. Set
|
|
2689
2879
|
# all the parameter combinations to the parameter grid.
|
|
2690
2880
|
self._parameter_grid = self._BaseSearch__populate_parameter_grid()
|
|
2881
|
+
|
|
2691
2882
|
|
|
2692
|
-
|
|
2693
|
-
|
|
2694
|
-
|
|
2883
|
+
def fit(self,
        data=None,
        evaluation_metric=None,
        early_stop=None,
        frac=0.8,
        run_parallel=True,
        wait=True,
        verbose=0,
        stratify_column=None,
        sample_id_column=None,
        sample_seed=None,
        max_time=None,
        **kwargs):
    """
    DESCRIPTION:
        Function to perform hyperparameter tuning using GridSearch algorithm.
        Notes:
            * In the Model trainer function, the best parameters are
              selected based on training results.
            * In the Non model trainer function, first execution parameter
              set is selected as the best parameters.

    PARAMETERS:
        data:
            Optional Argument.
            Specifies the input teradataml DataFrame for model trainer function.
            Notes:
                * DataFrame need not be passed to fit() method when "data" is
                  passed as a model hyperparameter ("params").
                * "data" is a required argument for model trainer functions
                  when it is not passed as a model hyperparameter.
                * "data" is ignored for non-model trainer functions.
                * "data" can contain a single DataFrame or multiple DataFrames.
                * One can pass multiple dataframes to "data". Hyperparameter
                  tuning is performed on all the dataframes for every model
                  parameter.
                * "data" can be either a dictionary OR a tuple OR a dataframe.
                    * If it is a dictionary then Key represents the label for
                      dataframe and Value represents the dataframe.
                    * If it is a tuple then teradataml converts it to dictionary
                      by generating the labels internally.
                    * If it is a dataframe then teradataml labels it as "DF_0".
            Types: teradataml DataFrame, dictionary, tuples

        evaluation_metric:
            Optional Argument.
            Specifies the evaluation metric to be considered for model
            evaluation.
            Notes:
                * evaluation_metric is applicable for model trainer functions.
                * Best model is not selected when evaluation returns
                  non-finite values.
            Permitted Values:
                * Classification: Accuracy, Micro-Precision, Micro-Recall,
                                  Micro-F1, Macro-Precision, Macro-Recall,
                                  Macro-F1, Weighted-Precision,
                                  Weighted-Recall,
                                  Weighted-F1.
                * Regression: MAE, MSE, MSLE, MAPE, MPE, RMSE, RMSLE, ME,
                              R2, EV, MPD, MGD

            Default Value:
                * Classification: Accuracy
                * Regression: MAE
            Types: str

        early_stop:
            Optional Argument.
            Specifies the early stop mechanism value for model trainer
            functions. Hyperparameter tuning ends model training when
            the training model evaluation metric attains "early_stop" value.
            Note:
                * Early stopping is supported only when evaluation returns
                  finite value.
            Types: int or float

        frac:
            Optional Argument.
            Specifies the split percentage of rows to be sampled for training
            and testing dataset. "frac" argument value must range between (0, 1).
            Notes:
                * This "frac" argument is not supported for non-model trainer
                  function.
                * The "frac" value is considered as train split percentage and
                  the remaining percentage is taken into account for test splitting.
            Default Value: 0.8
            Types: float

        run_parallel:
            Optional Argument.
            Specifies the parallel execution functionality of hyperparameter
            tuning. When "run_parallel" is set to True, model functions are
            executed concurrently. Otherwise, model functions are executed
            sequentially.
            Default Value: True
            Types: bool

        wait:
            Optional Argument.
            Specifies whether to wait for the completion of execution
            of hyperparameter tuning or not. When set to False, hyperparameter
            tuning is executed in the background and user can use "is_running()"
            method to check the status. Otherwise it waits until the execution
            is complete to return the control back to user.
            Default Value: True
            Types: bool

        verbose:
            Optional Argument.
            Specifies whether to log the model training information and display
            the logs. When it is set to 1, progress bar alone is logged in the
            console. When it is set to 2, along with progress bar, execution
            steps and execution time are logged in the console. When it is set
            to 0, nothing is logged in the console.
            Note:
                * verbose is not significant when "wait" is 'False'.
            Permitted Values: 0, 1, 2
            Default Value: 0
            Types: int

        sample_seed:
            Optional Argument.
            Specifies the seed value that controls the shuffling applied
            to the data before applying the Train-Test split. Pass an int for
            reproducible output across multiple function calls.
            Notes:
                * When the argument is not specified, different
                  runs of the query generate different outputs.
                * It must be in the range [0, 2147483647]
                * Seed is supported for stratify column.
            Types: int

        stratify_column:
            Optional Argument.
            Specifies column name that contains the labels indicating
            which data needs to be stratified for TrainTest split.
            Notes:
                * seed is supported for stratify column.
            Types: str

        sample_id_column:
            Optional Argument.
            Specifies the input data column name that has the
            unique identifier for each row in the input.
            Note:
                * Mandatory when "sample_seed" argument is present.
            Types: str

        max_time:
            Optional Argument.
            Specifies the maximum time, in seconds, for the completion of
            Hyperparameter tuning execution.
            Default Value: None
            Types: int or float

        kwargs:
            Optional Argument.
            Specifies the keyword arguments. Accepts additional arguments
            required for the teradataml analytic function.
            Note:
                * The "discard_invalid_column_params" keyword is also read
                  from "kwargs" (default False). When it is true, the model
                  trainer data is set and mapped against "input_columns"
                  before the base class fit() is invoked.

    RETURNS:
        None

    RAISES:
        TeradataMlException, TypeError, ValueError

    EXAMPLES:
        >>> # Create an instance of the GridSearch algorithm called "optimizer_obj"
        >>> optimizer_obj = GridSearch(func=SVM, params=params)

        >>> eval_params = {"id_column": "id",
                           "accumulate": "MedHouseVal"}
        >>> # Example 1: Passing single DataFrame for model trainer function.
        >>> optimizer_obj.fit(data=train_df,
                              evaluation_metric="MAE",
                              early_stop=70.9,
                              **eval_params)

        >>> # Example 2: Passing multiple datasets as tuple of DataFrames for
        >>> #            model trainer function.
        >>> optimizer_obj.fit(data=(train_df_1, train_df_2),
                              evaluation_metric="MAE",
                              early_stop=70.9,
                              **eval_params)

        >>> # Example 3: Passing multiple datasets as dictionary of DataFrames
        >>> #            for model trainer function.
        >>> optimizer_obj.fit(data={"Data-1":train_df_1, "Data-2":train_df_2},
                              evaluation_metric="MAE",
                              early_stop=70.9,
                              **eval_params)

        >>> # Example 4: No data argument passed in fit() method for model trainer function.
        >>> #            Note: data argument must be passed while creating HPT object as
        >>> #                  model hyperparameters.

        >>> # Define parameter space for model training with "data" argument.
        >>> params = {"data":(df1, df2),
                      "input_columns":['MedInc', 'HouseAge', 'AveRooms',
                                       'AveBedrms', 'Population', 'AveOccup',
                                       'Latitude', 'Longitude'],
                      "response_column":"MedHouseVal",
                      "model_type":"regression",
                      "batch_size":(11, 50, 75),
                      "iter_max":(100, 301),
                      "intercept":False,
                      "learning_rate":"INVTIME",
                      "nesterov_optimization":True,
                      "local_sgd_iterations":1}

        >>> # Create "optimizer_obj" using GridSearch algorithm and perform
        >>> # fit() method without any "data" argument for model trainer function.
        >>> optimizer_obj.fit(evaluation_metric="MAE",
                              early_stop=70.9,
                              **eval_params)

        >>> # Example 5: Do not pass data argument in fit() method for
        >>> #            non-model trainer function.
        >>> #            Note: data argument must be passed while creating HPT
        >>> #                  object as model hyperparameters.
        >>> optimizer_obj.fit()

        >>> # Example 6: Passing "verbose" argument value '1' in fit() method to
        >>> #            display model log.
        >>> optimizer_obj.fit(data=train_df, evaluation_metric="R2",
                              verbose=1, **eval_params)
        completed: |████████████████████████████████████████████████████████████| 100% - 6/6

        >>> # Example 7: max_time argument is passed in fit() method.
        >>> # Model training parameters
        >>> model_params = {"input_columns":['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
        ...                 "response_column" :'species',
        ...                 "max_depth":(5,10,15),
        ...                 "lambda1" :(1000.0,0.001),
        ...                 "model_type" :"Classification",
        ...                 "seed":32,
        ...                 "shrinkage_factor":0.1,
        ...                 "iter_num":(5, 50)}
        >>>
        >>> eval_params = {"id_column": "id",
        ...                "accumulate":"species",
        ...                "model_type":'Classification',
        ...                "object_order_column":['task_index', 'tree_num', 'iter','class_num', 'tree_order']
        ...               }
        >>>
        >>> # Import model trainer function and optimizer.
        >>> from teradataml import XGBoost, GridSearch
        >>>
        >>> # Initialize the GridSearch optimizer with model trainer
        >>> # function and parameter space required for model training.
        >>> gs_obj = GridSearch(func=XGBoost, params=model_params)
        >>>
        >>> # fit() method with max_time argument(in seconds) for model trainer function.
        >>> gs_obj.fit(data=data, max_time=30, verbose=2, **eval_params)
            Model_id:XGBOOST_2 - Run time:33.277s - Status:PASS - ACCURACY:0.933
            Model_id:XGBOOST_3 - Run time:33.276s - Status:PASS - ACCURACY:0.933
            Model_id:XGBOOST_0 - Run time:33.279s - Status:PASS - ACCURACY:0.967
            Model_id:XGBOOST_1 - Run time:33.278s - Status:PASS - ACCURACY:0.933
            Computing: |⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾| 33% - 4/12
        >>>
        >>> # status 'SKIP' for the models which are not completed within the max_time.
        >>> gs_obj.models
                MODEL_ID DATA_ID                                         PARAMETERS STATUS  ACCURACY
            0   XGBOOST_2    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.933333
            1   XGBOOST_4    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            2   XGBOOST_5    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            3   XGBOOST_6    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            4   XGBOOST_7    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            5   XGBOOST_8    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            6   XGBOOST_9    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            7  XGBOOST_10    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            8  XGBOOST_11    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            9   XGBOOST_3    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.933333
            10  XGBOOST_0    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.966667
            11  XGBOOST_1    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.933333
    """

    # Set the flag to discard invalid column parameters. The key is only
    # read here, so it stays in "kwargs" and is forwarded to the base fit().
    self.discard_invalid_column_params = kwargs.get("discard_invalid_column_params", False)

    if self.discard_invalid_column_params:
        # Setting model trainer input data.
        super()._setting_model_trainer_data(data)
        # Data mapping for model trainer function.
        super()._data_mapping()
        # Shadow both methods on this instance with no-ops so that the base
        # class fit() does not repeat the data setup or run the parameter
        # grid update - presumably to keep the grid built by _data_mapping().
        # NOTE(review): these instance-level overrides are never removed, so
        # they stay in effect for any later fit() call on this object, even
        # one made without "discard_invalid_column_params" - confirm intended.
        self._setting_model_trainer_data = lambda data: None
        self._BaseSearch__update_model_parameters = lambda: None

    # Calling baseSearch class fit method.
    super().fit(data, evaluation_metric,
                early_stop, frac, run_parallel,
                wait, verbose, stratify_column,
                sample_id_column, sample_seed,
                max_time, **kwargs)
|
|
3175
|
+
|
|
3176
|
+
|
|
3177
|
+
class RandomSearch(_BaseSearch):
|
|
3178
|
+
def __init__(self, func, params, n_iter=10, **kwargs):
|
|
3179
|
+
"""
|
|
3180
|
+
DESCRIPTION:
|
|
3181
|
+
RandomSearch algorithm performs random sampling on hyperparameter
|
|
3182
|
+
space to identify optimal hyperparameters. It works for
|
|
3183
|
+
teradataml analytic functions from SQLE, BYOM, VAL and UAF features.
|
|
3184
|
+
teradataml RandomSearch allows user to perform hyperparameter tuning for
|
|
3185
|
+
all model trainer and non-model trainer functions.
|
|
3186
|
+
When used for model trainer functions:
|
|
3187
|
+
* Based on evaluation metrics search determines best model.
|
|
3188
|
+
* All methods and properties can be used.
|
|
3189
|
+
When used for non-model trainer functions:
|
|
3190
|
+
* Only fit() method is supported.
|
|
3191
|
+
* User can choose the best output as they see fit to use this.
|
|
3192
|
+
|
|
3193
|
+
teradataml RandomSearch also allows user to use input data as the
|
|
3194
|
+
hyperparameter. This option can be suitable when the user wants to
|
|
3195
|
+
identify the best models for a set of input data. When user passes
|
|
3196
|
+
set of data as hyperparameter for model trainer function, the search
|
|
3197
|
+
determines the best data along with the best model based on the
|
|
3198
|
+
evaluation metrics.
|
|
3199
|
+
|
|
3200
|
+
PARAMETERS:
|
|
3201
|
+
func:
|
|
3202
|
+
Required Argument.
|
|
3203
|
+
Specifies a teradataml analytic function from SQLE, VAL, and UAF.
|
|
3204
|
+
Types:
|
|
3205
|
+
teradataml Analytic Functions
|
|
3206
|
+
* Advanced analytic functions
|
|
3207
|
+
* UAF
|
|
3208
|
+
* VAL
|
|
3209
|
+
Refer to display_analytic_functions() function for list of functions.
|
|
3210
|
+
|
|
3211
|
+
params:
|
|
3212
|
+
Required Argument.
|
|
3213
|
+
Specifies the parameter(s) of a teradataml analytic function.
|
|
3214
|
+
The parameter(s) must be in a dictionary. Keys refer to the
|
|
3215
|
+
argument names and values refer to argument values for corresponding
|
|
3216
|
+
arguments.
|
|
3217
|
+
Notes:
|
|
3218
|
+
* One can specify the argument value in a tuple to run HPT
|
|
3219
|
+
with different arguments.
|
|
3220
|
+
* Model trainer function arguments "id_column", "input_columns",
|
|
3221
|
+
and "target_columns" must be passed in fit() method.
|
|
3222
|
+
* All required arguments of non-model trainer function must be
|
|
3223
|
+
passed while RandomSearch object creation.
|
|
3224
|
+
Types: dict
|
|
3225
|
+
|
|
3226
|
+
n_iter:
|
|
3227
|
+
Optional Argument.
|
|
3228
|
+
Specifies the number of iterations random search needs to perform.
|
|
3229
|
+
Note:
|
|
3230
|
+
* n_iter must be less than or equal to the size of parameter populations.
|
|
3231
|
+
Default Value: 10
|
|
3232
|
+
Types: int
|
|
3233
|
+
|
|
3234
|
+
RETURNS:
|
|
3235
|
+
None
|
|
3236
|
+
|
|
3237
|
+
RAISES:
|
|
3238
|
+
TeradataMlException, TypeError, ValueError
|
|
3239
|
+
|
|
3240
|
+
EXAMPLES:
|
|
3241
|
+
>>> # Example 1: Model trainer function. Performing hyperparameter-tuning
|
|
3242
|
+
>>> # on SVM model trainer function using random search algorithm.
|
|
3243
|
+
|
|
3244
|
+
>>> # Load the example data.
|
|
3245
|
+
>>> load_example_data("teradataml", ["cal_housing_ex_raw"])
|
|
3246
|
+
|
|
3247
|
+
>>> # Create teradataml DataFrame objects.
|
|
3248
|
+
>>> data_input = DataFrame.from_table("cal_housing_ex_raw")
|
|
3249
|
+
|
|
3250
|
+
>>> # Scale "target_columns" with respect to 'STD' value of the column.
|
|
3251
|
+
>>> fit_obj = ScaleFit(data=data_input,
|
|
3252
|
+
target_columns=['MedInc', 'HouseAge', 'AveRooms',
|
|
3253
|
+
'AveBedrms', 'Population', 'AveOccup',
|
|
3254
|
+
'Latitude', 'Longitude'],
|
|
3255
|
+
scale_method="STD")
|
|
2772
3256
|
|
|
2773
3257
|
>>> # Transform the data.
|
|
2774
3258
|
>>> transform_obj = ScaleTransform(data=data_input,
|
|
@@ -2953,7 +3437,7 @@ class RandomSearch(_BaseSearch):
|
|
|
2953
3437
|
|
|
2954
3438
|
"""
|
|
2955
3439
|
|
|
2956
|
-
self.__params = params
|
|
3440
|
+
self.__params = params.copy()
|
|
2957
3441
|
super().__init__(func=func, params=self.__params)
|
|
2958
3442
|
# Validate argument 'n_iter'
|
|
2959
3443
|
awu_matrix = []
|
|
@@ -2964,10 +3448,9 @@ class RandomSearch(_BaseSearch):
|
|
|
2964
3448
|
# Validates the range of n_iter should be greater than or equal to 1 and
|
|
2965
3449
|
# less than or equal to parameter space.
|
|
2966
3450
|
_Validators._validate_argument_range(n_iter, "n_iter", 1, len(parameter_space), True, True)
|
|
3451
|
+
self._n_iter = n_iter
|
|
2967
3452
|
|
|
2968
|
-
|
|
2969
|
-
|
|
2970
|
-
def __populate_params_grid(self, n_iter, parameter_space):
|
|
3453
|
+
def __populate_params_grid(self):
|
|
2971
3454
|
"""
|
|
2972
3455
|
DESCRIPTION:
|
|
2973
3456
|
Populate parameter grid based on the search algorithm. In random search,
|
|
@@ -2988,6 +3471,313 @@ class RandomSearch(_BaseSearch):
|
|
|
2988
3471
|
EXAMPLES:
|
|
2989
3472
|
>>> self.__populate_params_grid()
|
|
2990
3473
|
"""
|
|
2991
|
-
|
|
2992
3474
|
# Populate the parameter space with random and non-repetitive values
|
|
2993
|
-
self.
|
|
3475
|
+
if self.discard_invalid_column_params:
|
|
3476
|
+
# Defining the empty data_grouped_dict to group the parameters based on data_id.
|
|
3477
|
+
data_grouped_dict = defaultdict(list)
|
|
3478
|
+
for parameter in self._parameter_grid:
|
|
3479
|
+
# Extracting the data_id from the parameter.
|
|
3480
|
+
data_id = parameter['data_id']
|
|
3481
|
+
# Grouping the parameters based on data_id.
|
|
3482
|
+
data_grouped_dict[data_id].append(parameter)
|
|
3483
|
+
# Converting the grouped dictionary to list.
|
|
3484
|
+
data_grouped_dict = list(data_grouped_dict.values())
|
|
3485
|
+
parameter_grid = []
|
|
3486
|
+
for group in data_grouped_dict:
|
|
3487
|
+
# Randomly selecting the n_iter parameters from the grouped data.
|
|
3488
|
+
tmp = random.sample(group, self._n_iter)
|
|
3489
|
+
parameter_grid.extend(tmp)
|
|
3490
|
+
|
|
3491
|
+
# Setting the parameter grid.
|
|
3492
|
+
self._parameter_grid = parameter_grid
|
|
3493
|
+
else:
|
|
3494
|
+
self._parameter_grid = random.sample(self.get_parameter_grid(), self._n_iter)
|
|
3495
|
+
|
|
3496
|
+
def fit(self,
|
|
3497
|
+
data=None,
|
|
3498
|
+
evaluation_metric=None,
|
|
3499
|
+
early_stop=None,
|
|
3500
|
+
frac=0.8,
|
|
3501
|
+
run_parallel=True,
|
|
3502
|
+
wait=True,
|
|
3503
|
+
verbose=0,
|
|
3504
|
+
stratify_column=None,
|
|
3505
|
+
sample_id_column=None,
|
|
3506
|
+
sample_seed=None,
|
|
3507
|
+
max_time=None,
|
|
3508
|
+
**kwargs):
|
|
3509
|
+
"""
|
|
3510
|
+
DESCRIPTION:
|
|
3511
|
+
Function to perform hyperparameter tuning using RandomSearch algorithm.
|
|
3512
|
+
Notes:
|
|
3513
|
+
* In the Model trainer function, the best parameters are
|
|
3514
|
+
selected based on training results.
|
|
3515
|
+
* In the non-model trainer function, the first execution parameter
|
|
3516
|
+
set is selected as the best parameters.
|
|
3517
|
+
|
|
3518
|
+
PARAMETERS:
|
|
3519
|
+
data:
|
|
3520
|
+
Optional Argument.
|
|
3521
|
+
Specifies the input teradataml DataFrame for model trainer function.
|
|
3522
|
+
Notes:
|
|
3523
|
+
* DataFrame need not be passed in fit() method when "data" is
|
|
3524
|
+
passed as a model hyperparameter ("params").
|
|
3525
|
+
* "data" is a required argument for model trainer functions.
|
|
3526
|
+
* "data" is ignored for non-model trainer functions.
|
|
3527
|
+
* "data" can contain a single DataFrame or multiple DataFrames.
|
|
3528
|
+
* One can pass multiple dataframes to "data". Hyperparameter
|
|
3529
|
+
tuning is performed on all the dataframes for every model
|
|
3530
|
+
parameter.
|
|
3531
|
+
* "data" can be either a dictionary OR a tuple OR a dataframe.
|
|
3532
|
+
* If it is a dictionary then Key represents the label for
|
|
3533
|
+
dataframe and Value represents the dataframe.
|
|
3534
|
+
* If it is a tuple then teradataml converts it to dictionary
|
|
3535
|
+
by generating the labels internally.
|
|
3536
|
+
* If it is a dataframe then teradataml labels it as "DF_0".
|
|
3537
|
+
Types: teradataml DataFrame, dictionary, tuples
|
|
3538
|
+
|
|
3539
|
+
evaluation_metric:
|
|
3540
|
+
Optional Argument.
|
|
3541
|
+
Specifies the evaluation metric to be considered for model
|
|
3542
|
+
evaluation.
|
|
3543
|
+
Notes:
|
|
3544
|
+
* evaluation_metric is applicable for model trainer functions.
|
|
3545
|
+
* Best model is not selected when evaluation returns
|
|
3546
|
+
non-finite values.
|
|
3547
|
+
Permitted Values:
|
|
3548
|
+
* Classification: Accuracy, Micro-Precision, Micro-Recall,
|
|
3549
|
+
Micro-F1, Macro-Precision, Macro-Recall,
|
|
3550
|
+
Macro-F1, Weighted-Precision,
|
|
3551
|
+
Weighted-Recall,
|
|
3552
|
+
Weighted-F1.
|
|
3553
|
+
* Regression: MAE, MSE, MSLE, MAPE, MPE, RMSE, RMSLE, ME,
|
|
3554
|
+
R2, EV, MPD, MGD
|
|
3555
|
+
|
|
3556
|
+
Default Value:
|
|
3557
|
+
* Classification: Accuracy
|
|
3558
|
+
* Regression: MAE
|
|
3559
|
+
Types: str
|
|
3560
|
+
|
|
3561
|
+
early_stop:
|
|
3562
|
+
Optional Argument.
|
|
3563
|
+
Specifies the early stop mechanism value for model trainer
|
|
3564
|
+
functions. Hyperparameter tuning ends model training when
|
|
3565
|
+
the training model evaluation metric attains "early_stop" value.
|
|
3566
|
+
Note:
|
|
3567
|
+
* Early stopping is supported only when evaluation returns
|
|
3568
|
+
finite value.
|
|
3569
|
+
Types: int or float
|
|
3570
|
+
|
|
3571
|
+
frac:
|
|
3572
|
+
Optional Argument.
|
|
3573
|
+
Specifies the split percentage of rows to be sampled for training
|
|
3574
|
+
and testing dataset. "frac" argument value must range between (0, 1).
|
|
3575
|
+
Notes:
|
|
3576
|
+
* This "frac" argument is not supported for non-model trainer
|
|
3577
|
+
function.
|
|
3578
|
+
* The "frac" value is considered as train split percentage and
|
|
3579
|
+
the remaining percentage is taken into account for test splitting.
|
|
3580
|
+
Default Value: 0.8
|
|
3581
|
+
Types: float
|
|
3582
|
+
|
|
3583
|
+
run_parallel:
|
|
3584
|
+
Optional Argument.
|
|
3585
|
+
Specifies the parallel execution functionality of hyperparameter
|
|
3586
|
+
tuning. When "run_parallel" is set to True, model functions are
|
|
3587
|
+
executed concurrently. Otherwise, model functions are executed
|
|
3588
|
+
sequentially.
|
|
3589
|
+
Default Value: True
|
|
3590
|
+
Types: bool
|
|
3591
|
+
|
|
3592
|
+
wait:
|
|
3593
|
+
Optional Argument.
|
|
3594
|
+
Specifies whether to wait for the completion of execution
|
|
3595
|
+
of hyperparameter tuning or not. When set to False, hyperparameter
|
|
3596
|
+
tuning is executed in the background and user can use "is_running()"
|
|
3597
|
+
method to check the status. Otherwise it waits until the execution
|
|
3598
|
+
is complete to return the control back to user.
|
|
3599
|
+
Default Value: True
|
|
3600
|
+
Types: bool
|
|
3601
|
+
|
|
3602
|
+
verbose:
|
|
3603
|
+
Optional Argument.
|
|
3604
|
+
Specifies whether to log the model training information and display
|
|
3605
|
+
the logs. When it is set to 1, progress bar alone is logged in the
|
|
3606
|
+
console. When it is set to 2, along with progress bar, execution
|
|
3607
|
+
steps and execution time are logged in the console. When it is set
|
|
3608
|
+
to 0, nothing is logged in the console.
|
|
3609
|
+
Note:
|
|
3610
|
+
* verbose is not significant when "wait" is 'False'.
|
|
3611
|
+
Default Value: 0
|
|
3612
|
+
Types: int
|
|
3613
|
+
|
|
3614
|
+
sample_seed:
|
|
3615
|
+
Optional Argument.
|
|
3616
|
+
Specifies the seed value that controls the shuffling applied
|
|
3617
|
+
to the data before applying the Train-Test split. Pass an int for
|
|
3618
|
+
reproducible output across multiple function calls.
|
|
3619
|
+
Notes:
|
|
3620
|
+
* When the argument is not specified, different
|
|
3621
|
+
runs of the query generate different outputs.
|
|
3622
|
+
* It must be in the range [0, 2147483647]
|
|
3623
|
+
* Seed is supported for stratify column.
|
|
3624
|
+
Types: int
|
|
3625
|
+
|
|
3626
|
+
stratify_column:
|
|
3627
|
+
Optional Argument.
|
|
3628
|
+
Specifies column name that contains the labels indicating
|
|
3629
|
+
which data needs to be stratified for TrainTest split.
|
|
3630
|
+
Notes:
|
|
3631
|
+
* seed is supported for stratify column.
|
|
3632
|
+
Types: str
|
|
3633
|
+
|
|
3634
|
+
sample_id_column:
|
|
3635
|
+
Optional Argument.
|
|
3636
|
+
Specifies the input data column name that has the
|
|
3637
|
+
unique identifier for each row in the input.
|
|
3638
|
+
Note:
|
|
3639
|
+
* Mandatory when "sample_seed" argument is present.
|
|
3640
|
+
Types: str
|
|
3641
|
+
|
|
3642
|
+
max_time:
|
|
3643
|
+
Optional Argument.
|
|
3644
|
+
Specifies the maximum time for the completion of Hyperparameter tuning execution.
|
|
3645
|
+
Default Value: None
|
|
3646
|
+
Types: int or float
|
|
3647
|
+
|
|
3648
|
+
kwargs:
|
|
3649
|
+
Optional Argument.
|
|
3650
|
+
Specifies the keyword arguments. Accepts additional arguments
|
|
3651
|
+
required for the teradataml analytic function.
|
|
3652
|
+
|
|
3653
|
+
RETURNS:
|
|
3654
|
+
None
|
|
3655
|
+
|
|
3656
|
+
RAISES:
|
|
3657
|
+
TeradataMlException, TypeError, ValueError
|
|
3658
|
+
|
|
3659
|
+
EXAMPLES:
|
|
3660
|
+
>>> # Create an instance of the RandomSearch algorithm called "optimizer_obj"
|
|
3661
|
+
>>> optimizer_obj = RandomSearch(func=SVM, params=params, n_iter=3)
|
|
3662
|
+
|
|
3663
|
+
>>> eval_params = {"id_column": "id",
|
|
3664
|
+
"accumulate": "MedHouseVal"}
|
|
3665
|
+
>>> # Example 1: Passing single DataFrame for model trainer function.
|
|
3666
|
+
>>> optimizer_obj.fit(data=train_df,
|
|
3667
|
+
evaluation_metric="MAE",
|
|
3668
|
+
early_stop=70.9,
|
|
3669
|
+
**eval_params)
|
|
3670
|
+
|
|
3671
|
+
>>> # Example 2: Passing multiple datasets as tuple of DataFrames for
|
|
3672
|
+
>>> # model trainer function.
|
|
3673
|
+
>>> optimizer_obj.fit(data=(train_df_1, train_df_2),
|
|
3674
|
+
evaluation_metric="MAE",
|
|
3675
|
+
early_stop=70.9,
|
|
3676
|
+
**eval_params)
|
|
3677
|
+
|
|
3678
|
+
>>> # Example 3: Passing multiple datasets as dictionary of DataFrames
|
|
3679
|
+
>>> # for model trainer function.
|
|
3680
|
+
>>> optimizer_obj.fit(data={"Data-1":train_df_1, "Data-2":train_df_2},
|
|
3681
|
+
evaluation_metric="MAE",
|
|
3682
|
+
early_stop=70.9,
|
|
3683
|
+
**eval_params)
|
|
3684
|
+
|
|
3685
|
+
>>> # Example 4: No data argument passed in fit() method for model trainer function.
|
|
3686
|
+
>>> # Note: data argument must be passed while creating HPT object as
|
|
3687
|
+
>>> # model hyperparameters.
|
|
3688
|
+
|
|
3689
|
+
>>> # Define parameter space for model training with "data" argument.
|
|
3690
|
+
>>> params = {"data":(df1, df2),
|
|
3691
|
+
"input_columns":['MedInc', 'HouseAge', 'AveRooms',
|
|
3692
|
+
'AveBedrms', 'Population', 'AveOccup',
|
|
3693
|
+
'Latitude', 'Longitude'],
|
|
3694
|
+
"response_column":"MedHouseVal",
|
|
3695
|
+
"model_type":"regression",
|
|
3696
|
+
"batch_size":(11, 50, 75),
|
|
3697
|
+
"iter_max":(100, 301),
|
|
3698
|
+
"intercept":False,
|
|
3699
|
+
"learning_rate":"INVTIME",
|
|
3700
|
+
"nesterov_optimization":True,
|
|
3701
|
+
"local_sgd_iterations":1}
|
|
3702
|
+
|
|
3703
|
+
>>> # Create "optimizer_obj" using RandomSearch algorithm and perform
|
|
3704
|
+
>>> # fit() method without any "data" argument for model trainer function.
|
|
3705
|
+
>>> optimizer_obj.fit(evaluation_metric="MAE",
|
|
3706
|
+
early_stop=70.9,
|
|
3707
|
+
**eval_params)
|
|
3708
|
+
|
|
3709
|
+
>>> # Example 5: Do not pass data argument in fit() method for
|
|
3710
|
+
>>> # non-model trainer function.
|
|
3711
|
+
>>> # Note: data argument must be passed while creating HPT
|
|
3712
|
+
>>> # object as model hyperparameters.
|
|
3713
|
+
>>> optimizer_obj.fit()
|
|
3714
|
+
|
|
3715
|
+
>>> # Example 6: Passing "verbose" argument value '1' in fit() method to
|
|
3716
|
+
>>> # display model log.
|
|
3717
|
+
>>> optimizer_obj.fit(data=train_df, evaluation_metric="R2",
|
|
3718
|
+
verbose=1, **eval_params)
|
|
3719
|
+
completed: |████████████████████████████████████████████████████████████| 100% - 6/6
|
|
3720
|
+
|
|
3721
|
+
>>> # Example 7: max_time argument is passed in fit() method.
|
|
3722
|
+
>>> # Model training parameters
|
|
3723
|
+
>>> model_params = {"input_columns":['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
|
|
3724
|
+
... "response_column" : 'species',
|
|
3725
|
+
... "max_depth":(5,10,15),
|
|
3726
|
+
... "lambda1" : (1000.0,0.001),
|
|
3727
|
+
... "model_type" :"Classification",
|
|
3728
|
+
... "seed":32,
|
|
3729
|
+
... "shrinkage_factor":0.1,
|
|
3730
|
+
... "iter_num":(5, 50)}
|
|
3731
|
+
>>>
|
|
3732
|
+
>>> eval_params = {"id_column": "id",
|
|
3733
|
+
... "accumulate": "species",
|
|
3734
|
+
... "model_type":'Classification',
|
|
3735
|
+
... "object_order_column":['task_index', 'tree_num', 'iter','class_num', 'tree_order']
|
|
3736
|
+
... }
|
|
3737
|
+
>>>
|
|
3738
|
+
>>> # Import model trainer and optimizer
|
|
3739
|
+
>>> from teradataml import XGBoost, RandomSearch
|
|
3740
|
+
>>>
|
|
3741
|
+
>>> # Initialize the RandomSearch optimizer with model trainer
|
|
3742
|
+
>>> # function and parameter space required for model training.
|
|
3743
|
+
>>> rs_obj = RandomSearch(func=XGBoost, params=model_params, n_iter=5)
|
|
3744
|
+
>>>
|
|
3745
|
+
>>> # fit() method with max_time argument(in seconds) for model trainer function.
|
|
3746
|
+
>>> rs_obj.fit(data=data, max_time=30, verbose=2, **eval_params)
|
|
3747
|
+
Model_id:XGBOOST_3 - Run time:28.292s - Status:PASS - ACCURACY:0.8
|
|
3748
|
+
Model_id:XGBOOST_0 - Run time:28.291s - Status:PASS - ACCURACY:0.867
|
|
3749
|
+
Model_id:XGBOOST_2 - Run time:28.289s - Status:PASS - ACCURACY:0.867
|
|
3750
|
+
Model_id:XGBOOST_1 - Run time:28.291s - Status:PASS - ACCURACY:0.867
|
|
3751
|
+
Computing: |⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾| 80% - 4/5
|
|
3752
|
+
>>>
|
|
3753
|
+
>>> # status 'SKIP' for the models which are not completed within the max_time.
|
|
3754
|
+
>>> rs_obj.models
|
|
3755
|
+
MODEL_ID DATA_ID PARAMETERS STATUS ACCURACY
|
|
3756
|
+
0 XGBOOST_3 DF_0 {'input_columns': ['sepal_length', 'sepal_widt... PASS 0.800000
|
|
3757
|
+
1 XGBOOST_4 DF_0 {'input_columns': ['sepal_length', 'sepal_widt... SKIP NaN
|
|
3758
|
+
2 XGBOOST_0 DF_0 {'input_columns': ['sepal_length', 'sepal_widt... PASS 0.866667
|
|
3759
|
+
3 XGBOOST_2 DF_0 {'input_columns': ['sepal_length', 'sepal_widt... PASS 0.866667
|
|
3760
|
+
4 XGBOOST_1 DF_0 {'input_columns': ['sepal_length', 'sepal_widt... PASS 0.866667
|
|
3761
|
+
"""
|
|
3762
|
+
|
|
3763
|
+
# Set discard_invalid_column_params flag.
|
|
3764
|
+
self.discard_invalid_column_params =kwargs.get("discard_invalid_column_params", False)
|
|
3765
|
+
|
|
3766
|
+
if self.discard_invalid_column_params:
|
|
3767
|
+
# Setting model trainer input data
|
|
3768
|
+
super()._setting_model_trainer_data(data)
|
|
3769
|
+
# Mapping the data with input columns
|
|
3770
|
+
super()._data_mapping()
|
|
3771
|
+
# Overriding these methods with no-op lambdas (each returns None).
|
|
3772
|
+
self._setting_model_trainer_data = lambda data: None
|
|
3773
|
+
self._BaseSearch__update_model_parameters = lambda: None
|
|
3774
|
+
|
|
3775
|
+
# Populate parameter grid.
|
|
3776
|
+
self.__populate_params_grid()
|
|
3777
|
+
|
|
3778
|
+
# Calling baseSearch class fit method.
|
|
3779
|
+
super().fit(data, evaluation_metric, early_stop,
|
|
3780
|
+
frac, run_parallel, wait, verbose,
|
|
3781
|
+
stratify_column, sample_id_column,
|
|
3782
|
+
sample_seed, max_time, **kwargs)
|
|
3783
|
+
|