teradataml-20.0.0.8-py3-none-any.whl
This diff shows the contents of a publicly available package version released to a supported registry. It is provided for informational purposes only and reflects the package exactly as it appears in its public registry. Every entry below is listed as `+N -0`, so this diff captures the full contents of the release rather than a delta against a prior version.
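As a quick orientation before the file listing, here is a minimal sketch of pulling this exact release and confirming the version string. It assumes a standard pip environment, and it assumes the top-level package exposes `__version__` populated from the `teradataml/_version.py` file that appears in the manifest; anything beyond import additionally requires a reachable Teradata Vantage system.

```python
# Minimal sketch: install the exact wheel version this diff describes,
# then confirm that the import reports the same version.
#
#   pip install teradataml==20.0.0.8

import teradataml

# _version.py appears in the manifest below; the top-level package is
# assumed to expose its version as __version__.
print(teradataml.__version__)  # expected: "20.0.0.8"
```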
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +2762 -0
- teradataml/__init__.py +78 -0
- teradataml/_version.py +11 -0
- teradataml/analytics/Transformations.py +2996 -0
- teradataml/analytics/__init__.py +82 -0
- teradataml/analytics/analytic_function_executor.py +2416 -0
- teradataml/analytics/analytic_query_generator.py +1050 -0
- teradataml/analytics/byom/H2OPredict.py +514 -0
- teradataml/analytics/byom/PMMLPredict.py +437 -0
- teradataml/analytics/byom/__init__.py +16 -0
- teradataml/analytics/json_parser/__init__.py +133 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +1805 -0
- teradataml/analytics/json_parser/json_store.py +191 -0
- teradataml/analytics/json_parser/metadata.py +1666 -0
- teradataml/analytics/json_parser/utils.py +805 -0
- teradataml/analytics/meta_class.py +236 -0
- teradataml/analytics/sqle/DecisionTreePredict.py +456 -0
- teradataml/analytics/sqle/NaiveBayesPredict.py +420 -0
- teradataml/analytics/sqle/__init__.py +128 -0
- teradataml/analytics/sqle/json/decisiontreepredict_sqle.json +78 -0
- teradataml/analytics/sqle/json/naivebayespredict_sqle.json +62 -0
- teradataml/analytics/table_operator/__init__.py +11 -0
- teradataml/analytics/uaf/__init__.py +82 -0
- teradataml/analytics/utils.py +828 -0
- teradataml/analytics/valib.py +1617 -0
- teradataml/automl/__init__.py +5835 -0
- teradataml/automl/autodataprep/__init__.py +493 -0
- teradataml/automl/custom_json_utils.py +1625 -0
- teradataml/automl/data_preparation.py +1384 -0
- teradataml/automl/data_transformation.py +1254 -0
- teradataml/automl/feature_engineering.py +2273 -0
- teradataml/automl/feature_exploration.py +1873 -0
- teradataml/automl/model_evaluation.py +488 -0
- teradataml/automl/model_training.py +1407 -0
- teradataml/catalog/__init__.py +2 -0
- teradataml/catalog/byom.py +1759 -0
- teradataml/catalog/function_argument_mapper.py +859 -0
- teradataml/catalog/model_cataloging_utils.py +491 -0
- teradataml/clients/__init__.py +0 -0
- teradataml/clients/auth_client.py +137 -0
- teradataml/clients/keycloak_client.py +165 -0
- teradataml/clients/pkce_client.py +481 -0
- teradataml/common/__init__.py +1 -0
- teradataml/common/aed_utils.py +2078 -0
- teradataml/common/bulk_exposed_utils.py +113 -0
- teradataml/common/constants.py +1669 -0
- teradataml/common/deprecations.py +166 -0
- teradataml/common/exceptions.py +147 -0
- teradataml/common/formula.py +743 -0
- teradataml/common/garbagecollector.py +666 -0
- teradataml/common/logger.py +1261 -0
- teradataml/common/messagecodes.py +518 -0
- teradataml/common/messages.py +262 -0
- teradataml/common/pylogger.py +67 -0
- teradataml/common/sqlbundle.py +764 -0
- teradataml/common/td_coltype_code_to_tdtype.py +48 -0
- teradataml/common/utils.py +3166 -0
- teradataml/common/warnings.py +36 -0
- teradataml/common/wrapper_utils.py +625 -0
- teradataml/config/__init__.py +0 -0
- teradataml/config/dummy_file1.cfg +5 -0
- teradataml/config/dummy_file2.cfg +3 -0
- teradataml/config/sqlengine_alias_definitions_v1.0 +14 -0
- teradataml/config/sqlengine_alias_definitions_v1.1 +20 -0
- teradataml/config/sqlengine_alias_definitions_v1.3 +19 -0
- teradataml/context/__init__.py +0 -0
- teradataml/context/aed_context.py +223 -0
- teradataml/context/context.py +1462 -0
- teradataml/data/A_loan.csv +19 -0
- teradataml/data/BINARY_REALS_LEFT.csv +11 -0
- teradataml/data/BINARY_REALS_RIGHT.csv +11 -0
- teradataml/data/B_loan.csv +49 -0
- teradataml/data/BuoyData2.csv +17 -0
- teradataml/data/CONVOLVE2_COMPLEX_LEFT.csv +5 -0
- teradataml/data/CONVOLVE2_COMPLEX_RIGHT.csv +5 -0
- teradataml/data/Convolve2RealsLeft.csv +5 -0
- teradataml/data/Convolve2RealsRight.csv +5 -0
- teradataml/data/Convolve2ValidLeft.csv +11 -0
- teradataml/data/Convolve2ValidRight.csv +11 -0
- teradataml/data/DFFTConv_Real_8_8.csv +65 -0
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/Mall_customer_data.csv +201 -0
- teradataml/data/Orders1_12mf.csv +25 -0
- teradataml/data/Pi_loan.csv +7 -0
- teradataml/data/SMOOTHED_DATA.csv +7 -0
- teradataml/data/TestDFFT8.csv +9 -0
- teradataml/data/TestRiver.csv +109 -0
- teradataml/data/Traindata.csv +28 -0
- teradataml/data/__init__.py +0 -0
- teradataml/data/acf.csv +17 -0
- teradataml/data/adaboost_example.json +34 -0
- teradataml/data/adaboostpredict_example.json +24 -0
- teradataml/data/additional_table.csv +11 -0
- teradataml/data/admissions_test.csv +21 -0
- teradataml/data/admissions_train.csv +41 -0
- teradataml/data/admissions_train_nulls.csv +41 -0
- teradataml/data/advertising.csv +201 -0
- teradataml/data/ageandheight.csv +13 -0
- teradataml/data/ageandpressure.csv +31 -0
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/antiselect_example.json +36 -0
- teradataml/data/antiselect_input.csv +8 -0
- teradataml/data/antiselect_input_mixed_case.csv +8 -0
- teradataml/data/applicant_external.csv +7 -0
- teradataml/data/applicant_reference.csv +7 -0
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/arima_example.json +9 -0
- teradataml/data/assortedtext_input.csv +8 -0
- teradataml/data/attribution_example.json +34 -0
- teradataml/data/attribution_sample_table.csv +27 -0
- teradataml/data/attribution_sample_table1.csv +6 -0
- teradataml/data/attribution_sample_table2.csv +11 -0
- teradataml/data/bank_churn.csv +10001 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bank_web_clicks1.csv +43 -0
- teradataml/data/bank_web_clicks2.csv +91 -0
- teradataml/data/bank_web_url.csv +85 -0
- teradataml/data/barrier.csv +2 -0
- teradataml/data/barrier_new.csv +3 -0
- teradataml/data/betweenness_example.json +14 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/bin_breaks.csv +8 -0
- teradataml/data/bin_fit_ip.csv +4 -0
- teradataml/data/binary_complex_left.csv +11 -0
- teradataml/data/binary_complex_right.csv +11 -0
- teradataml/data/binary_matrix_complex_left.csv +21 -0
- teradataml/data/binary_matrix_complex_right.csv +21 -0
- teradataml/data/binary_matrix_real_left.csv +21 -0
- teradataml/data/binary_matrix_real_right.csv +21 -0
- teradataml/data/blood2ageandweight.csv +26 -0
- teradataml/data/bmi.csv +501 -0
- teradataml/data/boston.csv +507 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/buoydata_mix.csv +11 -0
- teradataml/data/burst_data.csv +5 -0
- teradataml/data/burst_example.json +21 -0
- teradataml/data/byom_example.json +34 -0
- teradataml/data/bytes_table.csv +4 -0
- teradataml/data/cal_housing_ex_raw.csv +70 -0
- teradataml/data/callers.csv +7 -0
- teradataml/data/calls.csv +10 -0
- teradataml/data/cars_hist.csv +33 -0
- teradataml/data/cat_table.csv +25 -0
- teradataml/data/ccm_example.json +32 -0
- teradataml/data/ccm_input.csv +91 -0
- teradataml/data/ccm_input2.csv +13 -0
- teradataml/data/ccmexample.csv +101 -0
- teradataml/data/ccmprepare_example.json +9 -0
- teradataml/data/ccmprepare_input.csv +91 -0
- teradataml/data/cfilter_example.json +12 -0
- teradataml/data/changepointdetection_example.json +18 -0
- teradataml/data/changepointdetectionrt_example.json +8 -0
- teradataml/data/chi_sq.csv +3 -0
- teradataml/data/churn_data.csv +14 -0
- teradataml/data/churn_emission.csv +35 -0
- teradataml/data/churn_initial.csv +3 -0
- teradataml/data/churn_state_transition.csv +5 -0
- teradataml/data/citedges_2.csv +745 -0
- teradataml/data/citvertices_2.csv +1210 -0
- teradataml/data/clicks2.csv +16 -0
- teradataml/data/clickstream.csv +13 -0
- teradataml/data/clickstream1.csv +11 -0
- teradataml/data/closeness_example.json +16 -0
- teradataml/data/complaints.csv +21 -0
- teradataml/data/complaints_mini.csv +3 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_testtoken.csv +224 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/complaints_tokens_test.csv +353 -0
- teradataml/data/complaints_traintoken.csv +472 -0
- teradataml/data/computers_category.csv +1001 -0
- teradataml/data/computers_test1.csv +1252 -0
- teradataml/data/computers_train1.csv +5009 -0
- teradataml/data/computers_train1_clustered.csv +5009 -0
- teradataml/data/confusionmatrix_example.json +9 -0
- teradataml/data/conversion_event_table.csv +3 -0
- teradataml/data/corr_input.csv +17 -0
- teradataml/data/correlation_example.json +11 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/coxhazardratio_example.json +39 -0
- teradataml/data/coxph_example.json +15 -0
- teradataml/data/coxsurvival_example.json +28 -0
- teradataml/data/cpt.csv +41 -0
- teradataml/data/credit_ex_merged.csv +45 -0
- teradataml/data/creditcard_data.csv +1001 -0
- teradataml/data/customer_loyalty.csv +301 -0
- teradataml/data/customer_loyalty_newseq.csv +31 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +173 -0
- teradataml/data/decisionforest_example.json +37 -0
- teradataml/data/decisionforestpredict_example.json +38 -0
- teradataml/data/decisiontree_example.json +21 -0
- teradataml/data/decisiontreepredict_example.json +45 -0
- teradataml/data/dfft2_size4_real.csv +17 -0
- teradataml/data/dfft2_test_matrix16.csv +17 -0
- teradataml/data/dfft2conv_real_4_4.csv +65 -0
- teradataml/data/diabetes.csv +443 -0
- teradataml/data/diabetes_test.csv +89 -0
- teradataml/data/dict_table.csv +5 -0
- teradataml/data/docperterm_table.csv +4 -0
- teradataml/data/docs/__init__.py +1 -0
- teradataml/data/docs/byom/__init__.py +0 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +180 -0
- teradataml/data/docs/byom/docs/DataikuPredict.py +217 -0
- teradataml/data/docs/byom/docs/H2OPredict.py +325 -0
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +283 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/byom/docs/PMMLPredict.py +278 -0
- teradataml/data/docs/byom/docs/__init__.py +0 -0
- teradataml/data/docs/sqle/__init__.py +0 -0
- teradataml/data/docs/sqle/docs_17_10/Antiselect.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/Attribution.py +200 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeFit.py +172 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeTransform.py +131 -0
- teradataml/data/docs/sqle/docs_17_10/CategoricalSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_10/ChiSq.py +90 -0
- teradataml/data/docs/sqle/docs_17_10/ColumnSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_10/ConvertTo.py +96 -0
- teradataml/data/docs/sqle/docs_17_10/DecisionForestPredict.py +139 -0
- teradataml/data/docs/sqle/docs_17_10/DecisionTreePredict.py +152 -0
- teradataml/data/docs/sqle/docs_17_10/FTest.py +161 -0
- teradataml/data/docs/sqle/docs_17_10/FillRowId.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/Fit.py +88 -0
- teradataml/data/docs/sqle/docs_17_10/GLMPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_10/GetRowsWithMissingValues.py +85 -0
- teradataml/data/docs/sqle/docs_17_10/GetRowsWithoutMissingValues.py +82 -0
- teradataml/data/docs/sqle/docs_17_10/Histogram.py +165 -0
- teradataml/data/docs/sqle/docs_17_10/MovingAverage.py +134 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +209 -0
- teradataml/data/docs/sqle/docs_17_10/NPath.py +266 -0
- teradataml/data/docs/sqle/docs_17_10/NaiveBayesPredict.py +116 -0
- teradataml/data/docs/sqle/docs_17_10/NaiveBayesTextClassifierPredict.py +176 -0
- teradataml/data/docs/sqle/docs_17_10/NumApply.py +147 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +135 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +109 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterFit.py +166 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/Pack.py +128 -0
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesFit.py +112 -0
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesTransform.py +102 -0
- teradataml/data/docs/sqle/docs_17_10/QQNorm.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/RoundColumns.py +110 -0
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeFit.py +118 -0
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeTransform.py +99 -0
- teradataml/data/docs/sqle/docs_17_10/SVMSparsePredict.py +153 -0
- teradataml/data/docs/sqle/docs_17_10/ScaleFit.py +197 -0
- teradataml/data/docs/sqle/docs_17_10/ScaleTransform.py +99 -0
- teradataml/data/docs/sqle/docs_17_10/Sessionize.py +114 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeFit.py +116 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeTransform.py +98 -0
- teradataml/data/docs/sqle/docs_17_10/StrApply.py +187 -0
- teradataml/data/docs/sqle/docs_17_10/StringSimilarity.py +146 -0
- teradataml/data/docs/sqle/docs_17_10/Transform.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/UnivariateStatistics.py +142 -0
- teradataml/data/docs/sqle/docs_17_10/Unpack.py +214 -0
- teradataml/data/docs/sqle/docs_17_10/WhichMax.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/WhichMin.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/ZTest.py +155 -0
- teradataml/data/docs/sqle/docs_17_10/__init__.py +0 -0
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +186 -0
- teradataml/data/docs/sqle/docs_17_20/Antiselect.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/Attribution.py +201 -0
- teradataml/data/docs/sqle/docs_17_20/BincodeFit.py +172 -0
- teradataml/data/docs/sqle/docs_17_20/BincodeTransform.py +139 -0
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/CategoricalSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_20/ChiSq.py +90 -0
- teradataml/data/docs/sqle/docs_17_20/ClassificationEvaluator.py +166 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +246 -0
- teradataml/data/docs/sqle/docs_17_20/ConvertTo.py +113 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionForest.py +280 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionForestPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionTreePredict.py +136 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +240 -0
- teradataml/data/docs/sqle/docs_17_20/FillRowId.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/Fit.py +88 -0
- teradataml/data/docs/sqle/docs_17_20/GLM.py +541 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPerSegment.py +415 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPredictPerSegment.py +233 -0
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +125 -0
- teradataml/data/docs/sqle/docs_17_20/GetRowsWithMissingValues.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/GetRowsWithoutMissingValues.py +106 -0
- teradataml/data/docs/sqle/docs_17_20/Histogram.py +224 -0
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +251 -0
- teradataml/data/docs/sqle/docs_17_20/KMeansPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/KNN.py +215 -0
- teradataml/data/docs/sqle/docs_17_20/MovingAverage.py +134 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +209 -0
- teradataml/data/docs/sqle/docs_17_20/NPath.py +266 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesPredict.py +116 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierPredict.py +177 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierTrainer.py +127 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineTransform.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/NumApply.py +147 -0
- teradataml/data/docs/sqle/docs_17_20/OneClassSVM.py +307 -0
- teradataml/data/docs/sqle/docs_17_20/OneClassSVMPredict.py +185 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +231 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/OrdinalEncodingFit.py +220 -0
- teradataml/data/docs/sqle/docs_17_20/OrdinalEncodingTransform.py +127 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +191 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterTransform.py +117 -0
- teradataml/data/docs/sqle/docs_17_20/Pack.py +128 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesFit.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesTransform.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/QQNorm.py +105 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +164 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionFit.py +155 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionMinComponents.py +106 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionTransform.py +120 -0
- teradataml/data/docs/sqle/docs_17_20/RegressionEvaluator.py +211 -0
- teradataml/data/docs/sqle/docs_17_20/RoundColumns.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeFit.py +118 -0
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeTransform.py +111 -0
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/SVM.py +414 -0
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +213 -0
- teradataml/data/docs/sqle/docs_17_20/SVMSparsePredict.py +153 -0
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +315 -0
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +202 -0
- teradataml/data/docs/sqle/docs_17_20/SentimentExtractor.py +206 -0
- teradataml/data/docs/sqle/docs_17_20/Sessionize.py +114 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +225 -0
- teradataml/data/docs/sqle/docs_17_20/Silhouette.py +153 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeFit.py +116 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeTransform.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/StrApply.py +187 -0
- teradataml/data/docs/sqle/docs_17_20/StringSimilarity.py +146 -0
- teradataml/data/docs/sqle/docs_17_20/TDDecisionForestPredict.py +207 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +333 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingFit.py +267 -0
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingTransform.py +141 -0
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +224 -0
- teradataml/data/docs/sqle/docs_17_20/TrainTestSplit.py +160 -0
- teradataml/data/docs/sqle/docs_17_20/Transform.py +123 -0
- teradataml/data/docs/sqle/docs_17_20/UnivariateStatistics.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpack.py +214 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/VectorDistance.py +169 -0
- teradataml/data/docs/sqle/docs_17_20/WhichMax.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/WhichMin.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/WordEmbeddings.py +237 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +362 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +281 -0
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +220 -0
- teradataml/data/docs/sqle/docs_17_20/__init__.py +0 -0
- teradataml/data/docs/tableoperator/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_00/ReadNOS.py +430 -0
- teradataml/data/docs/tableoperator/docs_17_00/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_05/ReadNOS.py +430 -0
- teradataml/data/docs/tableoperator/docs_17_05/WriteNOS.py +348 -0
- teradataml/data/docs/tableoperator/docs_17_05/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_10/ReadNOS.py +429 -0
- teradataml/data/docs/tableoperator/docs_17_10/WriteNOS.py +348 -0
- teradataml/data/docs/tableoperator/docs_17_10/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/tableoperator/docs_17_20/ReadNOS.py +440 -0
- teradataml/data/docs/tableoperator/docs_17_20/WriteNOS.py +387 -0
- teradataml/data/docs/tableoperator/docs_17_20/__init__.py +0 -0
- teradataml/data/docs/uaf/__init__.py +0 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +186 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +370 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +172 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +161 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BinaryMatrixOp.py +248 -0
- teradataml/data/docs/uaf/docs_17_20/BinarySeriesOp.py +252 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +178 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +175 -0
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +230 -0
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +218 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +185 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +204 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +216 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +216 -0
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +192 -0
- teradataml/data/docs/uaf/docs_17_20/DIFF.py +175 -0
- teradataml/data/docs/uaf/docs_17_20/DTW.py +180 -0
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +217 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +142 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +184 -0
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +185 -0
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/FitMetrics.py +172 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesFormula.py +206 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +143 -0
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +198 -0
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +260 -0
- teradataml/data/docs/uaf/docs_17_20/IDFFT.py +165 -0
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +191 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/InputValidator.py +121 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +156 -0
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +215 -0
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +174 -0
- teradataml/data/docs/uaf/docs_17_20/MInfo.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +191 -0
- teradataml/data/docs/uaf/docs_17_20/PACF.py +157 -0
- teradataml/data/docs/uaf/docs_17_20/Portman.py +217 -0
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +203 -0
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +155 -0
- teradataml/data/docs/uaf/docs_17_20/Resample.py +237 -0
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SInfo.py +123 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +173 -0
- teradataml/data/docs/uaf/docs_17_20/SelectionCriteria.py +174 -0
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/SignifResidmean.py +164 -0
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +180 -0
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +208 -0
- teradataml/data/docs/uaf/docs_17_20/TrackingOp.py +151 -0
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/Unnormalize.py +202 -0
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/docs/uaf/docs_17_20/__init__.py +0 -0
- teradataml/data/dtw_example.json +18 -0
- teradataml/data/dtw_t1.csv +11 -0
- teradataml/data/dtw_t2.csv +4 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt2d_example.json +16 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_example.json +15 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/dwt_filter_dim.csv +5 -0
- teradataml/data/emission.csv +9 -0
- teradataml/data/emp_table_by_dept.csv +19 -0
- teradataml/data/employee_info.csv +4 -0
- teradataml/data/employee_table.csv +6 -0
- teradataml/data/excluding_event_table.csv +2 -0
- teradataml/data/finance_data.csv +6 -0
- teradataml/data/finance_data2.csv +61 -0
- teradataml/data/finance_data3.csv +93 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/fish.csv +160 -0
- teradataml/data/fm_blood2ageandweight.csv +26 -0
- teradataml/data/fmeasure_example.json +12 -0
- teradataml/data/followers_leaders.csv +10 -0
- teradataml/data/fpgrowth_example.json +12 -0
- teradataml/data/frequentpaths_example.json +29 -0
- teradataml/data/friends.csv +9 -0
- teradataml/data/fs_input.csv +33 -0
- teradataml/data/fs_input1.csv +33 -0
- teradataml/data/genData.csv +513 -0
- teradataml/data/geodataframe_example.json +40 -0
- teradataml/data/glass_types.csv +215 -0
- teradataml/data/glm_admissions_model.csv +12 -0
- teradataml/data/glm_example.json +56 -0
- teradataml/data/glml1l2_example.json +28 -0
- teradataml/data/glml1l2predict_example.json +54 -0
- teradataml/data/glmpredict_example.json +54 -0
- teradataml/data/gq_t1.csv +21 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/hconvolve_complex_right.csv +5 -0
- teradataml/data/hconvolve_complex_rightmulti.csv +5 -0
- teradataml/data/histogram_example.json +12 -0
- teradataml/data/hmmdecoder_example.json +79 -0
- teradataml/data/hmmevaluator_example.json +25 -0
- teradataml/data/hmmsupervised_example.json +10 -0
- teradataml/data/hmmunsupervised_example.json +8 -0
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/house_values.csv +12 -0
- teradataml/data/house_values2.csv +13 -0
- teradataml/data/housing_cat.csv +7 -0
- teradataml/data/housing_data.csv +9 -0
- teradataml/data/housing_test.csv +47 -0
- teradataml/data/housing_test_binary.csv +47 -0
- teradataml/data/housing_train.csv +493 -0
- teradataml/data/housing_train_attribute.csv +5 -0
- teradataml/data/housing_train_binary.csv +437 -0
- teradataml/data/housing_train_parameter.csv +2 -0
- teradataml/data/housing_train_response.csv +493 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/ibm_stock.csv +370 -0
- teradataml/data/ibm_stock1.csv +370 -0
- teradataml/data/identitymatch_example.json +22 -0
- teradataml/data/idf_table.csv +4 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/impressions.csv +101 -0
- teradataml/data/inflation.csv +21 -0
- teradataml/data/initial.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/insect_sprays.csv +13 -0
- teradataml/data/insurance.csv +1339 -0
- teradataml/data/interpolator_example.json +13 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/iris_altinput.csv +481 -0
- teradataml/data/iris_attribute_output.csv +8 -0
- teradataml/data/iris_attribute_test.csv +121 -0
- teradataml/data/iris_attribute_train.csv +481 -0
- teradataml/data/iris_category_expect_predict.csv +31 -0
- teradataml/data/iris_data.csv +151 -0
- teradataml/data/iris_input.csv +151 -0
- teradataml/data/iris_response_train.csv +121 -0
- teradataml/data/iris_test.csv +31 -0
- teradataml/data/iris_train.csv +121 -0
- teradataml/data/join_table1.csv +4 -0
- teradataml/data/join_table2.csv +4 -0
- teradataml/data/jsons/anly_function_name.json +7 -0
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/byom/dataikupredict.json +148 -0
- teradataml/data/jsons/byom/datarobotpredict.json +147 -0
- teradataml/data/jsons/byom/h2opredict.json +195 -0
- teradataml/data/jsons/byom/onnxembeddings.json +267 -0
- teradataml/data/jsons/byom/onnxpredict.json +187 -0
- teradataml/data/jsons/byom/pmmlpredict.json +147 -0
- teradataml/data/jsons/paired_functions.json +450 -0
- teradataml/data/jsons/sqle/16.20/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/16.20/Attribution.json +249 -0
- teradataml/data/jsons/sqle/16.20/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/16.20/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/16.20/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/16.20/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/16.20/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/16.20/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/16.20/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/16.20/Pack.json +98 -0
- teradataml/data/jsons/sqle/16.20/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/16.20/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/16.20/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/16.20/Unpack.json +166 -0
- teradataml/data/jsons/sqle/16.20/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.00/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.00/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.00/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/17.00/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/17.00/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/17.00/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.00/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.00/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/17.00/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/17.00/Pack.json +98 -0
- teradataml/data/jsons/sqle/17.00/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/17.00/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.00/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.00/Unpack.json +166 -0
- teradataml/data/jsons/sqle/17.00/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.05/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.05/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.05/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/17.05/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/17.05/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/17.05/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.05/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.05/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/17.05/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/17.05/Pack.json +98 -0
- teradataml/data/jsons/sqle/17.05/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/17.05/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.05/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.05/Unpack.json +166 -0
- teradataml/data/jsons/sqle/17.05/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.10/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.10/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.10/DecisionForestPredict.json +185 -0
- teradataml/data/jsons/sqle/17.10/DecisionTreePredict.json +172 -0
- teradataml/data/jsons/sqle/17.10/GLMPredict.json +151 -0
- teradataml/data/jsons/sqle/17.10/MovingAverage.json +368 -0
- teradataml/data/jsons/sqle/17.10/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.10/NaiveBayesPredict.json +149 -0
- teradataml/data/jsons/sqle/17.10/NaiveBayesTextClassifierPredict.json +288 -0
- teradataml/data/jsons/sqle/17.10/Pack.json +133 -0
- teradataml/data/jsons/sqle/17.10/SVMSparsePredict.json +193 -0
- teradataml/data/jsons/sqle/17.10/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.10/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.10/TD_BinCodeFit.json +239 -0
- teradataml/data/jsons/sqle/17.10/TD_BinCodeTransform.json +70 -0
- teradataml/data/jsons/sqle/17.10/TD_CategoricalSummary.json +54 -0
- teradataml/data/jsons/sqle/17.10/TD_Chisq.json +68 -0
- teradataml/data/jsons/sqle/17.10/TD_ColumnSummary.json +54 -0
- teradataml/data/jsons/sqle/17.10/TD_ConvertTo.json +69 -0
- teradataml/data/jsons/sqle/17.10/TD_FTest.json +187 -0
- teradataml/data/jsons/sqle/17.10/TD_FillRowID.json +52 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionFit.json +46 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionTransform.json +72 -0
- teradataml/data/jsons/sqle/17.10/TD_GetRowsWithMissingValues.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_GetRowsWithoutMissingValues.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_Histogram.json +133 -0
- teradataml/data/jsons/sqle/17.10/TD_NumApply.json +147 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingFit.json +183 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingTransform.json +66 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterFit.json +197 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterTransform.json +48 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesFit.json +114 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesTransform.json +72 -0
- teradataml/data/jsons/sqle/17.10/TD_QQNorm.json +112 -0
- teradataml/data/jsons/sqle/17.10/TD_RoundColumns.json +93 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeFit.json +128 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleFit.json +157 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleTransform.json +71 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeFit.json +148 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeTransform.json +48 -0
- teradataml/data/jsons/sqle/17.10/TD_StrApply.json +240 -0
- teradataml/data/jsons/sqle/17.10/TD_UnivariateStatistics.json +119 -0
- teradataml/data/jsons/sqle/17.10/TD_WhichMax.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_WhichMin.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_ZTest.json +171 -0
- teradataml/data/jsons/sqle/17.10/Unpack.json +188 -0
- teradataml/data/jsons/sqle/17.10/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.20/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.20/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.20/DecisionForestPredict.json +185 -0
- teradataml/data/jsons/sqle/17.20/DecisionTreePredict.json +172 -0
- teradataml/data/jsons/sqle/17.20/GLMPredict.json +151 -0
- teradataml/data/jsons/sqle/17.20/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.20/NaiveBayesPredict.json +149 -0
- teradataml/data/jsons/sqle/17.20/NaiveBayesTextClassifierPredict.json +287 -0
- teradataml/data/jsons/sqle/17.20/Pack.json +133 -0
- teradataml/data/jsons/sqle/17.20/SVMSparsePredict.json +192 -0
- teradataml/data/jsons/sqle/17.20/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.20/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +149 -0
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_BinCodeFit.json +239 -0
- teradataml/data/jsons/sqle/17.20/TD_BinCodeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_CategoricalSummary.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_Chisq.json +68 -0
- teradataml/data/jsons/sqle/17.20/TD_ClassificationEvaluator.json +146 -0
- teradataml/data/jsons/sqle/17.20/TD_ColumnSummary.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_ColumnTransformer.json +218 -0
- teradataml/data/jsons/sqle/17.20/TD_ConvertTo.json +92 -0
- teradataml/data/jsons/sqle/17.20/TD_DecisionForest.json +260 -0
- teradataml/data/jsons/sqle/17.20/TD_DecisionForestPredict.json +139 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +269 -0
- teradataml/data/jsons/sqle/17.20/TD_FillRowID.json +52 -0
- teradataml/data/jsons/sqle/17.20/TD_FunctionFit.json +46 -0
- teradataml/data/jsons/sqle/17.20/TD_FunctionTransform.json +72 -0
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +507 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +168 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPerSegment.json +411 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPredictPerSegment.json +146 -0
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +93 -0
- teradataml/data/jsons/sqle/17.20/TD_GetRowsWithMissingValues.json +76 -0
- teradataml/data/jsons/sqle/17.20/TD_GetRowsWithoutMissingValues.json +76 -0
- teradataml/data/jsons/sqle/17.20/TD_Histogram.json +152 -0
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +232 -0
- teradataml/data/jsons/sqle/17.20/TD_KMeansPredict.json +87 -0
- teradataml/data/jsons/sqle/17.20/TD_KNN.json +262 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesTextClassifierTrainer.json +137 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +102 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_NumApply.json +147 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +316 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVMPredict.json +124 -0
- teradataml/data/jsons/sqle/17.20/TD_OneHotEncodingFit.json +271 -0
- teradataml/data/jsons/sqle/17.20/TD_OneHotEncodingTransform.json +65 -0
- teradataml/data/jsons/sqle/17.20/TD_OrdinalEncodingFit.json +229 -0
- teradataml/data/jsons/sqle/17.20/TD_OrdinalEncodingTransform.json +75 -0
- teradataml/data/jsons/sqle/17.20/TD_OutlierFilterFit.json +217 -0
- teradataml/data/jsons/sqle/17.20/TD_OutlierFilterTransform.json +48 -0
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_PolynomialFeaturesFit.json +114 -0
- teradataml/data/jsons/sqle/17.20/TD_PolynomialFeaturesTransform.json +72 -0
- teradataml/data/jsons/sqle/17.20/TD_QQNorm.json +111 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +179 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionFit.json +179 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionMinComponents.json +74 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionTransform.json +74 -0
- teradataml/data/jsons/sqle/17.20/TD_RegressionEvaluator.json +138 -0
- teradataml/data/jsons/sqle/17.20/TD_RoundColumns.json +93 -0
- teradataml/data/jsons/sqle/17.20/TD_RowNormalizeFit.json +128 -0
- teradataml/data/jsons/sqle/17.20/TD_RowNormalizeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +389 -0
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +142 -0
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +310 -0
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +120 -0
- teradataml/data/jsons/sqle/17.20/TD_SentimentExtractor.json +194 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +221 -0
- teradataml/data/jsons/sqle/17.20/TD_Silhouette.json +143 -0
- teradataml/data/jsons/sqle/17.20/TD_SimpleImputeFit.json +147 -0
- teradataml/data/jsons/sqle/17.20/TD_SimpleImputeTransform.json +48 -0
- teradataml/data/jsons/sqle/17.20/TD_StrApply.json +240 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_TargetEncodingFit.json +248 -0
- teradataml/data/jsons/sqle/17.20/TD_TargetEncodingTransform.json +75 -0
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +297 -0
- teradataml/data/jsons/sqle/17.20/TD_TrainTestSplit.json +142 -0
- teradataml/data/jsons/sqle/17.20/TD_UnivariateStatistics.json +117 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_VectorDistance.json +183 -0
- teradataml/data/jsons/sqle/17.20/TD_WhichMax.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_WhichMin.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_WordEmbeddings.json +241 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +330 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +195 -0
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +247 -0
- teradataml/data/jsons/sqle/17.20/Unpack.json +188 -0
- teradataml/data/jsons/sqle/17.20/nPath.json +269 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +370 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +460 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +385 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +400 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +401 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +384 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +384 -0
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/tableoperator/17.00/read_nos.json +198 -0
- teradataml/data/jsons/tableoperator/17.05/read_nos.json +198 -0
- teradataml/data/jsons/tableoperator/17.05/write_nos.json +195 -0
- teradataml/data/jsons/tableoperator/17.10/read_nos.json +184 -0
- teradataml/data/jsons/tableoperator/17.10/write_nos.json +195 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/tableoperator/17.20/read_nos.json +183 -0
- teradataml/data/jsons/tableoperator/17.20/write_nos.json +224 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +132 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +396 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +77 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +153 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +107 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +106 -0
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +89 -0
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +104 -0
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +66 -0
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +87 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +134 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +144 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +108 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +108 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_DIFF.json +92 -0
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +114 -0
- teradataml/data/jsons/uaf/17.20/TD_DURBIN_WATSON.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_EXTRACT_RESULTS.json +39 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_GENSERIES4FORMULA.json +85 -0
- teradataml/data/jsons/uaf/17.20/TD_GENSERIES4SINUSOIDS.json +71 -0
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +139 -0
- teradataml/data/jsons/uaf/17.20/TD_HOLT_WINTERS_FORECASTER.json +313 -0
- teradataml/data/jsons/uaf/17.20/TD_IDFFT.json +58 -0
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +81 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_INPUTVALIDATOR.json +64 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +182 -0
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +103 -0
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +181 -0
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_MATRIXMULTIPLY.json +68 -0
- teradataml/data/jsons/uaf/17.20/TD_MINFO.json +67 -0
- teradataml/data/jsons/uaf/17.20/TD_MULTIVAR_REGR.json +179 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +114 -0
- teradataml/data/jsons/uaf/17.20/TD_PORTMAN.json +119 -0
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +175 -0
- teradataml/data/jsons/uaf/17.20/TD_POWERTRANSFORM.json +98 -0
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +194 -0
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +143 -0
- teradataml/data/jsons/uaf/17.20/TD_SELECTION_CRITERIA.json +90 -0
- teradataml/data/jsons/uaf/17.20/TD_SIGNIF_PERIODICITIES.json +80 -0
- teradataml/data/jsons/uaf/17.20/TD_SIGNIF_RESIDMEAN.json +68 -0
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +184 -0
- teradataml/data/jsons/uaf/17.20/TD_SINFO.json +58 -0
- teradataml/data/jsons/uaf/17.20/TD_SMOOTHMA.json +163 -0
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +112 -0
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +95 -0
- teradataml/data/jsons/uaf/17.20/TD_WHITES_GENERAL.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
- teradataml/data/kmeans_example.json +23 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/kmeans_us_arrests_data.csv +51 -0
- teradataml/data/knn_example.json +19 -0
- teradataml/data/knnrecommender_example.json +7 -0
- teradataml/data/knnrecommenderpredict_example.json +12 -0
- teradataml/data/lar_example.json +17 -0
- teradataml/data/larpredict_example.json +30 -0
- teradataml/data/lc_new_predictors.csv +5 -0
- teradataml/data/lc_new_reference.csv +9 -0
- teradataml/data/lda_example.json +9 -0
- teradataml/data/ldainference_example.json +15 -0
- teradataml/data/ldatopicsummary_example.json +9 -0
- teradataml/data/levendist_input.csv +13 -0
- teradataml/data/levenshteindistance_example.json +10 -0
- teradataml/data/linreg_example.json +10 -0
- teradataml/data/load_example_data.py +350 -0
- teradataml/data/loan_prediction.csv +295 -0
- teradataml/data/lungcancer.csv +138 -0
- teradataml/data/mappingdata.csv +12 -0
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/milk_timeseries.csv +157 -0
- teradataml/data/min_max_titanic.csv +4 -0
- teradataml/data/minhash_example.json +6 -0
- teradataml/data/ml_ratings.csv +7547 -0
- teradataml/data/ml_ratings_10.csv +2445 -0
- teradataml/data/mobile_data.csv +13 -0
- teradataml/data/model1_table.csv +5 -0
- teradataml/data/model2_table.csv +5 -0
- teradataml/data/models/License_file.txt +1 -0
- teradataml/data/models/License_file_empty.txt +0 -0
- teradataml/data/models/dataiku_iris_data_ann_thin +0 -0
- teradataml/data/models/dr_iris_rf +0 -0
- teradataml/data/models/iris_db_dt_model_sklearn.onnx +0 -0
- teradataml/data/models/iris_db_dt_model_sklearn_floattensor.onnx +0 -0
- teradataml/data/models/iris_db_glm_model.pmml +57 -0
- teradataml/data/models/iris_db_xgb_model.pmml +4471 -0
- teradataml/data/models/iris_kmeans_model +0 -0
- teradataml/data/models/iris_mojo_glm_h2o_model +0 -0
- teradataml/data/models/iris_mojo_xgb_h2o_model +0 -0
- teradataml/data/modularity_example.json +12 -0
- teradataml/data/movavg_example.json +8 -0
- teradataml/data/mtx1.csv +7 -0
- teradataml/data/mtx2.csv +13 -0
- teradataml/data/multi_model_classification.csv +401 -0
- teradataml/data/multi_model_regression.csv +401 -0
- teradataml/data/mvdfft8.csv +9 -0
- teradataml/data/naivebayes_example.json +10 -0
- teradataml/data/naivebayespredict_example.json +19 -0
- teradataml/data/naivebayestextclassifier2_example.json +7 -0
- teradataml/data/naivebayestextclassifier_example.json +8 -0
- teradataml/data/naivebayestextclassifierpredict_example.json +32 -0
- teradataml/data/name_Find_configure.csv +10 -0
- teradataml/data/namedentityfinder_example.json +14 -0
- teradataml/data/namedentityfinderevaluator_example.json +10 -0
- teradataml/data/namedentityfindertrainer_example.json +6 -0
- teradataml/data/nb_iris_input_test.csv +31 -0
- teradataml/data/nb_iris_input_train.csv +121 -0
- teradataml/data/nbp_iris_model.csv +13 -0
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_extractor_text.csv +2 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/ner_sports_test2.csv +29 -0
- teradataml/data/ner_sports_train.csv +501 -0
- teradataml/data/nerevaluator_example.json +6 -0
- teradataml/data/nerextractor_example.json +18 -0
- teradataml/data/nermem_sports_test.csv +18 -0
- teradataml/data/nermem_sports_train.csv +51 -0
- teradataml/data/nertrainer_example.json +7 -0
- teradataml/data/ngrams_example.json +7 -0
- teradataml/data/notebooks/__init__.py +0 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Aggregate Functions using SQLAlchemy.ipynb +1455 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Arithmetic Functions Using SQLAlchemy.ipynb +1993 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Bit-Byte Manipulation Functions using SQLAlchemy.ipynb +1492 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Built-in functions using SQLAlchemy.ipynb +536 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Regular Expressions Using SQLAlchemy.ipynb +570 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage String Functions Using SQLAlchemy.ipynb +2559 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Window Aggregate Functions using SQLAlchemy.ipynb +2911 -0
- teradataml/data/notebooks/sqlalchemy/Using Generic SQLAlchemy ClauseElements teradataml DataFrame assign method.ipynb +698 -0
- teradataml/data/notebooks/sqlalchemy/__init__.py +0 -0
- teradataml/data/notebooks/sqlalchemy/teradataml filtering using SQLAlchemy ClauseElements.ipynb +784 -0
- teradataml/data/npath_example.json +23 -0
- teradataml/data/ntree_example.json +14 -0
- teradataml/data/numeric_strings.csv +5 -0
- teradataml/data/numerics.csv +4 -0
- teradataml/data/ocean_buoy.csv +17 -0
- teradataml/data/ocean_buoy2.csv +17 -0
- teradataml/data/ocean_buoys.csv +28 -0
- teradataml/data/ocean_buoys2.csv +10 -0
- teradataml/data/ocean_buoys_nonpti.csv +28 -0
- teradataml/data/ocean_buoys_seq.csv +29 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +92 -0
- teradataml/data/optional_event_table.csv +4 -0
- teradataml/data/orders1.csv +11 -0
- teradataml/data/orders1_12.csv +13 -0
- teradataml/data/orders_ex.csv +4 -0
- teradataml/data/pack_example.json +9 -0
- teradataml/data/package_tracking.csv +19 -0
- teradataml/data/package_tracking_pti.csv +19 -0
- teradataml/data/pagerank_example.json +13 -0
- teradataml/data/paragraphs_input.csv +6 -0
- teradataml/data/pathanalyzer_example.json +8 -0
- teradataml/data/pathgenerator_example.json +8 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/pattern_matching_data.csv +11 -0
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/phrases.csv +7 -0
- teradataml/data/pivot_example.json +9 -0
- teradataml/data/pivot_input.csv +22 -0
- teradataml/data/playerRating.csv +31 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/postagger_example.json +7 -0
- teradataml/data/posttagger_output.csv +44 -0
- teradataml/data/production_data.csv +17 -0
- teradataml/data/production_data2.csv +7 -0
- teradataml/data/randomsample_example.json +32 -0
- teradataml/data/randomwalksample_example.json +9 -0
- teradataml/data/rank_table.csv +6 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/ref_mobile_data.csv +4 -0
- teradataml/data/ref_mobile_data_dense.csv +2 -0
- teradataml/data/ref_url.csv +17 -0
- teradataml/data/restaurant_reviews.csv +7 -0
- teradataml/data/retail_churn_table.csv +27772 -0
- teradataml/data/river_data.csv +145 -0
- teradataml/data/roc_example.json +8 -0
- teradataml/data/roc_input.csv +101 -0
- teradataml/data/rule_inputs.csv +6 -0
- teradataml/data/rule_table.csv +2 -0
- teradataml/data/sales.csv +7 -0
- teradataml/data/sales_transaction.csv +501 -0
- teradataml/data/salesdata.csv +342 -0
- teradataml/data/sample_cities.csv +3 -0
- teradataml/data/sample_shapes.csv +11 -0
- teradataml/data/sample_streets.csv +3 -0
- teradataml/data/sampling_example.json +16 -0
- teradataml/data/sax_example.json +17 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +74 -0
- teradataml/data/scale_housing.csv +11 -0
- teradataml/data/scale_housing_test.csv +6 -0
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scale_stat.csv +11 -0
- teradataml/data/scalebypartition_example.json +13 -0
- teradataml/data/scalemap_example.json +13 -0
- teradataml/data/scalesummary_example.json +12 -0
- teradataml/data/score_category.csv +101 -0
- teradataml/data/score_summary.csv +4 -0
- teradataml/data/script_example.json +10 -0
- teradataml/data/scripts/deploy_script.py +84 -0
- teradataml/data/scripts/lightgbm/dataset.template +175 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +264 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +234 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +177 -0
- teradataml/data/scripts/mapper.R +20 -0
- teradataml/data/scripts/mapper.py +16 -0
- teradataml/data/scripts/mapper_replace.py +16 -0
- teradataml/data/scripts/sklearn/__init__.py +0 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +205 -0
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +148 -0
- teradataml/data/scripts/sklearn/sklearn_function.template +144 -0
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +166 -0
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +161 -0
- teradataml/data/scripts/sklearn/sklearn_score.py +145 -0
- teradataml/data/scripts/sklearn/sklearn_transform.py +327 -0
- teradataml/data/sdk/modelops/modelops_spec.json +101737 -0
- teradataml/data/seeds.csv +10 -0
- teradataml/data/sentenceextractor_example.json +7 -0
- teradataml/data/sentiment_extract_input.csv +11 -0
- teradataml/data/sentiment_train.csv +16 -0
- teradataml/data/sentiment_word.csv +20 -0
- teradataml/data/sentiment_word_input.csv +20 -0
- teradataml/data/sentimentextractor_example.json +24 -0
- teradataml/data/sentimenttrainer_example.json +8 -0
- teradataml/data/sequence_table.csv +10 -0
- teradataml/data/seriessplitter_example.json +8 -0
- teradataml/data/sessionize_example.json +17 -0
- teradataml/data/sessionize_table.csv +116 -0
- teradataml/data/setop_test1.csv +24 -0
- teradataml/data/setop_test2.csv +22 -0
- teradataml/data/soc_nw_edges.csv +11 -0
- teradataml/data/soc_nw_vertices.csv +8 -0
- teradataml/data/souvenir_timeseries.csv +168 -0
- teradataml/data/sparse_iris_attribute.csv +5 -0
- teradataml/data/sparse_iris_test.csv +121 -0
- teradataml/data/sparse_iris_train.csv +601 -0
- teradataml/data/star1.csv +6 -0
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/state_transition.csv +5 -0
- teradataml/data/stock_data.csv +53 -0
- teradataml/data/stock_movement.csv +11 -0
- teradataml/data/stock_vol.csv +76 -0
- teradataml/data/stop_words.csv +8 -0
- teradataml/data/store_sales.csv +37 -0
- teradataml/data/stringsimilarity_example.json +8 -0
- teradataml/data/strsimilarity_input.csv +13 -0
- teradataml/data/students.csv +101 -0
- teradataml/data/svm_iris_input_test.csv +121 -0
- teradataml/data/svm_iris_input_train.csv +481 -0
- teradataml/data/svm_iris_model.csv +7 -0
- teradataml/data/svmdense_example.json +10 -0
- teradataml/data/svmdensepredict_example.json +19 -0
- teradataml/data/svmsparse_example.json +8 -0
- teradataml/data/svmsparsepredict_example.json +14 -0
- teradataml/data/svmsparsesummary_example.json +8 -0
- teradataml/data/target_mobile_data.csv +13 -0
- teradataml/data/target_mobile_data_dense.csv +5 -0
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/templatedata.csv +1201 -0
- teradataml/data/templates/open_source_ml.json +11 -0
- teradataml/data/teradata_icon.ico +0 -0
- teradataml/data/teradataml_example.json +1473 -0
- teradataml/data/test_classification.csv +101 -0
- teradataml/data/test_loan_prediction.csv +53 -0
- teradataml/data/test_pacf_12.csv +37 -0
- teradataml/data/test_prediction.csv +101 -0
- teradataml/data/test_regression.csv +101 -0
- teradataml/data/test_river2.csv +109 -0
- teradataml/data/text_inputs.csv +6 -0
- teradataml/data/textchunker_example.json +8 -0
- teradataml/data/textclassifier_example.json +7 -0
- teradataml/data/textclassifier_input.csv +7 -0
- teradataml/data/textclassifiertrainer_example.json +7 -0
- teradataml/data/textmorph_example.json +11 -0
- teradataml/data/textparser_example.json +15 -0
- teradataml/data/texttagger_example.json +12 -0
- teradataml/data/texttokenizer_example.json +7 -0
- teradataml/data/texttrainer_input.csv +11 -0
- teradataml/data/tf_example.json +7 -0
- teradataml/data/tfidf_example.json +14 -0
- teradataml/data/tfidf_input1.csv +201 -0
- teradataml/data/tfidf_train.csv +6 -0
- teradataml/data/time_table1.csv +535 -0
- teradataml/data/time_table2.csv +14 -0
- teradataml/data/timeseriesdata.csv +1601 -0
- teradataml/data/timeseriesdatasetsd4.csv +105 -0
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic.csv +892 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/token_table.csv +696 -0
- teradataml/data/train_multiclass.csv +101 -0
- teradataml/data/train_regression.csv +101 -0
- teradataml/data/train_regression_multiple_labels.csv +101 -0
- teradataml/data/train_tracking.csv +28 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/transformation_table.csv +6 -0
- teradataml/data/transformation_table_new.csv +2 -0
- teradataml/data/tv_spots.csv +16 -0
- teradataml/data/twod_climate_data.csv +117 -0
- teradataml/data/uaf_example.json +529 -0
- teradataml/data/univariatestatistics_example.json +9 -0
- teradataml/data/unpack_example.json +10 -0
- teradataml/data/unpivot_example.json +25 -0
- teradataml/data/unpivot_input.csv +8 -0
- teradataml/data/url_data.csv +10 -0
- teradataml/data/us_air_pass.csv +37 -0
- teradataml/data/us_population.csv +624 -0
- teradataml/data/us_states_shapes.csv +52 -0
- teradataml/data/varmax_example.json +18 -0
- teradataml/data/vectordistance_example.json +30 -0
- teradataml/data/ville_climatedata.csv +121 -0
- teradataml/data/ville_tempdata.csv +12 -0
- teradataml/data/ville_tempdata1.csv +12 -0
- teradataml/data/ville_temperature.csv +11 -0
- teradataml/data/waveletTable.csv +1605 -0
- teradataml/data/waveletTable2.csv +1605 -0
- teradataml/data/weightedmovavg_example.json +9 -0
- teradataml/data/wft_testing.csv +5 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/wine_data.csv +1600 -0
- teradataml/data/word_embed_input_table1.csv +6 -0
- teradataml/data/word_embed_input_table2.csv +5 -0
- teradataml/data/word_embed_model.csv +23 -0
- teradataml/data/words_input.csv +13 -0
- teradataml/data/xconvolve_complex_left.csv +6 -0
- teradataml/data/xconvolve_complex_leftmulti.csv +6 -0
- teradataml/data/xgboost_example.json +36 -0
- teradataml/data/xgboostpredict_example.json +32 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/__init__.py +0 -0
- teradataml/dataframe/copy_to.py +2446 -0
- teradataml/dataframe/data_transfer.py +2840 -0
- teradataml/dataframe/dataframe.py +20908 -0
- teradataml/dataframe/dataframe_utils.py +2114 -0
- teradataml/dataframe/fastload.py +794 -0
- teradataml/dataframe/functions.py +2110 -0
- teradataml/dataframe/indexer.py +424 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +1171 -0
- teradataml/dataframe/sql.py +10904 -0
- teradataml/dataframe/sql_function_parameters.py +440 -0
- teradataml/dataframe/sql_functions.py +652 -0
- teradataml/dataframe/sql_interfaces.py +220 -0
- teradataml/dataframe/vantage_function_types.py +675 -0
- teradataml/dataframe/window.py +694 -0
- teradataml/dbutils/__init__.py +3 -0
- teradataml/dbutils/dbutils.py +2871 -0
- teradataml/dbutils/filemgr.py +318 -0
- teradataml/gen_ai/__init__.py +2 -0
- teradataml/gen_ai/convAI.py +473 -0
- teradataml/geospatial/__init__.py +4 -0
- teradataml/geospatial/geodataframe.py +1105 -0
- teradataml/geospatial/geodataframecolumn.py +392 -0
- teradataml/geospatial/geometry_types.py +926 -0
- teradataml/hyperparameter_tuner/__init__.py +1 -0
- teradataml/hyperparameter_tuner/optimizer.py +4115 -0
- teradataml/hyperparameter_tuner/utils.py +303 -0
- teradataml/lib/__init__.py +0 -0
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/lib/libaed_0_1_ppc64le.so +0 -0
- teradataml/opensource/__init__.py +1 -0
- teradataml/opensource/_base.py +1321 -0
- teradataml/opensource/_class.py +464 -0
- teradataml/opensource/_constants.py +61 -0
- teradataml/opensource/_lightgbm.py +949 -0
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +267 -0
- teradataml/options/__init__.py +148 -0
- teradataml/options/configure.py +489 -0
- teradataml/options/display.py +187 -0
- teradataml/plot/__init__.py +3 -0
- teradataml/plot/axis.py +1427 -0
- teradataml/plot/constants.py +15 -0
- teradataml/plot/figure.py +431 -0
- teradataml/plot/plot.py +810 -0
- teradataml/plot/query_generator.py +83 -0
- teradataml/plot/subplot.py +216 -0
- teradataml/scriptmgmt/UserEnv.py +4273 -0
- teradataml/scriptmgmt/__init__.py +3 -0
- teradataml/scriptmgmt/lls_utils.py +2157 -0
- teradataml/sdk/README.md +79 -0
- teradataml/sdk/__init__.py +4 -0
- teradataml/sdk/_auth_modes.py +422 -0
- teradataml/sdk/_func_params.py +487 -0
- teradataml/sdk/_json_parser.py +453 -0
- teradataml/sdk/_openapi_spec_constants.py +249 -0
- teradataml/sdk/_utils.py +236 -0
- teradataml/sdk/api_client.py +900 -0
- teradataml/sdk/constants.py +62 -0
- teradataml/sdk/modelops/__init__.py +98 -0
- teradataml/sdk/modelops/_client.py +409 -0
- teradataml/sdk/modelops/_constants.py +304 -0
- teradataml/sdk/modelops/models.py +2308 -0
- teradataml/sdk/spinner.py +107 -0
- teradataml/series/__init__.py +0 -0
- teradataml/series/series.py +537 -0
- teradataml/series/series_utils.py +71 -0
- teradataml/store/__init__.py +12 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +658 -0
- teradataml/store/feature_store/feature_store.py +4814 -0
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +7330 -0
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/Apply.py +979 -0
- teradataml/table_operators/Script.py +1739 -0
- teradataml/table_operators/TableOperator.py +1343 -0
- teradataml/table_operators/__init__.py +2 -0
- teradataml/table_operators/apply_query_generator.py +262 -0
- teradataml/table_operators/query_generator.py +493 -0
- teradataml/table_operators/table_operator_query_generator.py +462 -0
- teradataml/table_operators/table_operator_util.py +726 -0
- teradataml/table_operators/templates/dataframe_apply.template +184 -0
- teradataml/table_operators/templates/dataframe_map.template +176 -0
- teradataml/table_operators/templates/dataframe_register.template +73 -0
- teradataml/table_operators/templates/dataframe_udf.template +67 -0
- teradataml/table_operators/templates/script_executor.template +170 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +53 -0
- teradataml/utils/__init__.py +0 -0
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +943 -0
- teradataml/utils/internal_buffer.py +122 -0
- teradataml/utils/print_versions.py +206 -0
- teradataml/utils/utils.py +451 -0
- teradataml/utils/validators.py +3305 -0
- teradataml-20.0.0.8.dist-info/METADATA +2804 -0
- teradataml-20.0.0.8.dist-info/RECORD +1208 -0
- teradataml-20.0.0.8.dist-info/WHEEL +5 -0
- teradataml-20.0.0.8.dist-info/top_level.txt +1 -0
- teradataml-20.0.0.8.dist-info/zip-safe +1 -0
teradataml/hyperparameter_tuner/optimizer.py
@@ -0,0 +1,4115 @@
# ##################################################################
#
# Copyright 2025 Teradata. All rights reserved.
# TERADATA CONFIDENTIAL AND TRADE SECRET
#
# Primary Owner: Kesavaragavan B (kesavaragavan.b@Teradata.com)
# Secondary Owner: Pankaj Purandare (PankajVinod.Purandare@teradata.com),
#                  Pradeep Garre (pradeep.garre@teradata.com)
#
# This file implements the Hyperparameter Tuning feature, which is used for
# model optimization. Optimizer contains the following algorithms
# for hyperparameter tuning: GridSearch and RandomSearch.
#
# ##################################################################

import numpy as np
import pandas as pd
import random
import time
import threading
from itertools import product
from collections import defaultdict
from teradataml import DataFrame, valib, TeradataMlException
from teradataml.common.messages import Messages, MessageCodes
from teradataml.hyperparameter_tuner.utils import _ProgressBar
from teradataml.utils.utils import _AsyncDBExecutor
from teradataml.utils.validators import _Validators
from teradataml.options.configure import configure
from teradataml.common.constants import TeradataConstants


class _BaseSearch:
    """Base class for hyperparameter optimization."""

    def __init__(self, func, params):
        """
        Constructor for _BaseSearch.
        PARAMETERS:
            func:
                Required Argument.
                Specifies a teradataml analytic function.
                Types:
                    teradataml Analytic Functions
                        * Advanced analytic functions
                        * UAF
                        * VAL
                    Refer to the display_analytic_functions()
                    function for the list of functions.

            params:
                Optional Argument.
                Specifies the parameter(s) of a teradataml function.
                Types: dict

        RAISES:
            TeradataMlException, TypeError, ValueError

        RETURNS:
            None

        EXAMPLES:

            >>> # Let's initialize parameters for BaseSearch.
            >>> func_params = {"data" : antiselect_input,
                               "exclude" : (['rowids','orderdate'], ['orderdate'])}

            >>> # Create an instance of _BaseSearch.
            >>> bs_obj = _BaseSearch(func=Antiselect, params=func_params)
        """

        # Argument validation.
        # Validate argument types.
        awu_matrix = []
        awu_matrix.append(["params", params, True, dict, True])
        _Validators._validate_function_arguments(awu_matrix)

        # Model trainer functions that support evaluation.
        self._SQLE_TRAINABLE_FUNCS = {"DecisionForest", "GLM", "GLMPerSegment",
                                      "KMeans", "KNN", "OneClassSVM", "SVM", "XGBoost",
                                      "NaiveBayesTextClassifierTrainer"}

        # Data passed in the fit method is sampled, and internally the test dataset
        # is passed with the following argument name for predictions and evaluation.
        self._TRAINABLE_FUNCS_DATA_MAPPER = {"DecisionForest": "newdata", "GLM": "newdata",
                                             "GLMPerSegment": "newdata", "KMeans": "data",
                                             "KNN": "test_data", "OneClassSVM": "newdata",
                                             "SVM": "newdata", "XGBoost": "newdata",
                                             "NaiveBayesTextClassifierTrainer": "newdata",
                                             "DecisionTree": "data",
                                             "LinReg": "data", "LogReg": "data", "PCA": "data",
                                             "LinearRegression": "data", "Lasso": "data",
                                             "Ridge": "data", "ARDRegression": "data",
                                             "BayesianRidge": "data", "TweedieRegressor": "data",
                                             "TheilSenRegressor": "data", "SGDRegressor": "data",
                                             "RidgeCV": "data", "RANSACRegressor": "data",
                                             "PoissonRegressor": "data", "PassiveAggressiveRegressor": "data",
                                             "OrthogonalMatchingPursuitCV": "data", "OrthogonalMatchingPursuit": "data",
                                             "MultiTaskLassoCV": "data", "MultiTaskLasso": "data",
                                             "MultiTaskElasticNetCV": "data", "MultiTaskElasticNet": "data",
                                             "LassoLarsIC": "data", "LassoLarsCV": "data", "LassoLars": "data",
                                             "LassoCV": "data", "LarsCV": "data", "Lars": "data",
                                             "HuberRegressor": "data", "GammaRegressor": "data",
                                             "ElasticNetCV": "data", "ElasticNet": "data",
                                             "LogisticRegression": "data", "RidgeClassifier": "data",
                                             "RidgeClassifierCV": "data", "SGDClassifier": "data",
                                             "PassiveAggressiveClassifier": "data", "Perceptron": "data",
                                             "LogisticRegressionCV": "data"}

        self._UAF_TRAINABLE_FUNCS = {"ArimaEstimate", "LinearRegr", "MAMean",
                                     "MultivarRegr", "SimpleExp"}
        self._VAL_TRAINABLE_FUNCS = {"DecisionTree", "KMeans", "LinReg", "LogReg", "PCA"}

        # Unsupervised model trainer functions. These models are suitable
        # for prediction rather than evaluation.
        self.__US_TRAINABLE_FUNCS = {"KMeans", "OneClassSVM", "PCA"}

        # Evaluation approach for model evaluable functions, where "True" means
        # the higher the score, the better, and vice versa.
        self.__func_comparator = {'MAE': False,
                                  'MSE': False,
                                  'MSLE': False,
                                  'MAPE': False,
                                  'RMSE': False,
                                  'RMSLE': False,
                                  'ME': False,
                                  'R2': True,
                                  'EV': True,
                                  'MPE': False,
                                  'MPD': False,
                                  'MGD': False,
                                  'ACCURACY': True,
                                  'MICRO-PRECISION': True,
                                  'MICRO-RECALL': True,
                                  'MICRO-F1': True,
                                  'MACRO-PRECISION': True,
                                  'MACRO-RECALL': True,
                                  'MACRO-F1': True,
                                  'WEIGHTED-PRECISION': True,
                                  'WEIGHTED-RECALL': True,
                                  'WEIGHTED-F1': True,
                                  'SILHOUETTE': True,
                                  'CALINSKI': True,
                                  'DAVIES': True}
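
        # Illustration (hypothetical scores): for an error metric such as
        # 'MAE' (mapped to False) a score of 2.61 beats a best score of 2.66,
        # while for a goodness metric such as 'R2' (mapped to True) a score
        # of 0.82 beats a best score of 0.79.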

        # OpenSource ML function comparator (excluding MPD, MGD, MTD, RMSE, RMSLE).
        self.__osml_func_comparator = {k: v for k, v in self.__func_comparator.items()
                                       if k not in ['MPD', 'MGD', 'MTD', 'RMSE', 'RMSLE']}

        # Linear model categorization lists for sklearn models.
        self._LINEAR_REGRESSION_MODELS = {
            "ARDRegression", "BayesianRidge", "TweedieRegressor", "TheilSenRegressor",
            "SGDRegressor", "RidgeCV", "Ridge", "RANSACRegressor", "PoissonRegressor",
            "PassiveAggressiveRegressor", "OrthogonalMatchingPursuitCV", "OrthogonalMatchingPursuit",
            "MultiTaskLassoCV", "MultiTaskLasso", "MultiTaskElasticNetCV", "MultiTaskElasticNet",
            "LinearRegression", "LassoLarsIC", "LassoLarsCV", "LassoLars", "LassoCV",
            "Lasso", "LarsCV", "Lars", "HuberRegressor", "GammaRegressor",
            "ElasticNetCV", "ElasticNet"
        }

        self._LINEAR_CLASSIFICATION_MODELS = {
            "SGDClassifier", "RidgeClassifierCV", "RidgeClassifier", "Perceptron",
            "PassiveAggressiveClassifier", "LogisticRegressionCV", "LogisticRegression"
        }

        self._CLUSTERING_MODELS = {
            "KMeans", "GaussianMixture"
        }
        self.__func = func
        self.__params = params
        # "self.__best_model" contains the best model.
        self.__best_model = None
        # "self.__evaluation_metric" contains the evaluation metric considered
        # for evaluation.
        self.__evaluation_metric = None
        # "self.__eval_params" contains the evaluation parameters to be used
        # for trained model evaluation.
        self.__eval_params = None
        # "self.__early_stop" contains the expected evaluation value considered
        # for evaluation.
        self.__early_stop = None
        # "self._parameter_grid" contains parameter combinations.
        self._parameter_grid = None
        # "self.__best_score_" contains the best model score.
        self.__best_score_ = None
        # "self.__best_model_id" contains the best model ID.
        self.__best_model_id = None
        # "self.__best_params_" contains the best model parameters.
        self.__best_params_ = None
        # "__model_stats" contains "model_id" and the corresponding evaluation
        # metrics as a DataFrame.
        self.__model_stats = None
        # "self.__models" contains "model_id", "params", "accuracy", and "status",
        # which are stored as a DataFrame.
        self.__models = None
        # Complete HPT execution results, including "model_stats" information,
        # are recorded here.
        self.__model_eval_records = list()
        # "self.__trained_models" is an internal attribute to keep track of
        # "model_id" and the associated function objects.
        self.__trained_models = dict()
        # "__train_data" contains training data for model trainer and unsupervised
        # model trainer functions.
        self.__train_data = None
        # "__test_data" contains testing data for model trainer functions.
        self.__test_data = None
        # The default model is used for predict and evaluate after HPT execution.
        self.__default_model = None
        # 'self.__is_finite' indicates whether the chosen '__evaluation_metric'
        # contains 'NaN', '-inf' or 'inf' values.
        self.__is_finite = True
        # '__is_fit_called' specifies whether the fit method was called by the user.
        # This helps the 'is_running' method to identify the model training state.
        self.__is_fit_called = False
        # "__model_trainer_input_data" contains the model trainer data when the
        # input data is passed along with params.
        self.__model_trainer_input_data = None
        # Constant name for the data identifier.
        self.__DATA_ID = "data_id"
        # '__progress_bar' holds the progress bar object when verbose is set.
        self.__progress_bar = None
        # '__model_err_records' holds error messages of failed models.
        self.__model_err_records = dict()
        # '__parallel_stop_event' is used to stop threads in parallel execution.
        self.__parallel_stop_event = None


        # Set the function feature type and supported functionality.
        self.__is_sqle_function = False
        self.__is_uaf_function = False
        self.__is_val_function = True if "valib" in str(self.__func.__module__) \
                                 else False
        self.__is_opensource_model = False
        self.__is_clustering_model = False
        self.__is_regression_model = False
        self.__is_classification_model = False
        self.model_id_counter = {}

        # Import the sklearn wrapper class for proper type checking.
        from teradataml.opensource._sklearn import _SkLearnObjectWrapper

        if hasattr(func, "modelObj") and isinstance(func, _SkLearnObjectWrapper):
            self.__is_opensource_model = True
            self.__is_trainable = True
            self.__is_evaluatable = True
            self.__is_predictable = True

            # Set the function name and class.
            self.__func_name = func.modelObj.__class__.__name__  # e.g., 'KMeans'
            self.__func = func.__class__
            if self.__func_name in self._CLUSTERING_MODELS:
                self.__is_clustering_model = True
                self.__is_evaluatable = False
            elif self.__func_name in self._LINEAR_REGRESSION_MODELS:
                self.__is_regression_model = True
            elif self.__func_name in self._LINEAR_CLASSIFICATION_MODELS:
                self.__is_classification_model = True
        else:
            self.__func_name = func._tdml_valib_name if "_VALIB" in str(func.__class__) \
                               else func.__name__
            if self.__func_name in self._VAL_TRAINABLE_FUNCS and self.__is_val_function:
                # TODO: Enable this feature once merge model supports VAL functions.
                # This case is for VAL model trainer functions.
                self.__is_trainable = self.__is_evaluatable = \
                    self.__is_predictable = False
            elif self.__func_name in self._UAF_TRAINABLE_FUNCS:
                # TODO: Enable this feature once merge model supports UAF functions.
                # This case is for UAF model trainer functions.
                self.__is_uaf_function = self.__is_trainable = \
                    self.__is_evaluatable = False
                self.__is_predictable = False
            elif self.__func_name in self._SQLE_TRAINABLE_FUNCS:
                # This case is for SQLE model trainer functions.
                self.__is_sqle_function = self.__is_trainable = \
                    self.__is_evaluatable = self.__is_predictable = True
            else:
                # This case is for non-model trainer functions.
                self.__is_trainable = self.__is_evaluatable = \
                    self.__is_predictable = False

        self.__is_evaluatable = False if not self.__is_evaluatable or \
                                self.__func_name in self.__US_TRAINABLE_FUNCS else \
                                True
        # Set the train routine based on model type.
        # The non-model trainer routine is used for unsupervised model function training.
        self._execute_fit = self.__model_trainer_routine if self.__is_trainable \
                            and (self.__is_evaluatable or self.__is_clustering_model) else \
                            self.__non_model_trainer_routine
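
        # Illustration (hypothetical inputs): for func=XGBoost the SQLE branch
        # above marks the search trainable, evaluatable and predictable, so
        # fit() dispatches to __model_trainer_routine; for a non-model trainer
        # function such as Antiselect all three flags stay False and fit()
        # dispatches to __non_model_trainer_routine.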

        # Utility lambda functions.
        # '_is_best_metrics' checks whether the current trained model's
        # evaluation value is better than the existing "self.__best_model" score.
        self._is_best_metrics = lambda curr_score: curr_score > self.__best_score_ \
                                if self.__func_comparator[self.__evaluation_metric] \
                                else curr_score < self.__best_score_
        # '_is_early_stoppable' checks whether HPT execution reached the
        # "self.__early_stop" value.
        self._is_early_stoppable = lambda: self.__best_score_ >= self.__early_stop \
                                   if self.__func_comparator[self.__evaluation_metric] \
                                   else self.__best_score_ <= self.__early_stop
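
        # Illustration (hypothetical values): with __evaluation_metric='MAE'
        # and __best_score_=2.66, _is_best_metrics(2.61) is True because MAE
        # is minimized; with __early_stop=2.65, _is_early_stoppable() becomes
        # True once __best_score_ drops to 2.61 or lower.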

        # '_is_time_stoppable' checks whether HPT execution reached the
        # self.__timeout value.
        self._is_time_stoppable = lambda: True if time.time() - self.__start_time >= self.__timeout else False

        # Special case comparator for "MPE" metrics.
        # When the "curr_score" argument is 'None', the lambda function checks
        # for '_is_early_stoppable'. Otherwise, it checks for '_is_best_metrics'.
        self._spl_abs_comparator = lambda curr_score=None: \
                                   abs(curr_score) < abs(self.__best_score_) \
                                   if curr_score is not None else \
                                   abs(self.__best_score_) <= abs(self.__early_stop)

        # '_generate_model_name' is used to create a new model name
        # for every iteration.
        self._generate_model_name = lambda iter: "{}_{}".format(
                                    self.__func_name.upper(), str(iter))

        # '__is_model_training_completed' checks model execution progress based
        # on model evaluation records. Note that, despite its name, the lambda
        # returns True while some models have not yet produced evaluation
        # records (that is, training is still in progress), and False once all
        # models are executed and the evaluation reports are updated;
        # 'is_running' relies on exactly this behavior.
        self.__is_model_training_completed = lambda: self.__is_fit_called and \
                                             len(self.__model_eval_records) < \
                                             len(self._parameter_grid)

        # '_generate_dataframe_name' is used to create a new dataframe ID
        # for a given iteration.
        self._generate_dataframe_name = lambda df_name, iter: "{}_{}".format(df_name, str(iter))

        # '_get_model_trainer_train_data_arg' returns the model trainer
        # function's train argument name.
        self._get_model_trainer_train_data_arg = lambda: "train_data" if \
                                                 self.__func_name == "KNN" else "data"

        # '_get_predict_column' is used to generate the prediction column name.
        self._get_predict_column = lambda: f"{self.__func_name.lower()}_predict_1"
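
        # Illustration (hypothetical func_name 'SVM'): the helpers above yield
        # model names 'SVM_0', 'SVM_1', ..., dataframe IDs such as 'DF_0' from
        # _generate_dataframe_name("DF", 0), and the prediction column name
        # 'svm_predict_1'.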

        if self.__is_trainable and "data" in self.__params:
            data = self.__params.pop("data")
            self.__validate_model_trainer_input_data_argument(data, False)
            self.__model_trainer_input_data = data


    def set_parameter_grid(self):
        """
        DESCRIPTION:
            Set the value of the attribute _parameter_grid.

        RETURNS:
            None

        EXAMPLES:
            >>> self.set_parameter_grid()
        """
        self._parameter_grid = self.__populate_parameter_grid()

    def get_parameter_grid(self):
        """
        DESCRIPTION:
            Returns the value of the attribute _parameter_grid.

        RETURNS:
            dict

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Retrieve the parameter grid.
            >>> optimizer_obj.get_parameter_grid()
            [{'param': {'input_columns': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
                                          'Population', 'AveOccup', 'Latitude', 'Longitude'],
                        'response_column': 'MedHouseVal', 'model_type': 'regression',
                        'batch_size': 75, 'iter_max': 100, 'lambda1': 0.1, 'alpha': 0.5,
                        'iter_num_no_change': 60, 'tolerance': 0.01, 'intercept': False,
                        'learning_rate': 'INVTIME', 'initial_data': 0.5, 'decay_rate': 0.5,
                        'momentum': 0.6, 'nesterov': True, 'local_sgd_iterations': 1,
                        'data': '"ALICE"."ml__select__1696593660430612"'},
              'data_id': 'DF_0'},
             {'param': {'input_columns': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
                                          'Population', 'AveOccup', 'Latitude', 'Longitude'],
                        'response_column': 'MedHouseVal', 'model_type': 'regression',
                        'batch_size': 75, 'iter_max': 100, 'lambda1': 0.1, 'alpha': 0.5,
                        'iter_num_no_change': 60, 'tolerance': 0.01, 'intercept': False,
                        'learning_rate': 'INVTIME', 'initial_data': 0.5, 'decay_rate': 0.5,
                        'momentum': 0.6, 'nesterov': True, 'local_sgd_iterations': 1,
                        'data': '"ALICE"."ml__select__1696593660430612"'},
              'data_id': 'DF_1'}]
        """
        return self._parameter_grid

    @property
    def models(self):
        """
        DESCRIPTION:
            Returns the generated models' metadata.

        RETURNS:
            pandas DataFrame

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Retrieve the models' metadata.
            >>> optimizer_obj.models
              MODEL_ID DATA_ID                                         PARAMETERS STATUS       MAE
            0    SVM_3    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772
            1    SVM_0    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.660815
            2    SVM_1    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.660815
            3    SVM_2    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772
            4    SVM_4    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772
            5    SVM_5    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772

        """
        # All the models are stored in the '__model_eval_records' records. Since
        # "models" returns a pandas DataFrame, one has to construct the pandas
        # DataFrame '__models' from '__model_eval_records'. This construction
        # should be done only when it is appropriate, i.e., only when a new model
        # is pushed to '__model_eval_records' should the pandas DataFrame for
        # models be constructed. Otherwise, store it and reuse it. Check whether
        # a new model record was generated by comparing the number of model
        # records present in '__model_eval_records' with the existing number of
        # records in '__models'.
        _is_models_updated = self.__models is None or \
                             len(self.__model_eval_records) != self.__models.shape[0]

        # Update '__models' when the model records are updated.
        if _is_models_updated:
            # Set the '__models' variable with the models' metadata.

            # Set the columns based on the teradataml analytics function type.
            _df_cols = ["MODEL_ID", "PARAMETERS", "STATUS"]

            if self.__is_trainable:
                _df_cols.insert(1, self.__DATA_ID.upper())

            # Include evaluation metrics for model trainer functions.
            if self.__evaluation_metric:
                _df_cols.append(self.__evaluation_metric)

            # Replace the teradataml DataFrame with its 'table_name'.
            # Convert "PARAMETERS" from dictionary to string datatype.
            for index, records in enumerate(self.__model_eval_records):
                # Check whether the "PARAMETERS" record contains a dictionary parameter.
                if isinstance(records["PARAMETERS"], dict):
                    # Replace the dataframe with the table name and typecast the
                    # model training parameters to string.
                    for key, value in records["PARAMETERS"].items():
                        if isinstance(value, DataFrame):
                            records["PARAMETERS"][key] = \
                                value._table_name
                    records["PARAMETERS"] = str(records["PARAMETERS"])

            # Create a pandas dataframe for the recorded evaluation report.
            self.__models = pd.DataFrame(self.__model_eval_records,
                                         columns=_df_cols)

        return self.__models

    @property
    def best_score_(self):
        """
        DESCRIPTION:
            Returns the best score of the model out of all generated models.
            Note:
                "best_score_" is not supported for non-model trainer functions.

        RETURNS:
            String representing the best score.

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Retrieve the best score.
            >>> optimizer_obj.best_score_
            2.060386
        """
        return self.__best_score_

    @property
    def best_model_id(self):
        """
        DESCRIPTION:
            Returns the model id of the model with the best score.
            Note:
                "best_model_id" is not supported for non-model trainer functions.

        RETURNS:
            String representing the best model id.

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Retrieve the best model id.
            >>> optimizer_obj.best_model_id
            'SVM_2'
        """
        return self.__best_model_id

    @property
    def best_params_(self):
        """
        DESCRIPTION:
            Returns the parameters used for the model with the best score.
            Note:
                "best_params_" is not supported for non-model trainer functions.

        RETURNS:
            dict

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Retrieve the best parameters.
            >>> optimizer_obj.best_params_
            {'input_columns': ['MedInc', 'HouseAge', 'AveRooms',
             'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'],
             'response_column': 'MedHouseVal', 'model_type': 'regression',
             'batch_size': 50, 'iter_max': 301, 'lambda1': 0.1, 'alpha': 0.5,
             'iter_num_no_change': 60, 'tolerance': 0.01, 'intercept': False,
             'learning_rate': 'INVTIME', 'initial_data': 0.5, 'decay_rate': 0.5,
             'momentum': 0.6, 'nesterov': True, 'local_sgd_iterations': 1,
             'data': '"ALICE"."ml__select__1696595493985650"'}
        """
        return self.__best_params_

    @property
    def best_model(self):
        """
        DESCRIPTION:
            Returns the best trained model obtained from hyperparameter tuning.
            Note:
                "best_model" is not supported for non-model trainer functions.

        RETURNS:
            Object of the trained model.

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Retrieve the best model.
            >>> optimizer_obj.best_model
            ############ output_data Output ############

               iterNum      loss       eta  bias
            0        3  2.060386  0.028868   0.0
            1        5  2.055509  0.022361   0.0
            2        6  2.051982  0.020412   0.0
            3        7  2.048387  0.018898   0.0
            4        9  2.041521  0.016667   0.0
            5       10  2.038314  0.015811   0.0
            6        8  2.044882  0.017678   0.0
            7        4  2.058757  0.025000   0.0
            8        2  2.065932  0.035355   0.0
            9        1  1.780877  0.050000   0.0


            ############ result Output ############

                                  predictor   estimate       value
            attribute
             7                     Latitude   0.155095        None
            -9      Learning Rate (Initial)   0.050000        None
            -17                OneClass SVM        NaN       FALSE
            -14                     Epsilon   0.100000        None
             5                   Population   0.000000        None
            -12                    Nesterov        NaN        TRUE
            -5                          BIC  73.297397        None
            -7                        Alpha   0.500000  Elasticnet
            -3       Number of Observations  55.000000        None
             0                  (Intercept)   0.000000        None

        """
        return self.__best_model

    @property
    def best_sampled_data_(self):
        """
        DESCRIPTION:
            Returns the best sampled data used for training the best model.
            Note:
                "best_sampled_data_" is not supported for non-model trainer functions.

        RETURNS:
            list of DataFrames.

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Retrieve the best sampled data.
            >>> optimizer_obj.best_sampled_data_
            [{'data':      id  MedHouseVal    MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  Longitude
            0   5233        0.955 -0.895906  0.680467 -0.387272  -0.202806   -0.125930  2.130214 -0.754303   0.653775
            1  10661        3.839  2.724825 -1.258313  0.876263  -1.142947   -0.751004 -0.187396 -0.878298   0.852744
            2  10966        1.896  0.057849  0.343287 -0.141762  -0.664624   -0.095545  0.588981 -0.829586   0.815727
            3   3687        1.741 -0.383816 -1.679787 -0.849458   0.108000    0.718354  1.083500 -0.630308   0.593621
            4   7114        2.187 -0.245392  0.258993  0.225092  -0.205781   -0.171508 -0.035650 -0.763160   0.755573
            5   5300        3.500 -0.955800 -1.005429 -1.548811  -0.130818    2.630473 -0.601956 -0.696734   0.556604
            6    686        1.578 -0.152084 -0.078186 -0.625426  -0.513581   -0.685892 -0.533101  0.906345  -1.141575
            7   9454        0.603 -1.109609 -0.499660  0.355748   0.379188   -0.364674 -0.356799  1.827451  -1.655193
            8   5202        1.000 -0.307539  1.101940 -0.379623  -0.570271   -0.141123  0.595366 -0.754303   0.635266
            9   5769        2.568 -0.413546  0.343287 -0.922324  -0.028824    1.165456  0.031374 -0.656879   0.626012},
            {'newdata':      id  MedHouseVal    MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  Longitude
            0   1754        1.651 -0.026315  0.596172  0.454207  -0.027273    0.068320 -0.082765  1.017055  -1.234118
            1   3593        2.676  1.241775  0.090403  1.024283  -0.367626   -0.045626  0.252048 -0.621452   0.542722
            2   7581        1.334 -0.714880 -1.258313 -0.604140  -0.259612    3.058041  0.857406 -0.776445   0.658402
            3   8783        2.500 -0.170156  0.596172  0.163717   0.398242   -0.668529 -0.728130 -0.820729   0.621385
            4   5611        1.587 -0.712366 -0.415366 -1.275716   0.012960    0.860515  0.764870 -0.820729   0.639893
            5    244        1.117 -0.605796  1.101940 -0.160367   0.426668    1.022209  1.041018  0.946201  -1.187846}]
        """
        return self.__sampled_df_mapper[self.__best_data_id]

    @property
    def best_data_id(self):
        """
        DESCRIPTION:
            Returns the "data_id" of the sampled data used for training the best model.
            Note:
                "best_data_id" is not supported for non-model trainer functions.

        RETURNS:
            String representing the best "data_id".

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Retrieve the best data id.
            >>> optimizer_obj.best_data_id
            DF_0
        """
        return self.__best_data_id

    @property
    def model_stats(self):
        """
        DESCRIPTION:
            Returns the evaluation statistics of the trained models.

        RETURNS:
            pandas DataFrame.

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Retrieve the model stats.
            >>> optimizer_obj.model_stats
              MODEL_ID DATA_ID                                         PARAMETERS STATUS       MAE
            0    SVM_3    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772
            1    SVM_0    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.660815
            2    SVM_1    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.660815
            3    SVM_2    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772
            4    SVM_4    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772
            5    SVM_5    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772

        """

        if not (self.__is_evaluatable or self.__is_clustering_model):
            # Raise an error when the "model_stats" attribute is accessed for
            # non-evaluatable functions.
            err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
                                       "retrieve 'model_stats' attribute",
                                       "'model_stats' attribute not applicable "
                                       "for non-evaluatable function.")
            raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
        elif len(self.__model_eval_records) == 0:
            # Raise an error when no records are found.
            err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
                                       "retrieve 'model_stats' attribute",
                                       "No records found in 'model_stats' "
                                       "attribute.")
            raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)


        # All the model records are stored in the '__model_eval_records' records.
        # Since "model_stats" returns a pandas DataFrame, one has to construct
        # the pandas DataFrame '__model_stats' from '__model_eval_records'. This
        # construction should be done only when it is appropriate, i.e., only
        # when a new model record is pushed to '__model_eval_records' should the
        # pandas DataFrame for model_stats be constructed. Otherwise, store it
        # and reuse it. Check whether a new model record was generated by
        # comparing the number of model records present in '__model_eval_records'
        # with the existing number of records in '__model_stats'.
        _is_model_stats_updated = self.__model_stats is None or \
                                  len(self.__model_eval_records) != \
                                  self.__model_stats.shape[0]

        # Update '__model_stats' when the model stats records are updated.
        if _is_model_stats_updated:
            # Set '__model_stats' with the model evaluation report.

            # Exclude the "models" attribute specific columns.
            _df_cols = ["PARAMETERS", "STATUS", self.__DATA_ID.upper()]

            # Create a pandas dataframe for the recorded evaluation report by
            # excluding the 'PARAMETERS', 'STATUS' and data identifier columns.
            self.__model_stats = pd.DataFrame(self.__model_eval_records).drop(
                columns=_df_cols, axis=1)

        return self.__model_stats

    def is_running(self):
        """
        DESCRIPTION:
            Check whether hyperparameter tuning has completed or not. The function
            returns True when execution is in progress. Otherwise, it returns False.

        PARAMETERS:
            None

        RAISES:
            None

        RETURNS:
            bool

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Retrieve the model execution status.
            >>> optimizer_obj.is_running()
            False
        """
        # Check whether all models are executed based on the model training records count.
        # Note: Model training records are updated at the end of execution, and the
        # list append operation is thread-safe. Hence, the following method works for
        # both parallel and sequential execution.
        return self.__is_model_training_completed()
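
    # Usage sketch (hypothetical, assuming fit() was started in a non-blocking
    # mode):
    #     while optimizer_obj.is_running():
    #         time.sleep(1)
    # polls the evaluation records until every parameter combination has
    # produced a report.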

    def _add_data_label(self, arg_name=None):
        """
        DESCRIPTION:
            Internal function to label the teradataml DataFrames for model trainer
            functions. Labels are added for input data, except for
            dictionary-formatted DataFrames, since those already contain
            custom data labels.

        PARAMETERS:
            arg_name:
                Optional Argument.
                Specifies the model trainer argument name for unsupervised
                model trainer functions.
                Notes:
                    * The "arg_name" argument is not supported for model trainer
                      functions (evaluatable functions), since argument names are
                      added in the data sampling method.
                    * "arg_name" is added to the training data of unsupervised
                      model trainer functions.
                Types: str

        RETURNS:
            dictionary

        RAISES:
            None

        EXAMPLES:
            >>> # Example 1: A tuple of DataFrames is passed.
            >>> # Assign the DataFrames to be labeled.
            >>> self.__model_trainer_input_data = (DF1, DF2)
            >>> # Call the '_add_data_label' method for labelling.
            >>> self._add_data_label()
            {'DF_0': DF1, 'DF_1': DF2}

            >>> # Example 2: A dictionary of DataFrames is passed.
            >>> # This test case is specific to unsupervised
            >>> # model trainer functions.
            >>> # Assign labelled dataframes.
            >>> self.__model_trainer_input_data = {"data-1":DF1, "data-2":DF2}
            >>> # Call the '_add_data_label' method to add the argument name and
            >>> # reframe the structure into the generic labelled format.
            >>> self._add_data_label(arg_name="data")
            {"data-1": {'data': DF1}, "data-2": {'data': DF2} }

            >>> # Example 3: A tuple of DataFrames is passed.
            >>> # This test case is specific to unsupervised
            >>> # model trainer functions.
            >>> # Assign labelled dataframes.
            >>> self.__model_trainer_input_data = (DF1, DF2)
            >>> # Call the '_add_data_label' method to add the argument name and
            >>> # data labels. The resulting structure contains unique data labels
            >>> # in dictionary format.
            >>> # Assign labels for dataframes with the data argument name.
            >>> self._add_data_label(arg_name="data")
            {"DF_0": {'data': DF1}, "DF_1": {'data': DF2} }

            >>> # Example 4: A single DataFrame is passed.
            >>> # Assign the DataFrame to be labeled.
            >>> self.__model_trainer_input_data = DF1
            >>> # Call the '_add_data_label' method for labelling.
            >>> self._add_data_label()
            {'DF_0': DF1}
        """

        _labeled_data = {}

        if isinstance(self.__model_trainer_input_data, DataFrame):
            # Provide the default data identifier "DF_0" when
            # '__model_trainer_input_data' contains a single DataFrame.
            _df_id = self._generate_dataframe_name("DF", 0)
            # Record labeled data using a unique data identifier.
            # Note: "arg_name" is added to the data of unsupervised model trainer
            # functions while adding the data identifier.
            _labeled_data[_df_id] = self.__model_trainer_input_data if arg_name is None \
                                    else {arg_name: self.__model_trainer_input_data}
        elif isinstance(self.__model_trainer_input_data, tuple):
            # Assign a default data identifier sequence when
            # '__model_trainer_input_data' contains a tuple of DataFrames.
            for _index, _data in enumerate(self.__model_trainer_input_data):
                _df_id = self._generate_dataframe_name("DF", _index)
                # Record labeled data using a unique data identifier.
                # Note: "arg_name" is added to the data of unsupervised model trainer
                # functions while adding the data identifier.
                _labeled_data[_df_id] = _data if arg_name is None else \
                                        {arg_name: _data}
        elif isinstance(self.__model_trainer_input_data, dict) and arg_name:
            # This condition updates unsupervised model trainer functions' data.
            # Assign "arg_name" to all the data items when
            # '__model_trainer_input_data' contains dictionary-formatted DataFrames.
            # Note: Dictionary keys specify data identifiers (labels) and
            # values specify DataFrames (training data).
            for _data_id in self.__model_trainer_input_data:
                _arg_name_added = {arg_name: self.__model_trainer_input_data[_data_id]}
                _labeled_data[_data_id] = _arg_name_added

        return _labeled_data

    def __perform_train_test_sampling(self, data, frac, stratify_column=None,
                                      sample_id_column=None, sample_seed=None):
        """
        DESCRIPTION:
            Internal function to perform a train-test split for multiple DataFrames.
            The train-test split uses the given fraction to sample the train and
            test DataFrames. After sampling, the parameter grid is updated with
            the train and test DataFrames.

            Notes:
                * Sampled DataFrames are stored in the following format:
                  [<Train_DF>, <Test_DF>]
                * Each sampled DataFrame is mapped to a unique data identifier.

        PARAMETERS:
            data:
                Required Argument.
                Specifies the teradataml DataFrames that need to be sampled.
                Types: dictionary of DataFrames.

            frac:
                Required Argument.
                Specifies the split percentage of rows to be sampled for the
                training and testing datasets. The "frac" argument value must
                range between (0, 1).
                Notes:
                    * The "frac" argument is not supported for non-model trainer
                      functions.
                    * The "frac" value is considered the train split percentage,
                      and the remaining percentage is taken into account for the
                      test split.
                Types: float

            sample_seed:
                Optional Argument.
                Specifies the seed value that controls the shuffling applied
                to the data before applying the train-test split. Pass an int for
                reproducible output across multiple function calls.
                Notes:
                    * When the argument is not specified, different
                      runs of the query generate different outputs.
                    * It must be in the range [0, 2147483647].
                    * Seed is supported for stratify column.
                Types: int

            stratify_column:
                Optional Argument.
                Specifies the column name that contains the labels indicating
                which data needs to be stratified for the train-test split.
                Notes:
                    * Seed is supported for stratify column.
                Types: str

            sample_id_column:
                Optional Argument.
                Specifies the input data column name that has the
                unique identifier for each row in the input.
                Note:
                    * Mandatory when the "sample_seed" argument is present.
                Types: str

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> _labeled_df = {'DF_0': DF1, 'DF_1': DF2}
            >>> # Sample the labeled DataFrames.
            >>> self.__perform_train_test_sampling(_labeled_df)
            {'DF_0': [{'data':DF1_Train}, {'newdata':DF1_Test}],
             'DF_1': [{'data':DF2_Train}, {'newdata':DF2_Test}]}
        """
        # Validate the range of the "frac" argument value.
        _Validators._validate_argument_range(arg=frac, arg_name='frac',
                                             lbound=0.0, ubound=1.0)

        self.__sampled_df_mapper = {}
        for _data_id in data:
            # Set up the train and test input data argument names according to
            # the function. Apart from the "KNN" function, all other SQLE and VAL
            # functions take "data" as the training input data argument.
            train_data_arg = self._get_model_trainer_train_data_arg()
            # The test input data argument name varies across functions, so
            # retrieve the stored information.
            test_data_arg = self._TRAINABLE_FUNCS_DATA_MAPPER[self.__func_name]

            # Perform sampling based on the given "frac" value.
            # Consider the "frac" value as the train percentage and the remainder
            # as the test percentage for the train-test split.
            train_test_sample = data[_data_id].sample(frac=[frac, round(1 - frac, 2)],
                                                      stratify_column=stratify_column,
                                                      id_column=sample_id_column,
                                                      seed=sample_seed)
            # Materialize the sample. Otherwise, split consistency is lost.
            train_test_sample.materialize()

            _sample_id = "sampleid"
            _split_value = [1, 2]

            # Create the train DataFrame.
            _train_data = train_test_sample[
                train_test_sample[_sample_id] == _split_value[0]].drop(
                _sample_id, axis=1)

            # Create the test DataFrame.
            _test_data = train_test_sample[
                train_test_sample[_sample_id] == _split_value[1]].drop(
                _sample_id, axis=1)

            # Materialize the train and test datasets.
            _train_data.materialize()
            _test_data.materialize()

            # Update the train and test datasets using the data id with the train
            # and test arguments. This is a unique data structure to store the
            # train and test sampled data for model trainer functions.
            self.__sampled_df_mapper[_data_id] = [{train_data_arg: _train_data},
                                                  {test_data_arg: _test_data}]

    def __update_model_parameters(self):
        """
        DESCRIPTION:
            Internal function to update the parameter grid with multiple
            dataframes using unique data identifiers. This function performs a
            cartesian product of the parameter grid and the data identifiers.
            Hence, hyperparameter tuning is performed on all DataFrames.

            Notes:
                * This function is only applicable for model trainer functions
                  (supervised and unsupervised models).
                * The '_sampled_df_mapper' variable must contain labeled data
                  before updating the parameter grid, since a unique data
                  identifier is added to all parameters present in the
                  parameter grid.

        PARAMETERS:
            None

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> _labeled_df = {'DF_0': DF1, 'DF_1': DF2}
            >>> # Sample the labeled DataFrames.
            >>> self.__perform_train_test_sampling(_labeled_df)
            {'DF_0': [{'data':DF1_Train}, {'newdata':DF1_Test}],
             'DF_1': [{'data':DF2_Train}, {'newdata':DF2_Test}]}
            >>> self.__update_model_parameters()
            [
             {'param': {'input_columns': ['age', 'survived', 'pclass'],
                        'response_column': 'fare', 'max_depth': 10, 'lambda1': 1000.0,
                        'model_type': 'regression', 'seed': -1, 'shrinkage_factor': 0.1,
                        'iter_num': 2},
              'data_id': 'DF_0'},
             {'param': {'input_columns': ['age', 'survived', 'pclass'],
                        'response_column': 'fare', 'max_depth': 10, 'lambda1': 1000.0,
                        'model_type': 'regression', 'seed': -1, 'shrinkage_factor': 0.1,
                        'iter_num': 50},
              'data_id': 'DF_1'}
            ]
        """
        # Get the data identifiers.
        _model_ids = self.__sampled_df_mapper.keys()
        # Update '_parameter_grid' with the data identifiers by performing a
        # cartesian product.
        self._parameter_grid = [{"param": param[0], self.__DATA_ID: param[1]} for
                                param in product(self._parameter_grid, _model_ids)]
|
|
1000
|
+
|
|
1001
|
+
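    # A minimal sketch of the cartesian product performed above, using only
    # itertools; the grid and identifiers below are illustrative values.
    #
    #   from itertools import product
    #   grid = [{"max_depth": 5}, {"max_depth": 10}]
    #   ids = ["DF_0", "DF_1"]
    #   expanded = [{"param": p, "data_id": d} for p, d in product(grid, ids)]
    #   # len(expanded) == 4: every parameter set is paired with every data id.
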
    def __validate_model_trainer_input_data_argument(self, data, is_optional_arg=True):
        """
        DESCRIPTION:
            Internal function to validate the input data of a model trainer
            function. This function validates a single DataFrame, multiple
            DataFrames, and multiple DataFrames with user-defined data labels.
            Notes:
                * This function is only applicable for model trainer functions
                  (supervised, and unsupervised models).

        PARAMETERS:
            data:
                Required Argument.
                Specifies the input teradataml DataFrame for the model trainer
                function.
                Notes:
                    * "data" is a required argument for model trainer functions.
                    * "data" is ignored for non-model trainer functions.
                    * "data" can contain a single DataFrame or multiple DataFrames.
                    * Multiple DataFrames must be specified using a tuple or a
                      dictionary as follows.
                        * Tuple:
                            gs.fit(data=(df1, df2), **eval_params)

                        * Dictionary:
                            gs.fit(data={"data-1":df1, "data-2":df2}, **eval_params)
                Types: teradataml DataFrame, dictionary, tuple

            is_optional_arg:
                Optional Argument.
                Specifies whether the passed data argument value is an optional
                argument or not.
                Default Value: True
                Types: bool

        RETURNS:
            None

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLES:
            >>> self.__validate_model_trainer_input_data_argument(data,
                                                                  _is_optional_arg)

        """
        # Validate "data" for model trainer functions.
        arg_info_matrix = []
        if isinstance(data, tuple):
            # Validate all DataFrames present in the tuple.
            for _data in data:
                arg_info_matrix.append(["data", _data, is_optional_arg, (DataFrame)])
        elif isinstance(data, dict):
            # Validate all DataFrames present in dictionary format.
            for _data_id in data:
                arg_info_matrix.append(["data", data[_data_id], is_optional_arg, (DataFrame)])
        else:
            # Validate the DataFrame.
            arg_info_matrix.append(["data", data, is_optional_arg, (DataFrame)])
        _Validators._validate_function_arguments(arg_info_matrix)

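    # For reference, a sketch of the arg_info_matrix row layout as used above
    # and in fit() below. The meaning of each slot is inferred from usage in
    # this module, not from documented API, so treat it as an assumption:
    #
    #   # [arg_name, arg_value, is_optional, expected_types,
    #   #  <check flag>, <permitted values>]   <- last two seen with "verbose"
    #   row = ["verbose", 1, True, (int), True, [0, 1, 2]]
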
    def _regression_metrics(self, y_true, y_pred):
        # Compute standard regression metrics using OpenSourceML scikit-learn.
        from teradataml import td_sklearn as skl

        ME = skl.max_error(y_true=y_true, y_pred=y_pred)

        MAE = skl.mean_absolute_error(y_true=y_true, y_pred=y_pred)

        MSE = skl.mean_squared_error(y_true=y_true, y_pred=y_pred)

        try:
            MSLE = skl.mean_squared_log_error(y_true=y_true, y_pred=y_pred)
        except Exception:
            # MSLE is undefined for negative values; report "NA" in that case.
            MSLE = "NA"

        MAPE = skl.mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)

        R2 = skl.r2_score(y_true=y_true, y_pred=y_pred)

        EV = skl.explained_variance_score(y_true=y_true, y_pred=y_pred)

        MAD = skl.median_absolute_error(y_true=y_true, y_pred=y_pred)

        # TODO: Support for MPD, MGD, MTD will be added in the next phase.
        # Support for RMSE, RMSLE will be added after the OpenSourceML
        # scikit-learn version update, as it requires a higher version (>1.1.3).
        """MPD = skl.mean_poisson_deviance(y_true, y_pred)
        MGD = skl.mean_gamma_deviance(y_true, y_pred)
        MTD = skl.mean_tweedie_deviance(y_true, y_pred)"""

        keys = ["MAE", "MSE", "MSLE", "MAPE", "R2", "EV", "ME", "MAD"]
        values = [MAE, MSE, MSLE, MAPE, R2, EV, ME, MAD]
        return dict(zip(keys, values))

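    # A local scikit-learn sketch of the same metric computations; td_sklearn
    # mirrors these call signatures but runs in-database. The arrays below are
    # illustrative.
    #
    #   from sklearn.metrics import mean_absolute_error, r2_score
    #   y_true, y_pred = [3.0, 5.0, 2.5], [2.8, 5.1, 2.9]
    #   mean_absolute_error(y_true, y_pred)   # 0.2333...
    #   r2_score(y_true, y_pred)              # 0.94
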
    def _classification_metrics(self, y_true, y_pred):
        from teradataml import td_sklearn as skl

        # Basic classification metrics.
        accuracy = skl.accuracy_score(y_true=y_true, y_pred=y_pred)

        # Precision, Recall, F1 (micro, macro, weighted averages).
        micro_precision = skl.precision_score(y_true=y_true, y_pred=y_pred, average='micro')
        micro_recall = skl.recall_score(y_true=y_true, y_pred=y_pred, average='micro')
        micro_f1 = skl.f1_score(y_true=y_true, y_pred=y_pred, average='micro')

        macro_precision = skl.precision_score(y_true=y_true, y_pred=y_pred, average='macro')
        macro_recall = skl.recall_score(y_true=y_true, y_pred=y_pred, average='macro')
        macro_f1 = skl.f1_score(y_true=y_true, y_pred=y_pred, average='macro')

        weighted_precision = skl.precision_score(y_true=y_true, y_pred=y_pred, average='weighted')
        weighted_recall = skl.recall_score(y_true=y_true, y_pred=y_pred, average='weighted')
        weighted_f1 = skl.f1_score(y_true=y_true, y_pred=y_pred, average='weighted')

        keys = [
            "ACCURACY", "MICRO-PRECISION", "MICRO-RECALL", "MICRO-F1",
            "MACRO-PRECISION", "MACRO-RECALL", "MACRO-F1",
            "WEIGHTED-PRECISION", "WEIGHTED-RECALL", "WEIGHTED-F1"
        ]
        values = [
            accuracy, micro_precision, micro_recall, micro_f1,
            macro_precision, macro_recall, macro_f1,
            weighted_precision, weighted_recall, weighted_f1
        ]
        return dict(zip(keys, values))

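    # A worked example of how the averaging modes above differ, using plain
    # scikit-learn (same signatures as td_sklearn); the labels are illustrative.
    #
    #   from sklearn.metrics import precision_score
    #   y_true = [0, 0, 0, 0, 1]
    #   y_pred = [0, 0, 0, 1, 1]
    #   precision_score(y_true, y_pred, average='micro')     # 0.8  (global TP ratio)
    #   precision_score(y_true, y_pred, average='macro')     # 0.75 (unweighted class mean)
    #   precision_score(y_true, y_pred, average='weighted')  # 0.9  (support-weighted mean)
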
    def fit(self,
            data=None,
            evaluation_metric=None,
            early_stop=None,
            frac=0.8,
            run_parallel=True,
            wait=True,
            verbose=0,
            stratify_column=None,
            sample_id_column=None,
            sample_seed=None,
            max_time=None,
            **kwargs):
        """
        DESCRIPTION:
            Function to run the teradataml analytic function for all sets of
            hyperparameters. Sets of hyperparameters are chosen for execution
            from the parameter grid, where the parameter grid is populated
            based on the search algorithm.
            Notes:
                * For model trainer functions, the best parameters are
                  selected based on training results.
                * For non-model trainer functions, the first executed
                  parameter set is selected as the best parameters.

        PARAMETERS:
            data:
                Optional Argument.
                Specifies the input teradataml DataFrame for the model trainer
                function.
                Notes:
                    * A DataFrame need not be passed in the fit() method when
                      "data" is passed as a model hyperparameter ("params").
                    * "data" is a required argument for model trainer functions.
                    * "data" is ignored for non-model trainer functions.
                    * "data" can contain a single DataFrame or multiple DataFrames.
                    * One can pass multiple dataframes to "data". Hyperparameter
                      tuning is performed on all the dataframes for every model
                      parameter.
                    * "data" can be either a dictionary OR a tuple OR a dataframe.
                        * If it is a dictionary, then the Key represents the
                          label for the dataframe and the Value represents the
                          dataframe.
                        * If it is a tuple, then teradataml converts it to a
                          dictionary by generating the labels internally.
                        * If it is a dataframe, then teradataml labels it as "DF_0".
                Types: teradataml DataFrame, dictionary, tuple

            evaluation_metric:
                Optional Argument.
                Specifies the evaluation metric to be considered for model
                evaluation.
                Notes:
                    * "evaluation_metric" is applicable for model trainer functions.
                    * The best model is not selected when evaluation returns
                      non-finite values.
                    * MPD, MGD, RMSE, RMSLE are not supported for OpenSourceML models.
                Permitted Values:
                    * Classification: Accuracy, Micro-Precision, Micro-Recall,
                                      Micro-F1, Macro-Precision, Macro-Recall,
                                      Macro-F1, Weighted-Precision,
                                      Weighted-Recall, Weighted-F1.
                    * Regression: MAE, MSE, MSLE, MAPE, MPE, RMSE, RMSLE, ME,
                                  R2, EV, MPD, MGD
                    * Clustering: SILHOUETTE
                Default Value:
                    * Classification: Accuracy
                    * Regression: MAE
                    * Clustering: SILHOUETTE
                Types: str

            early_stop:
                Optional Argument.
                Specifies the early stop mechanism value for model trainer
                functions. Hyperparameter tuning ends model training when
                the training model evaluation metric attains the "early_stop" value.
                Note:
                    * Early stopping is supported only when evaluation returns a
                      finite value.
                Types: int or float

            frac:
                Optional Argument.
                Specifies the split percentage of rows to be sampled for the
                training and testing datasets. The "frac" argument value must be
                in the range (0, 1).
                Notes:
                    * The "frac" argument is not supported for non-model trainer
                      functions.
                    * The "frac" value is considered the train split percentage,
                      and the remaining percentage is used for the test split.
                Default Value: 0.8
                Types: float

            run_parallel:
                Optional Argument.
                Specifies the parallel execution functionality of hyperparameter
                tuning. When "run_parallel" is set to True, model functions are
                executed concurrently. Otherwise, model functions are executed
                sequentially.
                Note:
                    * Early stopping is not supported when parallel run is
                      enabled.
                Default Value: True
                Types: bool

            wait:
                Optional Argument.
                Specifies whether to wait for the completion of execution
                of hyperparameter tuning or not. When set to False, hyperparameter
                tuning is executed in the background and the user can use the
                "is_running()" method to check the status. Otherwise, it waits
                until the execution is complete to return control back to the user.
                Default Value: True
                Types: bool

            verbose:
                Optional Argument.
                Specifies whether to log the model training information and display
                the logs. When it is set to 1, only the progress bar is logged to
                the console. When it is set to 2, execution steps and execution
                time are logged to the console along with the progress bar. When
                it is set to 0, nothing is logged to the console.
                Note:
                    * "verbose" is not significant when "wait" is 'False'.
                Default Value: 0
                Types: int

            sample_seed:
                Optional Argument.
                Specifies the seed value that controls the shuffling applied
                to the data before applying the train-test split. Pass an int for
                reproducible output across multiple function calls.
                Notes:
                    * When the argument is not specified, different
                      runs of the query generate different outputs.
                    * It must be in the range [0, 2147483647].
                    * Seed is supported for the stratify column.
                Types: int

            stratify_column:
                Optional Argument.
                Specifies the column name that contains the labels indicating
                which data needs to be stratified for the train-test split.
                Note:
                    * Seed is supported for the stratify column.
                Types: str

            sample_id_column:
                Optional Argument.
                Specifies the input data column name that has the
                unique identifier for each row in the input.
                Note:
                    * Mandatory when the "sample_seed" argument is present.
                Types: str

            max_time:
                Optional Argument.
                Specifies the maximum time for the completion of hyperparameter
                tuning execution.
                Default Value: None
                Types: int or float

            kwargs:
                Optional Argument.
                Specifies the keyword arguments. Accepts additional arguments
                required for the teradataml analytic function.

        RETURNS:
            None

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the fit() operation on the "optimizer_obj".

            >>> eval_params = {"id_column": "id",
                               "accumulate": "MedHouseVal"}
            >>> # Example 1: Passing a single DataFrame for a model trainer function.
            >>> optimizer_obj.fit(data=train_df,
                                  evaluation_metric="MAE",
                                  early_stop=70.9,
                                  **eval_params)

            >>> # Example 2: Passing multiple datasets as a tuple of DataFrames for
            >>> # a model trainer function.
            >>> optimizer_obj.fit(data=(train_df_1, train_df_2),
                                  evaluation_metric="MAE",
                                  early_stop=70.9,
                                  **eval_params)

            >>> # Example 3: Passing multiple datasets as a dictionary of DataFrames
            >>> # for a model trainer function.
            >>> optimizer_obj.fit(data={"Data-1":train_df_1, "Data-2":train_df_2},
                                  evaluation_metric="MAE",
                                  early_stop=70.9,
                                  **eval_params)

            >>> # Example 4: No data argument passed in the fit() method for a model
            >>> # trainer function.
            >>> # Note: The data argument must be passed while creating the HPT object
            >>> # as model hyperparameters.

            >>> # Define the parameter space for model training with the "data" argument.
            >>> params = {"data":(df1, df2),
                          "input_columns":['MedInc', 'HouseAge', 'AveRooms',
                                           'AveBedrms', 'Population', 'AveOccup',
                                           'Latitude', 'Longitude'],
                          "response_column":"MedHouseVal",
                          "model_type":"regression",
                          "batch_size":(11, 50, 75),
                          "iter_max":(100, 301),
                          "intercept":False,
                          "learning_rate":"INVTIME",
                          "nesterov":True,
                          "local_sgd_iterations":1}

            >>> # Create "optimizer_obj" using any search algorithm and perform the
            >>> # fit() method without any "data" argument for a model trainer function.
            >>> optimizer_obj.fit(evaluation_metric="MAE",
                                  early_stop=70.9,
                                  **eval_params)

            >>> # Example 5: Do not pass the data argument in the fit() method for a
            >>> # non-model trainer function.
            >>> # Note: The data argument must be passed while creating the HPT
            >>> # object as model hyperparameters.
            >>> optimizer_obj.fit()

            >>> # Example 6: Passing the "verbose" argument value '1' in the fit()
            >>> # method to display the model log.
            >>> optimizer_obj.fit(data=train_df, evaluation_metric="R2",
                                  verbose=1, **eval_params)
            completed: |████████████████████████████████████████████████████████████| 100% - 6/6

        """

        # Set the flag to note that the fit method has been called.
        self.__is_fit_called = True

        # Validate the arguments.
        arg_info_matrix = []
        arg_info_matrix.append(["early_stop", early_stop, True, (int, float)])
        arg_info_matrix.append(["frac", frac, True, (float)])
        arg_info_matrix.append(["run_parallel", run_parallel, True, (bool)])
        arg_info_matrix.append(["wait", wait, True, (bool)])
        arg_info_matrix.append(["evaluation_metric", evaluation_metric, True,
                                (str), True, list(self.__osml_func_comparator)
                                             if self.__is_opensource_model
                                             else list(self.__func_comparator)])
        arg_info_matrix.append(["verbose", verbose, True, (int), True, [0, 1, 2]])
        arg_info_matrix.append(["max_time", max_time, True, (int, float)])

        _Validators._validate_function_arguments(arg_info_matrix)

        # Set the timeout value.
        self.__timeout = max_time

        self._setting_model_trainer_data(data)

        # Set the evaluation metrics.
        if evaluation_metric is not None:
            self.__evaluation_metric = evaluation_metric.upper()
        self.__early_stop = early_stop
        if self.__is_trainable and self.__is_evaluatable and self.__is_sqle_function:

            # When "evaluation_metric" is 'MPE', use the special comparators.
            if self.__evaluation_metric == "MPE":
                self._is_best_metrics = self._is_early_stoppable = self._spl_abs_comparator

            if not isinstance(self.__model_trainer_input_data, dict):
                # Sample all the labeled data for model training and testing.
                self.__perform_train_test_sampling(self._labeled_data, frac, stratify_column,
                                                   sample_id_column, sample_seed)

            elif isinstance(self.__model_trainer_input_data, dict):
                # Sample all the custom labeled data for model training and testing.
                self.__perform_train_test_sampling(self.__model_trainer_input_data, frac,
                                                   stratify_column, sample_id_column,
                                                   sample_seed)
            # Update the model trainer function parameter grid.
            self.__update_model_parameters()

            self.__eval_params = kwargs if self.__is_evaluatable else None

        elif self.__is_trainable and self.__is_opensource_model:

            if self.__is_clustering_model:
                self.__sampled_df_mapper = self._add_data_label("data")
                # Update the model trainer function parameter grid.
                self.__update_model_parameters()
            elif self.__is_regression_model or self.__is_classification_model:
                # Open-source regression or classification model: perform the
                # train-test split.
                if not isinstance(self.__model_trainer_input_data, dict):
                    self.__perform_train_test_sampling(self._labeled_data, frac, stratify_column,
                                                       sample_id_column, sample_seed)
                elif isinstance(self.__model_trainer_input_data, dict):
                    self.__perform_train_test_sampling(self.__model_trainer_input_data, frac,
                                                       stratify_column, sample_id_column,
                                                       sample_seed)
                # Set evaluation parameters for supervised models.
                self.__eval_params = kwargs if self.__is_evaluatable else None

                self.__update_model_parameters()

        elif self.__is_trainable and not self.__is_evaluatable:
            # This condition identifies unsupervised model trainer functions.
            # Let's process the training data.
            # Note: Every unsupervised model training data argument is named 'data'.
            # Label the data with the model training argument name.
            self.__sampled_df_mapper = self._add_data_label("data")
            # Update the model trainer function parameter grid.
            self.__update_model_parameters()
        # Initialize logging.
        if verbose > 0:
            self.__progress_bar = _ProgressBar(jobs=len(self._parameter_grid), verbose=verbose)

        # With the VT option, parallel execution is not possible, as it opens
        # multiple connections.
        if not run_parallel or configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
            # Set the start time of the sequential execution.
            self.__start_time = time.time() if self.__timeout is not None else None
            # TODO: Factorize the code once the parallel execution part is completed in the ELE-6154 JIRA.
            # Execute all parameters from the populated parameter grid for both
            # trainable and non-trainable functions.
            for iter, param in enumerate(self._parameter_grid):
                self._execute_fit(model_param=param, iter=iter, **kwargs)

                # Check whether the early stop feature is applicable for the
                # model trainer function.
                if self.__early_stop is not None and (self.__is_evaluatable or self.__is_clustering_model):
                    if self.__is_finite and self._is_early_stoppable():
                        # Terminate HPT execution when the trained model attains the
                        # given "early_stop" value.
                        break
                    elif not self.__is_finite:
                        # Raise an error because non-finite values cannot be compared
                        # with the "__early_stop" value effectively.
                        # Reset the best models and other properties before raising the error.
                        self.__default_model = self.__best_model = self.__best_score_ = \
                            self.__best_model_id = self.__best_params_ = None
                        err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
                                                   "execute 'fit()'", "Early stop feature is not applicable"\
                                                   " when the '{metric}' metric results in an inconsistent value.".format(
                                                       metric=self.__evaluation_metric))
                        raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
                if self.__timeout is not None and self._is_time_stoppable():
                    # Terminate HPT execution when the execution time exceeds the
                    # given time limit.
                    break

        else:
            # TODO: Add support for the early_stop feature along with concurrency in the ELE-6154 JIRA.
            # Functions are executed concurrently.
            # Prepare the parameter grid for concurrent execution.
            async_exec_params = []
            for iter, param in enumerate(self._parameter_grid):
                _temp_params = {}
                _temp_params["iter"] = iter
                _temp_params["model_param"] = param
                _temp_params.update(kwargs)
                async_exec_params.append(_temp_params)

            # Initialize the stopping event.
            self.__parallel_stop_event = threading.Event()
            # Initialize "_AsyncDBExecutor".
            self._async_executor = _AsyncDBExecutor(wait=wait)
            # Set the start time of the parallel execution.
            self.__start_time = time.time() if self.__timeout is not None else None
            # Trigger parallel thread execution.
            self._async_executor.submit(self._execute_fit, *async_exec_params)

        if len(self.__model_err_records) > 0 and not kwargs.get('suppress_refer_msg', False):
            print('\nAn error occurred during Model Training.'\
                  ' Refer to get_error_log() for more details.')

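    # A minimal sketch of the cooperative-stop pattern used by the parallel
    # path above: workers poll a shared threading.Event and skip further work
    # once any worker sets it. The names below are illustrative.
    #
    #   import threading
    #   stop = threading.Event()
    #   def worker(task):
    #       if stop.is_set():
    #           return "SKIP"
    #       result = task()
    #       if result.should_stop:   # e.g., an early-stop threshold was reached
    #           stop.set()
    #       return result
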
    def __model_trainer_routine(self, model_param, iter, **kwargs):
        """
        DESCRIPTION:
            Internal function to perform fit, predict and evaluate operations
            for model trainer functions. This model trainer routine supports
            Teradata analytic functions that support the merge model feature.

        PARAMETERS:
            model_param:
                Required Argument.
                Specifies the model trainer arguments used for model training.
                Notes:
                    * "model_param" contains both model training parameters
                      and the sampled data id.
                    * Model training parameters are retrieved from "model_param"
                      using the 'param' key.
                    * The sampled data identifier is retrieved from "model_param"
                      using the 'data_id' key.
                Types: dict

            iter:
                Required Argument.
                Specifies the iteration count of HPT execution for the teradataml
                analytic function.
                Types: int

            kwargs:
                Required Argument.
                Specifies the keyword arguments used for model evaluation.
                Accepts additional required arguments for the model trainer
                function evaluation.

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> self.__model_trainer_routine(param=param, iter=iter, **kwargs)
        """
        # Define the model name used for model metadata.
        model_name = self._generate_model_name(iter)
        # Get the unique data identifier present in "model_param".
        _data_id = model_param[self.__DATA_ID]
        # The 'param' variable holds model training parameters and the train
        # dataframe. Get the model training parameters.
        if self.__is_opensource_model:
            param_outer = model_param.get("param", {})
            param = param_outer.get("param", param_outer)
            data_input = param.pop("data", None)
            param = {k: v for k, v in param.items() if k != "data"}
        else:
            param = model_param["param"]
            data_input = None

        # Check whether the stop event is set or not.
        if self.__parallel_stop_event is not None and self.__parallel_stop_event.is_set():
            # Update the model metadata for the skipped execution.
            self.__update_model_metadata(model_name, param, "SKIP", 0, 0, 0, _data_id)
            return

        # Retrieve the train and test data using the data identifier.
        if self.__is_opensource_model:

            if self.__is_clustering_model:
                _train_data = self.__sampled_df_mapper[_data_id]
                _test_data = {}  # No label needed.
            elif self.__is_regression_model or self.__is_classification_model:
                _train_data, _test_data = self.__sampled_df_mapper[_data_id]
                kwargs.update(_test_data)
        else:
            _train_data, _test_data = self.__sampled_df_mapper[_data_id]
            # Update the model training argument with the train DataFrame.
            param.update(_train_data)
            # Update the test DataFrame for model evaluation.
            kwargs.update(_test_data)

        try:
            # Record the starting time of model training.
            start_time = time.perf_counter()
            if self.__is_val_function:
                # VAL uses a special framework, so create a new instance
                # using the getattr method.
                self.__func = valib.__getattr__(self.__func_name)
            # Train the model.
            if self.__is_opensource_model:
                from teradataml import td_sklearn as skl
                func_class = getattr(skl, self.__func_name)  # e.g., skl.KMeans
                if self.__is_regression_model or self.__is_classification_model:
                    # Extract and remove these only for regression and
                    # classification models.
                    self.__input_columns = param.pop("input_columns", None)
                    self.__response_column = param.pop("response_column", None)

                func_obj = func_class(**param)  # Safely create the model instance.
            else:
                func_obj = self.__func(**param)
            end_time = time.perf_counter()
            training_time = round((end_time - start_time), 3)
            # Store the trained object.
            self.__trained_models[model_name] = func_obj

            if self.__is_opensource_model and self.__is_clustering_model:
                start_time_cluster = time.perf_counter()
                from teradataml import td_sklearn as skl
                feature_cols = [col for col in _train_data["data"].columns]
                func_obj.fit(data=_train_data["data"], feature_columns=feature_cols)
                pred_col = self._get_predict_column()
                result = func_obj.predict(data=_train_data["data"], feature_columns=feature_cols)
                result.materialize()

                silhouette = skl.silhouette_score(
                    X=result.select(feature_cols),
                    labels=result.select([pred_col])
                )

                calinski = skl.calinski_harabasz_score(
                    X=result.select(feature_cols),
                    labels=result.select([pred_col])
                )

                davies = skl.davies_bouldin_score(
                    X=result.select(feature_cols),
                    labels=result.select([pred_col])
                )

                columns = ["SILHOUETTE", "CALINSKI", "DAVIES"]
                eval_values = [silhouette, calinski, davies]
                eval_key_values = dict(zip(columns, eval_values))

                end_time_cluster = time.perf_counter()
                training_time_cluster = round((end_time_cluster - start_time_cluster), 3)

                if self.__evaluation_metric is None:
                    self.__evaluation_metric = "SILHOUETTE"

                self.__update_model_metadata(model_name, param, "PASS", training_time_cluster,
                                             end_time_cluster, start_time_cluster, _data_id, eval_key_values)
            elif self.__is_opensource_model and (self.__is_regression_model or self.__is_classification_model):
                start_time_lin = time.perf_counter()
                train_df = _train_data["data"]
                y = train_df.select([self.__response_column])
                X = train_df.drop(columns=[self.__response_column], axis=1)

                func_obj.fit(X, y)
                pred_col = self._get_predict_column()

                output = func_obj.predict(X, y)

                y_true = output.select([self.__response_column])
                y_pred = output.select([pred_col])

                if self.__is_regression_model:
                    eval_key_values = self._regression_metrics(y_true, y_pred)
                    if self.__evaluation_metric is None:
                        self.__evaluation_metric = "MAE"
                elif self.__is_classification_model:
                    eval_key_values = self._classification_metrics(y_true, y_pred)
                    if self.__evaluation_metric is None:
                        self.__evaluation_metric = "ACCURACY"

                end_time_lin = time.perf_counter()
                training_time_lin = round((end_time_lin - start_time_lin), 3)

                self.__update_model_metadata(model_name, param, "PASS", training_time_lin,
                                             end_time_lin, start_time_lin, _data_id, eval_key_values)
            else:
                # Evaluate the trained model.
                evaluations = func_obj.evaluate(**kwargs)
                # Extract the evaluation report in dictionary format.
                if "RegressionEvaluator" in type(evaluations).__name__:
                    # RegressionEvaluator results are stored under the "result"
                    # attribute. The "result" dataframe column names are metrics
                    # and the corresponding rows are evaluation values.
                    columns = evaluations.result.keys()
                    eval_values = evaluations.result.get_values()[0]

                    # The default evaluation metric is set to "MAE" for
                    # regression models.
                    if self.__evaluation_metric is None:
                        self.__evaluation_metric = "MAE"

                else:
                    # ClassificationEvaluator results are stored under the
                    # "output_data" attribute. In the "output_data" dataframe,
                    # 'column 1' contains metrics and 'column 2' holds the
                    # corresponding evaluation values.
                    eval_report = evaluations.output_data.get_values().transpose()
                    columns = eval_report[1].astype('str')
                    columns = [column_name.upper() for column_name in columns]
                    eval_values = eval_report[2]

                    # The default evaluation metric is set to "ACCURACY" for
                    # classification models.
                    if self.__evaluation_metric is None:
                        self.__evaluation_metric = "ACCURACY"

                # Combine columns and eval_values into a dictionary.
                eval_key_values = dict(zip(columns, eval_values))
                # Update the model metadata for successful model training.
                self.__update_model_metadata(model_name, param, "PASS",
                                             training_time, end_time, start_time,
                                             _data_id, eval_key_values)

            # Check whether self.__parallel_stop_event is None or not.
            if self.__parallel_stop_event is not None:
                # Set self.__parallel_stop_event when the trained model
                # evaluation metric value exceeds self.__early_stop, or when
                # the execution time exceeds self.__timeout.
                if (self.__early_stop is not None and self._is_early_stoppable())\
                        or (self.__timeout is not None and self._is_time_stoppable()):
                    self.__parallel_stop_event.set()

        except Exception as _err_msg:
            # Record the error message with the corresponding "model_name".
            self.__model_err_records[model_name] = str(_err_msg)
            # Compute the execution time for the failed training.
            end_time = time.perf_counter()
            training_time = round((end_time - start_time), 3)
            # Update the model metadata for the failed execution.
            self.__update_model_metadata(model_name, param, "FAIL", training_time,
                                         end_time, start_time, _data_id)

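    # A local scikit-learn sketch of the three clustering scores computed in
    # the routine above (td_sklearn mirrors these signatures); the toy data
    # below is illustrative.
    #
    #   from sklearn.metrics import (silhouette_score,
    #                                calinski_harabasz_score,
    #                                davies_bouldin_score)
    #   X = [[0, 0], [0, 1], [10, 10], [10, 11]]
    #   labels = [0, 0, 1, 1]
    #   silhouette_score(X, labels)          # near 1.0: well-separated clusters
    #   calinski_harabasz_score(X, labels)   # higher is better
    #   davies_bouldin_score(X, labels)      # lower is better
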
    def __non_model_trainer_routine(self, model_param, iter, **kwargs):
        """
        DESCRIPTION:
            Internal function to perform fit operations for non-model
            trainer functions. This non-model trainer routine supports
            teradata analytic functions.
            Note:
                * Non-evaluatable model trainer functions are also trained in
                  this routine.

        PARAMETERS:
            model_param:
                Required Argument.
                Specifies the model trainer arguments used for model execution.
                Notes:
                    * "model_param" contains both model training parameters
                      and the data id for non-evaluatable model trainer
                      functions.
                    * Model training parameters are retrieved from "model_param"
                      using the 'param' key for non-evaluatable functions.
                    * The data identifier is retrieved from "model_param"
                      using the 'data_id' key for non-evaluatable functions.
                    * No pre-processing is required in "model_param" for
                      non-model trainer functions.
                    * For non-model trainer functions, the DataFrame itself is
                      present instead of a data identifier.
                Types: dict

            iter:
                Required Argument.
                Specifies the iteration count of HPT execution for the teradataml
                analytic function.
                Types: int

            kwargs:
                Optional Argument.
                Specifies the keyword arguments. Accepts additional arguments
                required for the teradataml analytic function.

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> self.__non_model_trainer_routine(param=param, iter=iter, **kwargs)
        """
        # Define the model name used for model metadata.
        model_name = self._generate_model_name(iter)

        # The 'param' variable holds model training parameters and the train dataframe.
        param = None
        _data_id = None
        # Update the model training argument with the train dataframe for
        # unsupervised models.
        if self.__is_trainable and not self.__is_evaluatable:
            # Get the model training data id.
            _data_id = model_param[self.__DATA_ID]
            # Retrieve the train data using the data id.
            _train_data = self.__sampled_df_mapper[_data_id]
            # Get the model training params.
            param = model_param["param"]
            # Update the params with the training data.
            param.update(_train_data)
        else:
            # Initialize param for non-model trainer functions.
            param = model_param
        # Check whether the stop event is set or not.
        if self.__parallel_stop_event is not None and self.__parallel_stop_event.is_set():
            # Update the model metadata for the skipped execution.
            self.__update_model_metadata(model_name, param, "SKIP", 0, 0, 0, _data_id)
            return
        try:
            # Record the starting time of model training.
            start_time = time.perf_counter()
            if self.__is_val_function:
                # VAL uses a special framework, so create a new instance
                # using the getattr method.
                self.__func = valib.__getattr__(self.__func_name)

            # Train the model.
            func_obj = self.__func(**param)

            # Store the trained object.
            self.__trained_models[model_name] = func_obj

            # Process the training time.
            end_time = time.perf_counter()
            training_time = round((end_time - start_time), 3)
            # Update the model metadata for successful model training.
            self.__update_model_metadata(model_name, param, "PASS", training_time, end_time, start_time, _data_id)
        except Exception as _err_msg:
            # Record the error message with the corresponding "model_name".
            self.__model_err_records[model_name] = str(_err_msg)
            # Compute the execution time for the failed training.
            end_time = time.perf_counter()
            training_time = round((end_time - start_time), 3)
            # Update the model metadata for the failed execution.
            self.__update_model_metadata(model_name, param, "FAIL", training_time, end_time, start_time, _data_id)

        if self.__parallel_stop_event is not None:
            # Set self.__parallel_stop_event when the execution time exceeds
            # self.__timeout.
            if self.__timeout is not None and self._is_time_stoppable():
                self.__parallel_stop_event.set()

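    # The timing pattern used by both routines above, in isolation:
    # time.perf_counter() is a monotonic, high-resolution clock, so the
    # difference of two readings is a reliable duration. "do_work" is an
    # illustrative placeholder.
    #
    #   import time
    #   start_time = time.perf_counter()
    #   do_work()
    #   training_time = round(time.perf_counter() - start_time, 3)
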
    def __update_model_metadata(self, model_name,
                                param,
                                status,
                                training_time,
                                end_time,
                                start_time,
                                data_id=None,
                                eval_key_values=None):
        """
        DESCRIPTION:
            Internal function to update the model evaluation details that are
            used for the "models" and "model_stats" properties.

        PARAMETERS:
            model_name:
                Required Argument.
                Specifies the unique model name for the training model.
                Types: str

            param:
                Required Argument.
                Specifies the model trainer function parameters used for
                model training.
                Types: dict

            status:
                Required Argument.
                Specifies the status of the executed teradataml analytic function.
                Permitted Values:
                    * PASS: Function result is present in Vantage.
                    * FAIL: Function execution failed for the chosen parameters.
                    * SKIP: Function execution skipped for the chosen parameters.
                Types: str

            training_time:
                Required Argument.
                Specifies the model training time in seconds for both model
                trainer functions and non-model trainer functions.
                Types: float

            end_time:
                Optional Argument.
                Specifies the end time of the model training.
                Types: float

            start_time:
                Optional Argument.
                Specifies the start time of the model training.
                Types: float

            data_id:
                Optional Argument.
                Specifies the unique data identifier used for model training.
                Note:
                    * "data_id" is supported for model trainer functions.
                Types: str

            eval_key_values:
                Optional Argument.
                Specifies the evaluation key values retrieved from the model
                evaluation phase. This argument is a required argument for
                model trainer functions.
                Types: dict

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> optimizer_obj.__update_model_metadata(self,
                                                      evaluations=evaluation_obj.result,
                                                      iter=1, params={"columns" :
                                                      ["age", "nbr_children", "income"],
                                                      "response_column" : "years_with_bank"},
                                                      status="Present")

        """
        # Prepare the model metadata.
        model_metadata = {"MODEL_ID" : model_name,
                          "PARAMETERS" : param,
                          "STATUS" : status}
        if self.__is_trainable:
            # Update "data_id" for model trainer functions.
            model_metadata[self.__DATA_ID.upper()] = data_id

        # Format the log message to be displayed.
        _msg = "Model_id:{}, Run time:{}s, Start time:{}, End time:{}, Status:{}".format(model_name,
                                                                                         training_time,
                                                                                         start_time,
                                                                                         end_time,
                                                                                         status)

        if status == "PASS" and (self.__is_evaluatable or self.__is_clustering_model):
            # Update the metadata with the model evaluation scores.
            model_scores = eval_key_values
            model_metadata.update(model_scores)
            # Add the model score to the log message.
            if self.__is_opensource_model and (self.__evaluation_metric is None or self.__evaluation_metric not in model_scores):
                if "SILHOUETTE" in model_scores:
                    self.__evaluation_metric = "SILHOUETTE"
            _msg += ",{}:{}".format(self.__evaluation_metric,
                                    round(model_scores[self.__evaluation_metric], 3))
            # Best model update.
            # 'self.__is_finite' holds 'True' until any infinite value is seen.
            self.__is_finite = self.__is_finite and np.isfinite(model_metadata[
                self.__evaluation_metric])

            # Check whether the evaluation result is finite and the model is
            # the new best model.
            if np.isfinite(model_metadata[self.__evaluation_metric]) and \
                    (self.__best_score_ is None or
                     self._is_best_metrics(model_metadata[self.__evaluation_metric])):
                # Update the existing best model.
                self.__default_model = self.__best_model = \
                    self.__trained_models[model_name]
                # Update the existing best score.
                self.__best_score_ = model_metadata[self.__evaluation_metric]
                # Update the existing best model ID.
                self.__best_model_id = model_name
                # "self.__best_params_" contains the best model parameters.
                self.__best_params_ = param
                # "__best_data_id" contains the best data identifier used for
                # training the best model.
                self.__best_data_id = data_id

        if self.__progress_bar is not None and status != 'SKIP':
            # Update the progress bar when logging is required.
            self.__progress_bar.update(msg=_msg)
        # Update "__model_eval_records" with the formatted metadata.
        self.__model_eval_records.append(model_metadata)

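    # A sketch of the finiteness guard used above when ranking models: NaN and
    # infinities compare unreliably, so they are excluded from best-model
    # selection. The values are illustrative.
    #
    #   import numpy as np
    #   np.isfinite(0.93)           # True  -> eligible for the best-model update
    #   np.isfinite(float("nan"))   # False -> skipped
    #   np.isfinite(float("inf"))   # False -> skipped
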
    def predict(self, **kwargs):
        """
        DESCRIPTION:
            Function uses the models generated by model trainer functions from
            the SQLE, VAL and UAF features for predictions. Predictions are
            made using the best trained model. The predict function is not
            supported for non-model trainer functions.

        PARAMETERS:
            kwargs:
                Optional Argument.
                Specifies the keyword arguments. Accepts all merge model
                predict feature arguments required for the teradataml
                analytic function predictions.

        RETURNS:
            Output teradataml DataFrames can be accessed using attribute
            references, such as HPTObj.<attribute_name>.
            Output teradataml DataFrame attribute name is:
                result

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Perform prediction using "optimizer_obj".
            >>> optimizer_obj.predict(newdata=test_data, **eval_params)
                 id  prediction  MedHouseVal
            0   686    0.202843        1.578
            1  2018    0.149868        0.578
            2  1754    0.211870        1.651
            3   670    0.192414        1.922
            4   244    0.247545        1.117
        """

        # Raise a TeradataMlException when a non-model trainer function
        # identifier is passed.
        if not self.__is_trainable or not self.__is_predictable:
            err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
                                       "execute 'predict()'", "Not applicable for" \
                                       " non-model trainer analytic functions.")
            raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)

        if self.__default_model is None:
            err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
                                       "execute 'predict()'",
                                       "No model is set as default. To set a "\
                                       "prediction model, use the 'set_model()' function.")
            raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)

        test_data = kwargs.get("newdata", None)

        if self.__is_opensource_model and self.__is_clustering_model:
            if test_data is None:
                test_data = self.__sampled_df_mapper[self.__best_data_id]["data"]
            feature_columns = kwargs.get("feature_columns", None)

            # If feature columns are not passed, fetch them from the training data.
            if feature_columns is None:
                if self.__best_data_id is None:
                    err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
                                               "fetch 'feature_columns'",
                                               "No training metadata found")
                    raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
                training_df = self.__sampled_df_mapper[self.__best_data_id]["data"]
                training_columns = training_df.columns

                feature_columns = [col for col in training_columns]

            return self.__default_model.predict(data=test_data, feature_columns=feature_columns)
        elif self.__is_opensource_model and (self.__is_regression_model or self.__is_classification_model):
            if test_data is None:
                test_data = self.__sampled_df_mapper[self.__best_data_id][1]["data"]
            y_test = test_data.select([self.__response_column])
            X_test = test_data.drop(columns=[self.__response_column], axis=1)

            return self.__default_model.predict(X_test, y_test)
        # TODO: Enable this method once the merge model feature supports VAL and UAF.
        return self.__default_model.predict(**kwargs)

    def get_input_data(self, data_id):
        """
        DESCRIPTION:
            Function to get the input data used by model trainer functions.
            A unique identifier (data_id) is used to get the training data.
            In the case of unlabeled data, such as a single dataframe or a
            tuple of dataframes, default unique identifiers are assigned.
            Hence, unlabeled training data is retrieved using the default
            unique identifiers.
            Notes:
                * The function only returns input data for model trainer functions.
                * Train and test sampled data are returned for supervised
                  model trainer functions (evaluatable functions).
                * Train data is returned for unsupervised model trainer functions
                  (non-evaluatable functions).

        PARAMETERS:
            data_id:
                Required Argument.
                Specifies the unique data identifier used for model training.
                Types: str

        RETURNS:
            teradataml DataFrame

        RAISES:
            ValueError

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Retrieve the training data.
            >>> optimizer_obj.get_input_data(data_id="DF_1")
            [{'data':       id  MedHouseVal    MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  Longitude
            0  19789        0.660 -1.154291 -0.668250  0.862203   7.021803   -1.389101 -1.106515  2.367716  -1.710719
            1  17768        1.601 -0.447350 -0.162481 -0.431952  -0.156872    2.436223  2.172854  0.755780  -1.016640
            2  19722        0.675 -0.076848  1.439120  1.805547   1.944759   -1.186169  0.326739  1.459894  -0.974996
            3  18022        3.719  1.029892  0.343287  0.635952  -0.480133   -0.914869 -0.160824  0.711496  -1.067540
            4  15749        3.500 -0.182247  1.776299 -0.364226   0.035715   -0.257239 -0.970166  0.941772  -1.294272
            5  11246        2.028 -0.294581 -0.583955 -0.265916  -0.270654    0.182266 -0.703494 -0.807444   0.764827
            6  16736        3.152  0.943735  1.439120 -0.747066  -1.036053   -1.071138 -0.678411  0.906345  -1.234118
            7  12242        0.775 -1.076758 -0.752545 -0.424517   0.460470    0.742228 -0.597809 -0.838443   1.241428
            8  14365        2.442 -0.704218  1.017646 -0.428965  -0.367301   -1.014707 -1.333045 -1.294568   1.121121
            9  18760        1.283  0.019018 -1.258313  0.754993   0.013994    0.094365  0.222254  2.195008  -1.201728},
             {'newdata':       id  MedHouseVal    MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  Longitude
            0  16102        2.841  0.206284  1.270530 -0.248620  -0.224210   -0.059733 -0.242386  0.937344  -1.317408
            1  15994        3.586  0.306050  1.439120  0.255448  -0.334613   -0.160657 -0.426510  0.937344  -1.303526
            2  15391        2.541  0.423107 -1.595492  0.951807  -0.061005    1.955480  0.517572 -1.055434   1.236801
            3  18799        0.520 -0.677565 -0.415366  0.548756   1.254406   -0.883398 -0.534060  2.358859  -1.035149
            4  19172        1.964  0.247152 -0.162481  0.428766  -0.427459   -0.175849 -0.451380  1.238475  -1.396070
            5  18164        3.674  0.295345 -1.258313 -1.078181   0.175885    0.045531 -1.298667  0.760208  -1.099930
            6  13312        1.598  0.484475 -1.342608  0.767557  -0.229585    0.113899  0.361520 -0.692306   0.949915
            7  12342        1.590 -0.520029 -0.246776  0.973345   1.407755    2.325532 -0.406887 -0.798587   1.445024}]

        """
        # Validation.
        arg_info_matrix = []
        arg_info_matrix.append(["data_id", data_id, False, str,
                                True, list(self.__sampled_df_mapper.keys())])

        # "data_id" argument validation.
        # "data_id" is validated for argument type and permitted values.
        _Validators._validate_function_arguments(arg_info_matrix)

        return self.__sampled_df_mapper.get(data_id)

    def get_model(self, model_id):
        """
        DESCRIPTION:
            Function to get the model.

        PARAMETERS:
            model_id:
                Required Argument.
                Specifies the unique identifier for the model.
                Notes:
                    * Trained model results are returned for model trainer
                      functions.
                    * Executed function results are returned for non-model
                      trainer functions.
                Types: str

        RETURNS:
            Object of teradataml analytic functions.
            Note:
                * Attribute references remain the same as those of the function
                  attributes.

        RAISES:
            TeradataMlException, ValueError

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on the optimizer_obj to populate model records.
            >>> # Retrieve the trained model.
            >>> optimizer_obj.get_model(model_id="SVM_1")
            ############ output_data Output ############

               iterNum      loss       eta  bias
            0        3  2.265289  0.028868   0.0
            1        5  2.254413  0.022361   0.0
            2        6  2.249260  0.020412   0.0
            3        7  2.244463  0.018898   0.0
            4        9  2.235800  0.016667   0.0
            5       10  2.231866  0.015811   0.0
            6        8  2.239989  0.017678   0.0
            7        4  2.259956  0.025000   0.0
            8        2  2.271862  0.035355   0.0
            9        1  2.280970  0.050000   0.0

            ############ result Output ############

                       predictor                 estimate                value
            attribute
            -7         Alpha                      0.50000           Elasticnet
            -3         Number of Observations    31.00000                 None
            5          Population                -0.32384                 None
            0          (Intercept)                0.00000                 None
            -17        OneClass SVM                   NaN                FALSE
            -16        Kernel                         NaN               LINEAR
            -1         Loss Function                  NaN  EPSILON_INSENSITIVE
            7          Latitude                   0.00000                 None
            -9         Learning Rate (Initial)    0.05000                 None
            -14        Epsilon                    0.10000                 None

        """
        # Validations.
        arg_info_matrix = []
        arg_info_matrix.append(["model_id", model_id, False, str,
                                True, list(self.__trained_models.keys())])

        # "model_id" argument validations.
        # "model_id" is validated for argument type and permitted values.
        _Validators._validate_function_arguments(arg_info_matrix)

        # Get the trained model object of the trained model.
        model_obj = self.__trained_models.get(model_id)
        # Raise a teradataml exception when the HPT "fit" method has not been
        # executed, since "self.__trained_models" does not contain a record
        # for retrieval.
        if model_obj is None:
            err = Messages.get_message(MessageCodes.MODEL_NOT_FOUND,
                                       model_id, ' or not created')
            raise TeradataMlException(err, MessageCodes.MODEL_NOT_FOUND)

        return model_obj

def get_error_log(self, model_id):
|
|
2206
|
+
"""
|
|
2207
|
+
DESCRIPTION:
|
|
2208
|
+
Function to get the error logs of a failed model training in the fit method.
|
|
2209
|
+
|
|
2210
|
+
PARAMETERS:
|
|
2211
|
+
model_id:
|
|
2212
|
+
Required Argument.
|
|
2213
|
+
Specifies the unique identifier for model.
|
|
2214
|
+
Note:
|
|
2215
|
+
* Only failed model training error log is returned.
|
|
2216
|
+
Types: str
|
|
2217
|
+
|
|
2218
|
+
RETURNS:
|
|
2219
|
+
string
|
|
2220
|
+
|
|
2221
|
+
RAISES:
|
|
2222
|
+
TypeError, ValueError
|
|
2223
|
+
|
|
2224
|
+
EXAMPLES:
|
|
2225
|
+
>>> # Create an instance of the search algorithm called "optimizer_obj"
|
|
2226
|
+
>>> # by referring "__init__()" method.
|
|
2227
|
+
>>> # Perform "fit()" method on the optimizer_obj to populate model records.
|
|
2228
|
+
>>> # Retrieve the error log.
|
|
2229
|
+
>>> optimizer_obj.get_error_log("SVM_2")
|
|
2230
|
+
"[Teradata][teradataml](TDML_2082) Value of 'iter_max' must be greater
|
|
2231
|
+
than or equal to 1 and less than or equal to 10000000."
|
|
2232
|
+
|
|
2233
|
+
"""
|
|
2234
|
+
# Validations
|
|
2235
|
+
arg_info_matrix = []
|
|
2236
|
+
arg_info_matrix.append(["model_id", model_id, False, str,
|
|
2237
|
+
True, list(self.__model_err_records.keys())])
|
|
2238
|
+
|
|
2239
|
+
# "model_id" argument validations.
|
|
2240
|
+
# "model_id" validates for argument type, and permitted values.
|
|
2241
|
+
_Validators._validate_function_arguments(arg_info_matrix)
|
|
2242
|
+
|
|
2243
|
+
# Retrieve the raw error message
|
|
2244
|
+
msg = self.__model_err_records.get(model_id)
|
|
2245
|
+
|
|
2246
|
+
# For opensource models, return trimmed message
|
|
2247
|
+
if self.__is_opensource_model:
|
|
2248
|
+
return msg.split("\n", 1)[0].strip()
|
|
2249
|
+
|
|
2250
|
+
# For generic models, return original message
|
|
2251
|
+
return msg
|
|
2252
|
+
|
|
2253
|
+
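The opensource branch above keeps only the first line of a multi-line error message. A minimal standalone sketch of that trimming, using a made-up message rather than an actual teradataml log:

```python
# Sketch of the error-log trimming used above for opensource models.
# The message below is a hypothetical example, not a real teradataml log.
raw_msg = (
    "ValueError: Value of 'iter_max' must be >= 1.\n"
    "Traceback (most recent call last):\n"
    '  File "model.py", line 42, in fit\n'
)

# Split once on the first newline, keep the head, and strip whitespace.
trimmed = raw_msg.split("\n", 1)[0].strip()
print(trimmed)  # ValueError: Value of 'iter_max' must be >= 1.
```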
    def set_model(self, model_id):
        """
        DESCRIPTION:
            Function to set the model to use for prediction.

        PARAMETERS:
            model_id:
                Required Argument.
                Specifies the unique identifier for the model.
                Note:
                    * Not significant for non-model trainer functions.
                Types: str

        RETURNS:
            None

        RAISES:
            TeradataMlException, ValueError

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on optimizer_obj to populate model records.
            >>> # Set the default trained model.
            >>> optimizer_obj.set_model(model_id="SVM_1")
        """
        # Raise a TeradataMlException when a non-model trainer function
        # identifier is passed.
        if not self.__is_trainable:
            err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
                                       "execute 'set_model()'",
                                       "Not applicable for non-model trainer analytic functions.")
            raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)

        # Replace the default model with the trained model.
        self.__default_model = self.get_model(model_id)

    def evaluate(self, **kwargs):
        """
        DESCRIPTION:
            Function uses trained models from the SQLE, VAL and UAF features for
            evaluation. Evaluations are made using the default trained model.
            Notes:
                * Evaluation is supported for evaluatable model trainer functions.
                * The best model is set as the default model by default.
                * The default model can be changed using the "set_model()" method.

        PARAMETERS:
            kwargs:
                Optional Argument.
                Specifies the keyword arguments. Accepts additional arguments
                required for teradataml analytic function evaluations.
                When "kwargs" is empty, the internally sampled test dataset
                and arguments are used for evaluation. Otherwise, all
                arguments required along with the validation data need to be
                passed for evaluation.

        RETURNS:
            Output teradataml DataFrames can be accessed using attribute
            references, such as HPTEvaluateObj.<attribute_name>.
            The output teradataml DataFrame attribute name is:
                result

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> # Create an instance of the search algorithm called "optimizer_obj"
            >>> # by referring to the "__init__()" method.
            >>> # Perform the "fit()" method on optimizer_obj to populate model records.
            >>> # Perform evaluation using the best model.
            >>> optimizer_obj.evaluate(newdata=test_data, **eval_params)
            ############ result Output ############
                    MAE       MSE  MSLE        MAPE         MPE      RMSE  RMSLE        ME       R2       EV  MPD  MGD
            0  2.616772  8.814968   0.0  101.876866  101.876866  2.969001    0.0  5.342344 -4.14622 -0.14862  NaN  NaN

        """

        # Raise a TeradataMlException when a non-model trainer function
        # identifier is passed.
        if not self.__is_trainable or not self.__is_evaluatable:
            if not self.__is_clustering_model:
                err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
                                           "execute 'evaluate()'",
                                           "Not applicable for non-model trainer analytic functions.")
                raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
            else:
                err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
                                           "execute 'evaluate()'",
                                           "Not applicable for clustering model functions.")
                raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)

        if self.__default_model is None:
            err = Messages.get_message(MessageCodes.EXECUTION_FAILED,
                                       "execute 'evaluate()'",
                                       "No model is set as default. To set a "
                                       "trained model for evaluation, use "
                                       "the 'set_model()' function.")
            raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
        if self.__is_opensource_model and (self.__is_regression_model or self.__is_classification_model):
            test_data = kwargs.get("newdata", None)

            if test_data is None:
                test_data = self.__sampled_df_mapper[self.__best_data_id][1]["data"]

            y_test = test_data.select([self.__response_column])
            X_test = test_data.drop(columns=[self.__response_column], axis=1)

            pred_col = self._get_predict_column()

            output = self.__default_model.predict(X_test, y_test)

            y_true = output.select([self.__response_column])
            y_pred = output.select([pred_col])

            if self.__is_regression_model:
                eval_key_values = self._regression_metrics(y_true, y_pred)
            elif self.__is_classification_model:
                eval_key_values = self._classification_metrics(y_true, y_pred)

            import pandas as pd
            result_df = pd.DataFrame([eval_key_values])
            return result_df
        else:
            _params = self.__eval_params if len(kwargs) == 0 else kwargs
            if self._TRAINABLE_FUNCS_DATA_MAPPER[self.__func_name] not in _params:
                _params.update(self.__sampled_df_mapper[self.__best_data_id][1])
            return self.__default_model.evaluate(**_params)

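The opensource-model branch above computes a metrics dict from true and predicted values and wraps it in a single-row pandas DataFrame. A minimal sketch of that flow, where `simple_regression_metrics` is a simplified stand-in for the module's `_regression_metrics` helper (not the actual implementation) and the values are illustrative:

```python
import pandas as pd

def simple_regression_metrics(y_true, y_pred):
    # Stand-in for _regression_metrics: a few common regression metrics.
    n = len(y_true)
    errors = [t - p for t, p in zip(y_true, y_pred)]
    return {
        "MAE": sum(abs(e) for e in errors) / n,   # mean absolute error
        "MSE": sum(e * e for e in errors) / n,    # mean squared error
        "ME": max(abs(e) for e in errors),        # maximum error
    }

y_true = [1.578, 0.578, 1.651, 1.922, 1.117]
y_pred = [0.203, 0.150, 0.212, 0.192, 0.248]

metrics = simple_regression_metrics(y_true, y_pred)
result_df = pd.DataFrame([metrics])  # one row, one column per metric
print(result_df)
```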
    def __populate_parameter_grid(self):
        """
        DESCRIPTION:
            Internal function to populate the parameter grid with all combinations.

        PARAMETERS:
            None

        RETURNS:
            List of dictionaries

        RAISES:
            None

        EXAMPLES:
            >>> self.__populate_parameter_grid()

        """
        param_pairs = []
        # Iterate over all the parameters to create argument name and value pairs.
        for arg, arg_value in self.__params.items():
            temp_params = []
            if isinstance(arg_value, tuple):
                # When the dictionary value type is a tuple, pair the argument
                # name with every value in the tuple.
                for value in arg_value:
                    temp_params.append((arg, value))
            else:
                # Pair the argument name with the single value.
                temp_params.append((arg, arg_value))

            # Append the name and value pairs to "param_pairs".
            param_pairs.append(temp_params)

        # Return a list of dictionaries containing all possible combinations.
        return [dict(param) for param in product(*param_pairs)]

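The grid expansion above is plain `itertools.product` over per-argument (name, value) pair lists: tuple-valued hyperparameters fan out, scalar ones are carried into every combination. A minimal standalone sketch, with illustrative parameter names:

```python
from itertools import product

params = {"batch_size": (11, 50), "iter_max": (100, 301), "alpha": 0.5}

# One (name, value) pair list per argument, mirroring "param_pairs" above.
param_pairs = [
    [(arg, v) for v in val] if isinstance(val, tuple) else [(arg, val)]
    for arg, val in params.items()
]

# product(*param_pairs) yields one pair per argument; dict() recombines them.
grid = [dict(combo) for combo in product(*param_pairs)]
for entry in grid:
    print(entry)
# 4 combinations: batch_size x iter_max = 2 x 2, each carrying alpha=0.5.
```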
    def _data_mapping(self):
        """
        DESCRIPTION:
            Internal function to create a Cartesian product of the data, mapped
            with input columns, and the parameter grid.

        PARAMETERS:
            None

        RETURNS:
            None
        """
        # Get the input columns from the params.
        input_columns = self.__params.pop("input_columns")
        # Create a list of dictionaries with data_id and input_columns.
        data_mapping_list = []
        # Iterate over the labeled data and create a list of dictionaries.
        for data_ids, data in self._labeled_data.items():
            # Check if all input columns are present in the data.
            for input_cols in input_columns:
                if all(col in data.columns for col in input_cols):
                    data_mapping_list.append({'data_id': data_ids,
                                              'input_columns': input_cols})

        self._parameter_grid = self.__populate_parameter_grid()

        cartesian_product = product(self._parameter_grid, data_mapping_list)

        result_list = []

        # Iterate over the Cartesian product and construct the desired dictionaries.
        for params, data_mapping in cartesian_product:
            result_dict = {
                'param': {**params, 'input_columns': data_mapping['input_columns']},
                self.__DATA_ID: data_mapping['data_id']
            }
            result_list.append(result_dict)

        self._parameter_grid = result_list

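The crossing above pairs every parameter combination with every (data_id, input_columns) mapping, so each grid entry carries both the hyperparameters and the dataset it should train on. A minimal sketch of that step, with illustrative ids and column names:

```python
from itertools import product

parameter_grid = [{"iter_max": 100}, {"iter_max": 301}]
data_mapping_list = [
    {"data_id": "DF_0", "input_columns": ["MedInc", "HouseAge"]},
    {"data_id": "DF_1", "input_columns": ["MedInc", "HouseAge"]},
]

# Each entry merges the hyperparameters with the dataset's input columns
# and records which dataset the combination belongs to.
result_list = [
    {"param": {**params, "input_columns": mapping["input_columns"]},
     "data_id": mapping["data_id"]}
    for params, mapping in product(parameter_grid, data_mapping_list)
]
print(len(result_list))  # 4 entries: 2 parameter combinations x 2 datasets
```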
    def _setting_model_trainer_data(self,
                                    data=None):
        """
        DESCRIPTION:
            Internal function to set the model trainer input data for model
            training.

        PARAMETERS:
            data:
                Optional Argument.
                Specifies the input data used for model training.
                Notes:
                    * The "data" argument is a required argument for a model trainer
                      function when the data argument is not passed with the
                      hyperparameters.
                    * When the data argument is passed with the hyperparameters,
                      the "data" argument is optional.
                Types: teradataml DataFrame

        RETURNS:
            None

        Example:
            >>> print(self.__model_trainer_input_data)
            (   id  admitted       gpa  stats  programming  masters
            0  19         0  0.051643    0.0          0.0      1.0
            1   6         1  0.765258    0.5          0.0      1.0
            2  15         1  1.000000    0.0          0.0      1.0
            3  32         0  0.746479    0.0          0.5      1.0
            4  12         1  0.835681    1.0          1.0      0.0
            5  40         0  0.976526    1.0          0.5      1.0
            6   7         1  0.215962    1.0          1.0      1.0
            7  36         0  0.530516    0.0          1.0      0.0
            8  28         1  0.967136    0.0          0.0      0.0
            9  17         1  0.920188    0.0          0.0      0.0,
                id  admitted       gpa  stats  programming  masters
            0   4         1  0.765258    0.5          1.0      1.0
            1   6         1  0.765258    0.5          0.0      1.0
            2   7         1  0.215962    1.0          1.0      1.0
            3   8         1  0.812207    0.5          0.0      0.0
            4  10         1  0.863850    0.0          0.0      0.0
            5  11         1  0.591549    0.0          0.0      0.0
            6   9         1  0.915493    0.0          0.0      0.0
            7   5         0  0.737089    1.0          1.0      0.0
            8   3         1  0.859155    1.0          0.5      0.0
            9   2         0  0.887324    0.5          0.5      1.0,
                id  admitted       gpa  stats  programming  masters
            0  23         1  0.807512    0.0          1.0      1.0
            1  25         1  0.981221    0.0          0.0      0.0
            2  26         1  0.798122    0.0          0.0      1.0
            3  27         0  0.981221    0.0          0.0      1.0
            4  29         0  1.000000    1.0          0.5      1.0
            5  30         0  0.901408    0.0          1.0      1.0
            6  28         1  0.967136    0.0          0.0      0.0
            7  24         1  0.000000    0.0          1.0      0.0
            8  22         0  0.746479    1.0          0.5      1.0
            9  21         1  0.938967    1.0          0.5      0.0)

            >>> print(self._labeled_data)
            {'DF_0':    id  admitted       gpa  stats  programming  masters
            0  26         1  0.798122    0.0          0.0      1.0
            1  40         0  0.976526    1.0          0.5      1.0
            2   7         1  0.215962    1.0          1.0      1.0
            3  19         0  0.051643    0.0          0.0      1.0
            4  15         1  1.000000    0.0          0.0      1.0
            5  32         0  0.746479    0.0          0.5      1.0
            6  38         1  0.366197    0.0          0.5      1.0
            7  12         1  0.835681    1.0          1.0      0.0
            8   6         1  0.765258    0.5          0.0      1.0
            9  36         0  0.530516    0.0          1.0      0.0,
            'DF_1':    id  admitted       gpa  stats  programming  masters
            0   4         1  0.765258    0.5          1.0      1.0
            1   6         1  0.765258    0.5          0.0      1.0
            2   7         1  0.215962    1.0          1.0      1.0
            3   8         1  0.812207    0.5          0.0      0.0
            4  10         1  0.863850    0.0          0.0      0.0
            5  11         1  0.591549    0.0          0.0      0.0
            6   9         1  0.915493    0.0          0.0      0.0
            7   5         0  0.737089    1.0          1.0      0.0
            8   3         1  0.859155    1.0          0.5      0.0
            9   2         0  0.887324    0.5          0.5      1.0,
            'DF_2':    id  admitted       gpa  stats  programming  masters
            0  23         1  0.807512    0.0          1.0      1.0
            1  25         1  0.981221    0.0          0.0      0.0
            2  26         1  0.798122    0.0          0.0      1.0
            3  27         0  0.981221    0.0          0.0      1.0
            4  29         0  1.000000    1.0          0.5      1.0
            5  30         0  0.901408    0.0          1.0      1.0
            6  28         1  0.967136    0.0          0.0      0.0
            7  24         1  0.000000    0.0          1.0      0.0
            8  22         0  0.746479    1.0          0.5      1.0
            9  21         1  0.938967    1.0          0.5      0.0}
        """
        if self.__is_trainable:
            # The "data" argument is a required argument for a model trainer function
            # when the data argument is not passed with the hyperparameters. On the
            # other hand, the "data" argument is optional when the data argument
            # is passed with the hyperparameters.
            _is_optional_arg = self.__model_trainer_input_data is not None
            # Validate the model trainer function 'data' argument.
            self.__validate_model_trainer_input_data_argument(data, _is_optional_arg)

            if data is not None:
                # '__model_trainer_input_data' is assigned the "data" argument
                # when the user passes the data argument in the fit() method.
                # Note: if the user passes the data argument both in the "params"
                #       argument as a hyperparameter and as the "data" argument in
                #       the fit() method, then the latest "data" argument value is
                #       considered for model training.
                self.__model_trainer_input_data = data

        if self.__is_trainable and self.__is_evaluatable and self.__is_sqle_function:
            self._labeled_data = self._add_data_label()
        elif self.__is_trainable and self.__is_evaluatable and not self.__is_clustering_model:
            self._labeled_data = self._add_data_label()

class GridSearch(_BaseSearch):
    def __init__(self, func, params):
        """
        DESCRIPTION:
            GridSearch is an exhaustive search algorithm that covers all possible
            parameter values to identify optimal hyperparameters. It works for
            teradataml analytic functions from the SQLE, BYOM, VAL and UAF features.
            teradataml GridSearch allows the user to perform hyperparameter tuning
            for all model trainer and non-model trainer functions.
            When used for model trainer functions:
                * Based on evaluation metrics, the search determines the best model.
                * All methods and properties can be used.
            When used for non-model trainer functions:
                * Only the fit() method is supported.
                * The user can choose the best output as they see fit.

            teradataml GridSearch also allows the user to use input data as a
            hyperparameter. This option is suitable when the user wants to
            identify the best models for a set of input data. When the user passes
            a set of data as a hyperparameter for a model trainer function, the
            search determines the best data along with the best model based on the
            evaluation metrics.
            Note:
                * configure.temp_object_type="VT" follows sequential execution.

        PARAMETERS:
            func:
                Required Argument.
                Specifies a teradataml analytic function from SQLE, VAL, and UAF.
                Types:
                    teradataml Analytic Functions
                        * Advanced analytic functions
                        * UAF
                        * VAL
                    Refer to the display_analytic_functions() function for the list of functions.

            params:
                Required Argument.
                Specifies the parameter(s) of a teradataml analytic function.
                The parameter(s) must be in a dictionary. Keys refer to the
                argument names and values refer to the argument values for the
                corresponding arguments.
                Notes:
                    * One can specify an argument value in a tuple to run HPT
                      with different arguments.
                    * Model trainer function arguments "id_column", "input_columns",
                      and "target_columns" must be passed in the fit() method.
                    * All required arguments of a non-model trainer function must
                      be passed during GridSearch object creation.
                Types: dict

        RETURNS:
            None

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLES:
            >>> # Example 1: Model trainer function. Performing hyperparameter-tuning
            >>> # on the SVM model trainer function.

            >>> # Load the example data.
            >>> load_example_data("teradataml", ["cal_housing_ex_raw"])

            >>> # Create teradataml DataFrame objects.
            >>> data_input = DataFrame.from_table("cal_housing_ex_raw")

            >>> # Scale "target_columns" with respect to the 'STD' value of the column.
            >>> fit_obj = ScaleFit(data=data_input,
                                   target_columns=['MedInc', 'HouseAge', 'AveRooms',
                                                   'AveBedrms', 'Population', 'AveOccup',
                                                   'Latitude', 'Longitude'],
                                   scale_method="STD")

            >>> # Transform the data.
            >>> transform_obj = ScaleTransform(data=data_input,
                                               object=fit_obj.output,
                                               accumulate=["id", "MedHouseVal"])

            >>> # Define the parameter space for model training.
            >>> params = {"input_columns":['MedInc', 'HouseAge', 'AveRooms',
                                           'AveBedrms', 'Population', 'AveOccup',
                                           'Latitude', 'Longitude'],
                          "response_column":"MedHouseVal",
                          "model_type":"regression",
                          "batch_size":(11, 50, 75),
                          "iter_max":(100, 301),
                          "lambda1":0.1,
                          "alpha":0.5,
                          "iter_num_no_change":60,
                          "tolerance":0.01,
                          "intercept":False,
                          "learning_rate":"INVTIME",
                          "initial_data":0.5,
                          "decay_rate":0.5,
                          "momentum":0.6,
                          "nesterov":True,
                          "local_sgd_iterations":1}

            >>> # Required arguments for model prediction and evaluation.
            >>> eval_params = {"id_column": "id",
                               "accumulate": "MedHouseVal"}

            >>> # Import the trainer function and optimizer.
            >>> from teradataml import SVM, GridSearch

            >>> # Initialize the GridSearch optimizer with the model trainer
            >>> # function and the parameter space required for model training.
            >>> gs_obj = GridSearch(func=SVM, params=params)

            >>> # Perform model optimization for the SVM function.
            >>> # Evaluation and prediction arguments are passed along with
            >>> # the training dataframe.
            >>> gs_obj.fit(data=transform_obj.result, **eval_params)

            >>> # View trained models.
            >>> gs_obj.models
              MODEL_ID DATA_ID                                         PARAMETERS STATUS       MAE
            0    SVM_3    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772
            1    SVM_0    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.660815
            2    SVM_1    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.660815
            3    SVM_2    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772
            4    SVM_4    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772
            5    SVM_5    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772

            >>> # View model evaluation stats.
            >>> gs_obj.model_stats
              MODEL_ID DATA_ID                                         PARAMETERS STATUS       MAE
            0    SVM_3    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772
            1    SVM_0    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.660815
            2    SVM_1    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.660815
            3    SVM_2    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772
            4    SVM_4    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772
            5    SVM_5    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.616772

            >>> # View the best data ID, model ID and score.
            >>> print("Best data ID: ", gs_obj.best_data_id)
            Best data ID:  DF_0
            >>> print("Best model ID: ", gs_obj.best_model_id)
            Best model ID:  SVM_3
            >>> print("Best model score: ", gs_obj.best_score_)
            Best model score:  2.616772068334627

            >>> # Perform prediction on sampled data using the best trained model.
            >>> test_data = transform_obj.result.iloc[:5]
            >>> gs_pred = gs_obj.predict(newdata=test_data, **eval_params)
            >>> print("Prediction result: \n", gs_pred.result)
            Prediction result:
                 id  prediction  MedHouseVal
            0   686    0.202843        1.578
            1  2018    0.149868        0.578
            2  1754    0.211870        1.651
            3   670    0.192414        1.922
            4   244    0.247545        1.117

            >>> # Perform evaluation using the best model.
            >>> gs_obj.evaluate()
            ############ result Output ############
                    MAE       MSE  MSLE        MAPE         MPE      RMSE  RMSLE        ME       R2       EV  MPD  MGD
            0  2.616772  8.814968   0.0  101.876866  101.876866  2.969001    0.0  5.342344 -4.14622 -0.14862  NaN  NaN

            >>> # Retrieve any trained model.
            >>> gs_obj.get_model("SVM_1")
            ############ output_data Output ############

               iterNum      loss       eta  bias
            0        3  2.060386  0.028868   0.0
            1        5  2.055509  0.022361   0.0
            2        6  2.051982  0.020412   0.0
            3        7  2.048387  0.018898   0.0
            4        9  2.041521  0.016667   0.0
            5       10  2.038314  0.015811   0.0
            6        8  2.044882  0.017678   0.0
            7        4  2.058757  0.025000   0.0
            8        2  2.065932  0.035355   0.0
            9        1  1.780877  0.050000   0.0


            ############ result Output ############

                               predictor   estimate       value
            attribute
            7                   Latitude   0.155095        None
            -9   Learning Rate (Initial)   0.050000        None
            -17             OneClass SVM        NaN       FALSE
            -14                  Epsilon   0.100000        None
            5                 Population   0.000000        None
            -12                 Nesterov        NaN        TRUE
            -5                       BIC  73.297397        None
            -7                     Alpha   0.500000  Elasticnet
            -3    Number of Observations  55.000000        None
            0                (Intercept)   0.000000        None

            >>> # Update the default model.
            >>> gs_obj.set_model("SVM_1")


            >>> # Example 2: Model trainer function. Performing hyperparameter-tuning
            >>> # on the SVM model trainer function using unlabeled multiple dataframes.

            >>> # Slice the transformed dataframe into two parts to demonstrate
            >>> # multiple-dataframe support.

            >>> train_df_1 = transform_obj.result.iloc[:30]
            >>> train_df_2 = transform_obj.result.iloc[30:]

            >>> # Initialize the GridSearch optimizer with the model trainer
            >>> # function and the parameter space required for model training.
            >>> gs_obj = GridSearch(func=SVM, params=params)

            >>> # Perform model optimization for the SVM function with
            >>> # unlabeled multiple-dataframe support.
            >>> # Evaluation and prediction arguments are passed along with
            >>> # the training dataframe.
            >>> gs_obj.fit(data=(train_df_1, train_df_2), **eval_params)

            >>> # View trained models.
            >>> gs_obj.models
               MODEL_ID DATA_ID                                         PARAMETERS STATUS       MAE
            0     SVM_3    DF_1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.650505
            1     SVM_1    DF_1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.650505
            2     SVM_2    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.326521
            3     SVM_0    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.326521
            4     SVM_7    DF_1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.650505
            5     SVM_4    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.326521
            6     SVM_6    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.326521
            7     SVM_5    DF_1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.650505
            8     SVM_9    DF_1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.650505
            9    SVM_10    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.326521
            10   SVM_11    DF_1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.650505
            11    SVM_8    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.326521
            >>> # View model evaluation stats.
            >>> gs_obj.model_stats
               MODEL_ID       MAE       MSE  MSLE        MAPE  ...        ME        R2        EV  MPD  MGD
            0     SVM_3  2.650505  8.459088   0.0  159.159527  ...  5.282729 -2.930531  0.333730  NaN  NaN
            1     SVM_1  2.650505  8.459088   0.0  159.159527  ...  5.282729 -2.930531  0.333730  NaN  NaN
            2     SVM_2  2.326521  6.218464   0.0   90.629648  ...  3.776410 -6.987358 -0.034968  NaN  NaN
            3     SVM_0  2.326521  6.218464   0.0   90.629648  ...  3.776410 -6.987358 -0.034968  NaN  NaN
            4     SVM_7  2.650505  8.459088   0.0  159.159527  ...  5.282729 -2.930531  0.333730  NaN  NaN
            5     SVM_4  2.326521  6.218464   0.0   90.629648  ...  3.776410 -6.987358 -0.034968  NaN  NaN
            6     SVM_6  2.326521  6.218464   0.0   90.629648  ...  3.776410 -6.987358 -0.034968  NaN  NaN
            7     SVM_5  2.650505  8.459088   0.0  159.159527  ...  5.282729 -2.930531  0.333730  NaN  NaN
            8     SVM_9  2.650505  8.459088   0.0  159.159527  ...  5.282729 -2.930531  0.333730  NaN  NaN
            9    SVM_10  2.326521  6.218464   0.0   90.629648  ...  3.776410 -6.987358 -0.034968  NaN  NaN
            10   SVM_11  2.650505  8.459088   0.0  159.159527  ...  5.282729 -2.930531  0.333730  NaN  NaN
            11    SVM_8  2.326521  6.218464   0.0   90.629648  ...  3.776410 -6.987358 -0.034968  NaN  NaN


            >>> # View the best data ID, model ID and score.
            >>> print("Best data ID: ", gs_obj.best_data_id)
            Best data ID:  DF_0
            >>> print("Best model ID: ", gs_obj.best_model_id)
            Best model ID:  SVM_2
            >>> print("Best model score: ", gs_obj.best_score_)
            Best model score:  2.3265213466885375

            >>> # Perform prediction on sampled data using the best trained model.
            >>> test_data = transform_obj.result.iloc[:5]
            >>> gs_pred = gs_obj.predict(newdata=test_data, **eval_params)
            >>> print("Prediction result: \n", gs_pred.result)
            Prediction result:
                 id  prediction  MedHouseVal
            0   686   -0.214558        1.578
            1  2018    0.224954        0.578
            2  1754   -0.484374        1.651
            3   670   -0.288802        1.922
            4   244   -0.097476        1.117

            >>> # Perform evaluation using the best model.
            >>> gs_obj.evaluate()
            ############ result Output ############

                    MAE       MSE  MSLE       MAPE        MPE      RMSE  RMSLE       ME        R2        EV  MPD  MGD
            0  2.326521  6.218464   0.0  90.629648  90.629648  2.493685    0.0  3.77641 -6.987358 -0.034968  NaN  NaN


            >>> # Retrieve any trained model.
            >>> gs_obj.get_model("SVM_1")
            ############ output_data Output ############

               iterNum      loss       eta  bias
            0        3  2.078232  0.028868   0.0
            1        5  2.049456  0.022361   0.0
            2        6  2.037157  0.020412   0.0
            3        7  2.028186  0.018898   0.0
            4        9  2.012801  0.016667   0.0
            5       10  2.007469  0.015811   0.0
            6        8  2.020026  0.017678   0.0
            7        4  2.063343  0.025000   0.0
            8        2  2.092763  0.035355   0.0
            9        1  2.112669  0.050000   0.0


            ############ result Output ############

                               predictor   estimate       value
            attribute
            7                   Latitude   0.077697        None
            -9   Learning Rate (Initial)   0.050000        None
            -17             OneClass SVM        NaN       FALSE
            -14                  Epsilon   0.100000        None
            5                 Population  -0.120322        None
            -12                 Nesterov        NaN        TRUE
            -5                       BIC  50.583018        None
            -7                     Alpha   0.500000  Elasticnet
            -3    Number of Observations  31.000000        None
            0                (Intercept)   0.000000        None


            >>> # Update the default model.
            >>> gs_obj.set_model("SVM_1")

            >>> # Example 3: Model trainer function. Performing hyperparameter-tuning
            >>> # on the SVM model trainer function using labeled multiple dataframes.

            >>> # Initialize the GridSearch optimizer with the model trainer
            >>> # function and the parameter space required for model training.
            >>> gs_obj = GridSearch(func=SVM, params=params)

            >>> # Perform model optimization for the SVM function with
            >>> # labeled multiple-dataframe support.
            >>> # Evaluation and prediction arguments are passed along with
            >>> # the training dataframe.
            >>> gs_obj.fit(data={"Data-1":train_df_1, "Data-2":train_df_2}, **eval_params)

            >>> # View trained models.
            >>> gs_obj.models
               MODEL_ID DATA_ID                                         PARAMETERS STATUS       MAE
            0     SVM_1  Data-2  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.286463
            1     SVM_3  Data-2  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.286463
            2     SVM_2  Data-1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.156109
            3     SVM_0  Data-1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.156109
            4     SVM_7  Data-2  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.286463
            5     SVM_4  Data-1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.156109
            6     SVM_5  Data-2  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.286463
            7     SVM_6  Data-1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.156109
            8    SVM_10  Data-1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.156109
            9     SVM_8  Data-1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.156109
            10    SVM_9  Data-2  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.286463
            11   SVM_11  Data-2  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.286463

            >>> # View model evaluation stats.
            >>> gs_obj.model_stats
               MODEL_ID       MAE       MSE      MSLE        MAPE  ...        ME        R2        EV  MPD  MGD
            0     SVM_1  2.286463  5.721906  0.115319  120.188468  ...  3.280316 -3.436736  0.616960  NaN  NaN
            1     SVM_3  2.286463  5.721906  0.115319  120.188468  ...  3.280316 -3.436736  0.616960  NaN  NaN
            2     SVM_2  2.156109  6.986356  0.000000   97.766138  ...  4.737632 -2.195437 -0.235152  NaN  NaN
            3     SVM_0  2.156109  6.986356  0.000000   97.766138  ...  4.737632 -2.195437 -0.235152  NaN  NaN
            4     SVM_7  2.286463  5.721906  0.115319  120.188468  ...  3.280316 -3.436736  0.616960  NaN  NaN
            5     SVM_4  2.156109  6.986356  0.000000   97.766138  ...  4.737632 -2.195437 -0.235152  NaN  NaN
            6     SVM_5  2.286463  5.721906  0.115319  120.188468  ...  3.280316 -3.436736  0.616960  NaN  NaN
            7     SVM_6  2.156109  6.986356  0.000000   97.766138  ...  4.737632 -2.195437 -0.235152  NaN  NaN
            8    SVM_10  2.156109  6.986356  0.000000   97.766138  ...  4.737632 -2.195437 -0.235152  NaN  NaN
            9     SVM_8  2.156109  6.986356  0.000000   97.766138  ...  4.737632 -2.195437 -0.235152  NaN  NaN
            10    SVM_9  2.286463  5.721906  0.115319  120.188468  ...  3.280316 -3.436736  0.616960  NaN  NaN
            11   SVM_11  2.286463  5.721906  0.115319  120.188468  ...  3.280316 -3.436736  0.616960  NaN  NaN

            [12 rows x 13 columns]

            >>> # View the best data ID, model ID and score.
            >>> print("Best data ID: ", gs_obj.best_data_id)
            Best data ID:  Data-1
            >>> print("Best model ID: ", gs_obj.best_model_id)
            Best model ID:  SVM_2
            >>> print("Best model score: ", gs_obj.best_score_)
            Best model score:  2.156108718480682

            >>> # Perform prediction on sampled data using the best trained model.
            >>> test_data = transform_obj.result.iloc[:5]
            >>> gs_pred = gs_obj.predict(newdata=test_data, **eval_params)
            >>> print("Prediction result: \n", gs_pred.result)
            Prediction result:
                 id  prediction  MedHouseVal
            0   686   -0.512750        1.578
            1  2018    0.065364        0.578
            2  1754   -0.849449        1.651
            3   670   -0.657097        1.922
            4   244   -0.285946        1.117

            >>> # Perform evaluation using the best model.
            >>> gs_obj.evaluate()
            ############ result Output ############

                    MAE       MSE  MSLE       MAPE        MPE      RMSE  RMSLE        ME        R2        EV  MPD  MGD
            0  2.156109  6.986356   0.0  97.766138  83.453982  2.643172    0.0  4.737632 -2.195437 -0.235152  NaN  NaN

            >>> # Retrieve any trained model.
            >>> gs_obj.get_model("SVM_1")
            ############ output_data Output ############

               iterNum      loss       eta  bias
            0        3  2.238049  0.028868   0.0
            1        5  2.198618  0.022361   0.0
            2        6  2.183347  0.020412   0.0
            3        7  2.171550  0.018898   0.0
            4        9  2.154619  0.016667   0.0
            5       10  2.147124  0.015811   0.0
            6        8  2.162718  0.017678   0.0
            7        4  2.217790  0.025000   0.0
            8        2  2.257826  0.035355   0.0
            9        1  2.286324  0.050000   0.0


            ############ result Output ############

                               predictor   estimate                value
            attribute
            -7                     Alpha   0.500000           Elasticnet
            -3    Number of Observations  31.000000                 None
            5                 Population  -0.094141                 None
            0                (Intercept)   0.000000                 None
            -17             OneClass SVM        NaN                FALSE
            -16                   Kernel        NaN               LINEAR
            -1             Loss Function        NaN  EPSILON_INSENSITIVE
            7                   Latitude   0.169825                 None
            -9   Learning Rate (Initial)   0.050000                 None
            -14                  Epsilon   0.100000                 None

            >>> # Update the default model.
            >>> gs_obj.set_model("SVM_1")


            >>> # Example 4: Model trainer function. Performing hyperparameter-tuning
            >>> # on the SVM model trainer function by passing unlabeled
            >>> # multiple dataframes as a model hyperparameter.

            >>> # Define the parameter space for model training.
            >>> params = {"data":(train_df_1, train_df_2),
                          "input_columns":['MedInc', 'HouseAge', 'AveRooms',
                                           'AveBedrms', 'Population', 'AveOccup',
                                           'Latitude', 'Longitude'],
                          "response_column":"MedHouseVal",
                          "model_type":"regression",
                          "batch_size":(11, 50, 75),
                          "iter_max":(100, 301),
                          "lambda1":0.1,
                          "alpha":0.5,
                          "iter_num_no_change":60,
                          "tolerance":0.01,
                          "intercept":False,
                          "learning_rate":"INVTIME",
                          "initial_data":0.5,
                          "decay_rate":0.5,
                          "momentum":0.6,
                          "nesterov":True,
                          "local_sgd_iterations":1}

            >>> # Initialize the GridSearch optimizer with the model trainer
            >>> # function and the parameter space required for model training.
            >>> gs_obj = GridSearch(func=SVM, params=params)

            >>> # Perform model optimization for the SVM function with
            >>> # unlabeled multiple-dataframe support.
            >>> # Evaluation and prediction arguments are passed along with
            >>> # the training dataframe.
            >>> gs_obj.fit(**eval_params)

            >>> # View trained models.
            >>> gs_obj.models
               MODEL_ID DATA_ID                                         PARAMETERS STATUS       MAE
            0     SVM_0    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.007936
            1     SVM_1    DF_1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.517338
            2     SVM_3    DF_1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.517338
            3     SVM_2    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.007936
            4     SVM_5    DF_1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.517338
            5     SVM_7    DF_1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.517338
            6     SVM_6    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.007936
            7     SVM_4    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.007936
            8     SVM_9    DF_1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.517338
            9     SVM_8    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.007936
            10   SVM_11    DF_1  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.517338
            11   SVM_10    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS  2.007936

            >>> # View model evaluation stats.
            >>> gs_obj.model_stats
               MODEL_ID       MAE       MSE      MSLE        MAPE  ...        ME        R2        EV  MPD  MGD
            0     SVM_0  2.007936  5.402427  0.007669   88.199346  ...  3.981598 -6.898063 -1.003772  NaN  NaN
            1     SVM_1  2.517338  7.470182  0.000000  118.722467  ...  4.035658 -7.827958 -0.716572  NaN  NaN
            2     SVM_3  2.517338  7.470182  0.000000  118.722467  ...  4.035658 -7.827958 -0.716572  NaN  NaN
            3     SVM_2  2.007936  5.402427  0.007669   88.199346  ...  3.981598 -6.898063 -1.003772  NaN  NaN
            4     SVM_5  2.517338  7.470182  0.000000  118.722467  ...  4.035658 -7.827958 -0.716572  NaN  NaN
            5     SVM_7  2.517338  7.470182  0.000000  118.722467  ...  4.035658 -7.827958 -0.716572  NaN  NaN
            6     SVM_6  2.007936  5.402427  0.007669   88.199346  ...  3.981598 -6.898063 -1.003772  NaN  NaN
            7     SVM_4  2.007936  5.402427  0.007669   88.199346  ...  3.981598 -6.898063 -1.003772  NaN  NaN
            8     SVM_9  2.517338  7.470182  0.000000  118.722467  ...  4.035658 -7.827958 -0.716572  NaN  NaN
            9     SVM_8  2.007936  5.402427  0.007669   88.199346  ...  3.981598 -6.898063 -1.003772  NaN  NaN
            10   SVM_11  2.517338  7.470182  0.000000  118.722467  ...  4.035658 -7.827958 -0.716572  NaN  NaN
            11   SVM_10  2.007936  5.402427  0.007669   88.199346  ...  3.981598 -6.898063 -1.003772  NaN  NaN

            [12 rows x 13 columns]

            >>> # View the best data ID, model ID and score.
            >>> print("Best data ID: ", gs_obj.best_data_id)
            Best data ID:  DF_0
            >>> print("Best model ID: ", gs_obj.best_model_id)
            Best model ID:  SVM_0
            >>> print("Best model score: ", gs_obj.best_score_)
            Best model score:  2.0079362549355104

            >>> # Perform prediction on sampled data using the best trained model.
            >>> test_data = transform_obj.result.iloc[:5]
            >>> gs_pred = gs_obj.predict(newdata=test_data, **eval_params)
            >>> print("Prediction result: \n", gs_pred.result)
            Prediction result:
                 id  prediction  MedHouseVal
            0   686   -0.365955        1.578
            1  2018    0.411846        0.578
            2  1754   -0.634807        1.651
            3   670   -0.562927        1.922
            4   244   -0.169730        1.117
            >>> # Perform evaluation using the best model.
            >>> gs_obj.evaluate()
            ############ result Output ############

                    MAE       MSE      MSLE       MAPE        MPE      RMSE     RMSLE        ME        R2        EV  MPD  MGD
            0  2.007936  5.402427  0.007669  88.199346  88.199346  2.324312  0.087574  3.981598 -6.898063 -1.003772  NaN  NaN


            >>> # Retrieve any trained model.
            >>> gs_obj.get_model("SVM_1")
            ############ output_data Output ############

               iterNum      loss       eta  bias
            0        3  2.154842  0.028868   0.0
            1        5  2.129916  0.022361   0.0
            2        6  2.118539  0.020412   0.0
            3        7  2.107991  0.018898   0.0
            4        9  2.089022  0.016667   0.0
            5       10  2.080426  0.015811   0.0
            6        8  2.098182  0.017678   0.0
            7        4  2.142030  0.025000   0.0
            8        2  2.168233  0.035355   0.0
            9        1  2.186740  0.050000   0.0

            ############ result Output ############

                               predictor   estimate       value
            attribute
            7                   Latitude   0.010463        None
            -9   Learning Rate (Initial)   0.050000        None
            -17             OneClass SVM        NaN       FALSE
            -14                  Epsilon   0.100000        None
            5                 Population  -0.348591        None
            -12                 Nesterov        NaN        TRUE
            -5                       BIC  50.585888        None
            -7                     Alpha   0.500000  Elasticnet
            -3    Number of Observations  31.000000        None
            0                (Intercept)   0.000000        None


            >>> # Update the default model.
            >>> gs_obj.set_model("SVM_1")

            >>> # Example 5: Non-model trainer function. Performing GridSearch
            >>> # on the Antiselect non-model trainer function.
            >>> # Load the example dataset.
            >>> load_example_data("teradataml", "titanic")

            >>> # Create a teradataml DataFrame.
            >>> titanic = DataFrame.from_table("titanic")

            >>> # Define the non-model trainer function parameter space.
            >>> # Include the input data in the parameter space for a non-model trainer function.
            >>> params = {"data":titanic, "exclude":(
                              ['survived', 'name', 'age'],
                              ["ticket", "parch", "sex", "age"])}

            >>> # Import the non-model trainer function and optimizer.
            >>> from teradataml import Antiselect, GridSearch

            >>> # Initialize the GridSearch optimizer with the non-model trainer
            >>> # function and the parameter space required for execution.
            >>> gs_obj = GridSearch(func=Antiselect, params=params)

            >>> # Perform execution of the Antiselect function.
            >>> gs_obj.fit()

            >>> # View trained models.
            >>> gs_obj.models
                   MODEL_ID                                         PARAMETERS STATUS
            0  ANTISELECT_1  {'data': '"titanic"', 'exclude': ['ticket', 'p...   PASS
            1  ANTISELECT_0  {'data': '"titanic"', 'exclude': ['survived', ...   PASS

            >>> # Retrieve any trained model using "MODEL_ID".
            >>> gs_obj.get_model("ANTISELECT_1")
            ############ result Output ############

               passenger  survived  pclass                                                name  sibsp      fare cabin embarked
            0        162         1       2  Watt, Mrs. James (Elizabeth "Bessie" Inglis Milne)      0   15.7500  None        S
            1        591         0       3                                Rintamaki, Mr. Matti      0    7.1250  None        S
            2        387         0       3                     Goodwin, Master. Sidney Leonard      5   46.9000  None        S
            3        469         0       3                                  Scanlan, Mr. James      0    7.7250  None        Q
            4        326         1       1                            Young, Miss. Marie Grice      0  135.6333   C32        C
            5        265         0       3                                  Henry, Miss. Delia      0    7.7500  None        Q
            6        530         0       2                         Hocking, Mr. Richard George      2   11.5000  None        S
            7        244         0       3                       Maenpaa, Mr. Matti Alexanteri      0    7.1250  None        S
            8         61         0       3                               Sirayanian, Mr. Orsen      0    7.2292  None        C
            9        122         0       3                          Moore, Mr. Leonard Charles      0    8.0500  None        S

        """

        self.__params = params.copy()
        super().__init__(func=func, params=self.__params)
        # Populate the parameter grid from the provided parameter space.
        self.__populate_params_grid()


    def __populate_params_grid(self):
        """
        DESCRIPTION:
            Populate the parameter grid based on the search algorithm. GridSearch
            populates all combinations of parameters.

        PARAMETERS:
            None

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> self.__populate_params_grid()
        """
        # Populate all parameter combinations for the given "params",
        # since GridSearch works on all parameter combinations. Set
        # all the parameter combinations on the parameter grid.
        self._parameter_grid = self._BaseSearch__populate_parameter_grid()

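The call above reaches the base class's double-underscore method through its name-mangled form, `_BaseSearch__populate_parameter_grid`, because a plain `self.__populate_parameter_grid()` written inside GridSearch would mangle to `_GridSearch__populate_parameter_grid` and miss the base-class definition. A small standalone sketch of that mechanism, with illustrative class and method names:

```python
class Base:
    def __make_grid(self):          # stored on Base as _Base__make_grid
        return ["combo-1", "combo-2"]

class Child(Base):
    def build(self):
        # self.__make_grid() here would look up _Child__make_grid and fail,
        # so the base-class copy is called through its mangled name.
        return self._Base__make_grid()

print(Child().build())  # ['combo-1', 'combo-2']
```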
|
|
3211
|
+
def fit(self,
|
|
3212
|
+
data=None,
|
|
3213
|
+
evaluation_metric=None,
|
|
3214
|
+
early_stop=None,
|
|
3215
|
+
frac=0.8,
|
|
3216
|
+
run_parallel=True,
|
|
3217
|
+
wait=True,
|
|
3218
|
+
verbose=0,
|
|
3219
|
+
stratify_column=None,
|
|
3220
|
+
sample_id_column=None,
|
|
3221
|
+
sample_seed=None,
|
|
3222
|
+
max_time=None,
|
|
3223
|
+
**kwargs):
|
|
3224
|
+
"""
|
|
3225
|
+
DESCRIPTION:
|
|
3226
|
+
Function to perform hyperparameter tuning using GridSearch algorithm.
|
|
3227
|
+
Notes:
|
|
3228
|
+
* In the Model trainer function, the best parameters are
|
|
3229
|
+
selected based on training results.
|
|
3230
|
+
* In the Non model trainer function, First execution parameter
|
|
3231
|
+
set is selected as the best parameters.
|
|
3232
|
+
|
|
3233
|
+
PARAMETERS:
|
|
3234
|
+
data:
|
|
3235
|
+
Optional Argument.
|
|
3236
|
+
Specifies the input teradataml DataFrame for model trainer function.
|
|
3237
|
+
Notes:
|
|
3238
|
+
* DataFrame need not to be passed in fit() methods, when "data" is
|
|
3239
|
+
passed as a model hyperparameters ("params").
|
|
3240
|
+
* "data" is a required argument for model trainer functions.
|
|
3241
|
+
* "data" is ignored for non-model trainer functions.
|
|
3242
|
+
* "data" can be contain single DataFrame or multiple DataFrame.
|
|
3243
|
+
* One can pass multiple dataframes to "data". Hyperparameter
|
|
3244
|
+
tuning is performed on all the dataframes for every model
|
|
3245
|
+
parameter.
|
|
3246
|
+
* "data" can be either a dictionary OR a tuple OR a dataframe.
|
|
3247
|
+
* If it is a dictionary then Key represents the label for
|
|
3248
|
+
dataframe and Value represents the dataframe.
|
|
3249
|
+
* If it is a tuple then teradataml converts it to dictionary
|
|
3250
|
+
by generating the labels internally.
|
|
3251
|
+
* If it is a dataframe then teradataml label it as "DF_0".
|
|
3252
|
+
Types: teradataml DataFrame, dictionary, tuples
|
|
3253
|
+
|
|
3254
|
+
evaluation_metric:
|
|
3255
|
+
Optional Argument.
|
|
3256
|
+
Specifies the evaluation metrics to considered for model
|
|
3257
|
+
evaluation.
|
|
3258
|
+
Notes:
|
|
3259
|
+
* evaluation_metric applicable for model trainer functions.
|
|
3260
|
+
* Best model is not selected when evaluation returns
|
|
3261
|
+
non-finite values.
|
|
3262
|
+
* MPD, MGD, RMSE, RMSLE are not supported for OpenSourceML models.
|
|
3263
|
+
Permitted Values:
|
|
3264
|
+
* Classification: Accuracy, Micro-Precision, Micro-Recall,
|
|
3265
|
+
Micro-F1, Macro-Precision, Macro-Recall,
|
|
3266
|
+
Macro-F1, Weighted-Precision,
|
|
3267
|
+
Weighted-Recall,
|
|
3268
|
+
Weighted-F1.
|
|
3269
|
+
* Regression: MAE, MSE, MSLE, MAPE, MPE, RMSE, RMSLE, ME,
|
|
3270
|
+
R2, EV, MPD, MGD
|
|
3271
|
+
|
|
3272
|
+
Default Value:
|
|
3273
|
+
* Classification: Accuracy
|
|
3274
|
+
* Regression: MAE
|
|
3275
|
+
Types: str
|
|
3276
|
+
|
|
3277
|
+
early_stop:
|
|
3278
|
+
Optional Argument.
|
|
3279
|
+
Specifies the early stop mechanism value for model trainer
|
|
3280
|
+
functions. Hyperparameter tuning ends model training when
|
|
3281
|
+
the training model evaluation metric attains "early_stop" value.
|
|
3282
|
+
Note:
|
|
3283
|
+
* Early stopping supports only when evaluation returns
|
|
3284
|
+
finite value.
|
|
3285
|
+
Types: int or float
|
|
3286
|
+
|
|
3287
|
+
frac:
|
|
3288
|
+
Optional Argument.
|
|
3289
|
+
Specifies the split percentage of rows to be sampled for training
|
|
3290
|
+
and testing dataset. "frac" argument value must range between (0, 1).
|
|
3291
|
+
Notes:
|
|
3292
|
+
* This "frac" argument is not supported for non-model trainer
|
|
3293
|
+
function.
|
|
3294
|
+
* The "frac" value is considered as train split percentage and
|
|
3295
|
+
The remaining percentage is taken into account for test splitting.
|
|
3296
|
+
Default Value: 0.8
|
|
3297
|
+
Types: float
|
|
3298
|
+
|
|
3299
|
+
run_parallel:
|
|
3300
|
+
Optional Argument.
|
|
3301
|
+
Specifies the parallel execution functionality of hyperparameter
|
|
3302
|
+
tuning. When "run_parallel" set to true, model functions are
|
|
3303
|
+
executed concurrently. Otherwise, model functions are executed
|
|
3304
|
+
sequentially.
|
|
3305
|
+
Default Value: True
|
|
3306
|
+
Types: bool
|
|
3307
|
+
|
|
3308
|
+
wait:
|
|
3309
|
+
Optional Argument.
|
|
3310
|
+
Specifies whether to wait for the completion of execution
|
|
3311
|
+
of hyperparameter tuning or not. When set to False, hyperparameter
|
|
3312
|
+
tuning is executed in the background and user can use "is_running()"
|
|
3313
|
+
method to check the status. Otherwise it waits until the execution
|
|
3314
|
+
is complete to return the control back to user.
|
|
3315
|
+
Default Value: True
|
|
3316
|
+
Type: bool
|
|
3317
|
+
|
|
3318
|
+
verbose:
|
|
3319
|
+
Optional Argument.
|
|
3320
|
+
Specifies whether to log the model training information and display
|
|
3321
|
+
the logs. When it is set to 1, progress bar alone logged in the
|
|
3322
|
+
console. When it is set to 2, along with progress bar, execution
|
|
3323
|
+
steps and execution time is logged in the console. When it is set
|
|
3324
|
+
to 0, nothing is logged in the console.
|
|
3325
|
+
Note:
|
|
3326
|
+
* verbose is not significant when "wait" is 'False'.
|
|
3327
|
+
Default Value: 0
|
|
3328
|
+
Type: bool
|
|
3329
|
+
|
|
3330
|
+
sample_seed:
|
|
3331
|
+
Optional Argument.
|
|
3332
|
+
Specifies the seed value that controls the shuffling applied
|
|
3333
|
+
to the data before applying the Train-Test split. Pass an int for
|
|
3334
|
+
reproducible output across multiple function calls.
|
|
3335
|
+
Notes:
|
|
3336
|
+
* When the argument is not specified, different
|
|
3337
|
+
runs of the query generate different outputs.
|
|
3338
|
+
* It must be in the range [0, 2147483647]
|
|
3339
|
+
* Seed is supported for stratify column.
|
|
3340
|
+
Types: int
|
|
3341
|
+
|
|
3342
|
+
stratify_column:
|
|
3343
|
+
Optional Argument.
|
|
3344
|
+
                Specifies the column name that contains the labels indicating
                which data needs to be stratified for the train-test split.
                Notes:
                    * "sample_seed" is supported for the stratify column.
                Types: str

            sample_id_column:
                Optional Argument.
                Specifies the input data column name that has the
                unique identifier for each row in the input.
                Note:
                    * Mandatory when "sample_seed" argument is present.
                Types: str

            max_time:
                Optional Argument.
                Specifies the maximum time, in seconds, for the completion of
                hyperparameter tuning execution.
                Default Value: None
                Types: int or float

            kwargs:
                Optional Argument.
                Specifies the keyword arguments. Accepts additional arguments
                required for the teradataml analytic function.

        RETURNS:
            None

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLES:
            >>> # Create an instance of the GridSearch algorithm called "optimizer_obj".
            >>> optimizer_obj = GridSearch(func=SVM, params=params)

            >>> eval_params = {"id_column": "id",
            ...                "accumulate": "MedHouseVal"}
            >>> # Example 1: Passing a single DataFrame for the model trainer function.
            >>> optimizer_obj.fit(data=train_df,
            ...                   evaluation_metric="MAE",
            ...                   early_stop=70.9,
            ...                   **eval_params)

            >>> # Example 2: Passing multiple datasets as a tuple of DataFrames
            >>> # for the model trainer function.
            >>> optimizer_obj.fit(data=(train_df_1, train_df_2),
            ...                   evaluation_metric="MAE",
            ...                   early_stop=70.9,
            ...                   **eval_params)

            >>> # Example 3: Passing multiple datasets as a dictionary of DataFrames
            >>> # for the model trainer function.
            >>> optimizer_obj.fit(data={"Data-1": train_df_1, "Data-2": train_df_2},
            ...                   evaluation_metric="MAE",
            ...                   early_stop=70.9,
            ...                   **eval_params)

            >>> # Example 4: No "data" argument passed in the fit() method for a
            >>> # model trainer function.
            >>> # Note: The "data" argument must be passed as a model hyperparameter
            >>> # while creating the HPT object.

            >>> # Define the parameter space for model training with the "data" argument.
            >>> params = {"data": (df1, df2),
            ...           "input_columns": ['MedInc', 'HouseAge', 'AveRooms',
            ...                             'AveBedrms', 'Population', 'AveOccup',
            ...                             'Latitude', 'Longitude'],
            ...           "response_column": "MedHouseVal",
            ...           "model_type": "regression",
            ...           "batch_size": (11, 50, 75),
            ...           "iter_max": (100, 301),
            ...           "intercept": False,
            ...           "learning_rate": "INVTIME",
            ...           "nesterov": True,
            ...           "local_sgd_iterations": 1}

            >>> # Create "optimizer_obj" using the GridSearch algorithm and call the
            >>> # fit() method without any "data" argument for the model trainer function.
            >>> optimizer_obj.fit(evaluation_metric="MAE",
            ...                   early_stop=70.9,
            ...                   **eval_params)

            >>> # Example 5: Do not pass the "data" argument in the fit() method for a
            >>> # non-model trainer function.
            >>> # Note: The "data" argument must be passed as a model hyperparameter
            >>> # while creating the HPT object.
            >>> optimizer_obj.fit()

            >>> # Example 6: Passing "verbose" argument value '1' in the fit() method
            >>> # to display the model log.
            >>> optimizer_obj.fit(data=train_df, evaluation_metric="R2",
            ...                   verbose=1, **eval_params)
            completed: |████████████████████████████████████████████████████████████| 100% - 6/6

            >>> # Example 7: "max_time" argument is passed in the fit() method.
            >>> # Model training parameters.
            >>> model_params = {"input_columns": ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
            ...                 "response_column": 'species',
            ...                 "max_depth": (5, 10, 15),
            ...                 "lambda1": (1000.0, 0.001),
            ...                 "model_type": "Classification",
            ...                 "seed": 32,
            ...                 "shrinkage_factor": 0.1,
            ...                 "iter_num": (5, 50)}
            >>>
            >>> eval_params = {"id_column": "id",
            ...                "accumulate": "species",
            ...                "model_type": 'Classification',
            ...                "object_order_column": ['task_index', 'tree_num', 'iter', 'class_num', 'tree_order']
            ...                }
            >>>
            >>> # Import the model trainer function and optimizer.
            >>> from teradataml import XGBoost, GridSearch
            >>>
            >>> # Initialize the GridSearch optimizer with the model trainer
            >>> # function and the parameter space required for model training.
            >>> gs_obj = GridSearch(func=XGBoost, params=model_params)
            >>>
            >>> # fit() method with the "max_time" argument (in seconds) for the model trainer function.
            >>> gs_obj.fit(data=data, max_time=30, verbose=2, **eval_params)
            Model_id:XGBOOST_2 - Run time:33.277s - Status:PASS - ACCURACY:0.933
            Model_id:XGBOOST_3 - Run time:33.276s - Status:PASS - ACCURACY:0.933
            Model_id:XGBOOST_0 - Run time:33.279s - Status:PASS - ACCURACY:0.967
            Model_id:XGBOOST_1 - Run time:33.278s - Status:PASS - ACCURACY:0.933
            Computing: |⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾| 33% - 4/12
            >>>
            >>> # Models that do not complete within "max_time" are marked with status 'SKIP'.
            >>> gs_obj.models
                  MODEL_ID DATA_ID                                         PARAMETERS STATUS  ACCURACY
            0    XGBOOST_2    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.933333
            1    XGBOOST_4    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            2    XGBOOST_5    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            3    XGBOOST_6    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            4    XGBOOST_7    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            5    XGBOOST_8    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            6    XGBOOST_9    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            7   XGBOOST_10    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            8   XGBOOST_11    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            9    XGBOOST_3    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.933333
            10   XGBOOST_0    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.966667
            11   XGBOOST_1    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.933333
        """

        # Set the flag to discard invalid column parameters.
        self.discard_invalid_column_params = kwargs.get("discard_invalid_column_params", False)

        if self.discard_invalid_column_params:
            # Set the model trainer input data.
            super()._setting_model_trainer_data(data)
            # Map the data for the model trainer function.
            super()._data_mapping()
            # Replace the data-setup hooks with no-op lambdas so the base
            # class does not repeat the work done above.
            self._setting_model_trainer_data = lambda data: None
            self._BaseSearch__update_model_parameters = lambda: None

        # Call the _BaseSearch fit() method.
        super().fit(data, evaluation_metric,
                    early_stop, frac, run_parallel,
                    wait, verbose, stratify_column,
                    sample_id_column, sample_seed,
                    max_time, **kwargs)

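# --- Editorial sketch (illustrative; not part of the teradataml API) --------
# The fit() docstring above notes that tuple-valued hyperparameters multiply
# into the number of trained models (e.g. "batch_size" with 3 values and
# "iter_max" with 2 values yield 6 models). The helper below is a minimal,
# hypothetical sketch of that Cartesian-product count; its name and placement
# are assumptions for illustration only.
def _example_grid_size(params):
    """Count the parameter combinations a grid search would explore,
    assuming every tuple value is expanded and all other values are fixed."""
    count = 1
    for value in params.values():
        if isinstance(value, tuple):
            count *= len(value)
    return count

# Usage: _example_grid_size({"batch_size": (11, 50, 75),
#                            "iter_max": (100, 301)}) returns 6, matching the
# "6 models" noted in the docstring examples. RandomSearch (below) then
# samples only n_iter of these combinations instead of exhausting them.

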
class RandomSearch(_BaseSearch):
    def __init__(self, func, params, n_iter=10, **kwargs):
        """
        DESCRIPTION:
            The RandomSearch algorithm performs random sampling on a
            hyperparameter space to identify optimal hyperparameters. It works
            for teradataml analytic functions from the SQLE, BYOM, VAL and UAF
            features. teradataml RandomSearch allows the user to perform
            hyperparameter tuning for all model trainer and non-model trainer
            functions.
            When used for model trainer functions:
                * Based on evaluation metrics, the search determines the best model.
                * All methods and properties can be used.
            When used for non-model trainer functions:
                * Only the fit() method is supported.
                * The user can choose the best output as they see fit.

            teradataml RandomSearch also allows the user to use input data as a
            hyperparameter. This option can be suitable when the user wants to
            identify the best models for a set of input data. When the user
            passes a set of data as a hyperparameter for a model trainer
            function, the search determines the best data along with the best
            model based on the evaluation metrics.
            Note:
                * configure.temp_object_type="VT" follows sequential execution.

        PARAMETERS:
            func:
                Required Argument.
                Specifies a teradataml analytic function from SQLE, VAL, and UAF.
                Types:
                    teradataml Analytic Functions
                        * Advanced analytic functions
                        * UAF
                        * VAL
                    Refer to the display_analytic_functions() function for the
                    list of functions.

            params:
                Required Argument.
                Specifies the parameter(s) of a teradataml analytic function.
                The parameter(s) must be in a dictionary, where keys refer to
                the argument names and values refer to the argument values for
                the corresponding arguments.
                Notes:
                    * One can specify an argument value in a tuple to run HPT
                      with different arguments.
                    * Model trainer function arguments "id_column", "input_columns",
                      and "target_columns" must be passed in the fit() method.
                    * All required arguments of a non-model trainer function must
                      be passed while creating the RandomSearch object.
                Types: dict

            n_iter:
                Optional Argument.
                Specifies the number of iterations random search needs to perform.
                Note:
                    * "n_iter" must be less than or equal to the size of the
                      parameter population.
                Default Value: 10
                Types: int

        RETURNS:
            None

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLES:
            >>> # Example 1: Model trainer function. Performing hyperparameter
            >>> # tuning on the SVM model trainer function using the random
            >>> # search algorithm.

            >>> # Load the example data.
            >>> load_example_data("teradataml", ["cal_housing_ex_raw"])

            >>> # Create teradataml DataFrame objects.
            >>> data_input = DataFrame.from_table("cal_housing_ex_raw")

            >>> # Scale "target_columns" with respect to the 'STD' value of the column.
            >>> fit_obj = ScaleFit(data=data_input,
            ...                    target_columns=['MedInc', 'HouseAge', 'AveRooms',
            ...                                    'AveBedrms', 'Population', 'AveOccup',
            ...                                    'Latitude', 'Longitude'],
            ...                    scale_method="STD")

            >>> # Transform the data.
            >>> transform_obj = ScaleTransform(data=data_input,
            ...                                object=fit_obj.output,
            ...                                accumulate=["id", "MedHouseVal"])

            >>> # Define the parameter space for model training.
            >>> # Note: These parameters create 6 models based on "batch_size" and "iter_max".
            >>> params = {"input_columns": ['MedInc', 'HouseAge', 'AveRooms',
            ...                             'AveBedrms', 'Population', 'AveOccup',
            ...                             'Latitude', 'Longitude'],
            ...           "response_column": "MedHouseVal",
            ...           "model_type": "regression",
            ...           "batch_size": (11, 50, 75),
            ...           "iter_max": (100, 301),
            ...           "lambda1": 0.1,
            ...           "alpha": 0.5,
            ...           "iter_num_no_change": 60,
            ...           "tolerance": 0.01,
            ...           "intercept": False,
            ...           "learning_rate": "INVTIME",
            ...           "initial_data": 0.5,
            ...           "decay_rate": 0.5,
            ...           "momentum": 0.6,
            ...           "nesterov": True,
            ...           "local_sgd_iterations": 1}

            >>> # Import the trainer function and optimizer.
            >>> from teradataml import SVM, RandomSearch

            >>> # Initialize the random search optimizer with the model trainer
            >>> # function and the parameter space required for model training.
            >>> rs_obj = RandomSearch(func=SVM, params=params, n_iter=3)

            >>> # Perform model optimization for the SVM function.
            >>> # Evaluation and prediction arguments are passed along with
            >>> # the training DataFrame.
            >>> rs_obj.fit(data=transform_obj.result, evaluation_metric="R2",
            ...            id_column="id", verbose=1)
            completed: |████████████████████████████████████████████████████████████| 100% - 3/3
            >>> # View the trained models.
            >>> rs_obj.models
              MODEL_ID DATA_ID                                         PARAMETERS STATUS        R2
            0    SVM_2    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS -3.668091
            1    SVM_1    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS -3.668091
            2    SVM_0    DF_0  {'input_columns': ['MedInc', 'HouseAge', 'AveR...   PASS -3.668091

            >>> # View the model evaluation stats.
            >>> rs_obj.model_stats
              MODEL_ID       MAE       MSE  MSLE        MAPE  ...        ME        R2        EV  MPD  MGD
            0    SVM_2  2.354167  6.715689   0.0  120.054758  ...  3.801619 -3.668091  0.184238  NaN  NaN
            1    SVM_1  2.354167  6.715689   0.0  120.054758  ...  3.801619 -3.668091  0.184238  NaN  NaN
            2    SVM_0  2.354167  6.715689   0.0  120.054758  ...  3.801619 -3.668091  0.184238  NaN  NaN

            [3 rows x 13 columns]

            >>> # Perform prediction on sampled data using the best trained model.
            >>> test_data = transform_obj.result.iloc[:5]
            >>> rs_pred = rs_obj.predict(newdata=test_data, id_column="id")
            >>> print("Prediction result: \n", rs_pred.result)
            Prediction result:
                 id  prediction
            0   686   -0.024033
            1  2018   -0.069738
            2  1754   -0.117881
            3   670   -0.021818
            4   244   -0.187346

            >>> # Perform evaluation using the best model.
            >>> rs_obj.evaluate()
            ############ result Output ############

                    MAE       MSE  MSLE        MAPE         MPE      RMSE  RMSLE        ME        R2        EV  MPD  MGD
            0  2.354167  6.715689   0.0  120.054758  120.054758  2.591465    0.0  3.801619 -3.668091  0.184238  NaN  NaN

            >>> # Retrieve any trained model.
            >>> rs_obj.get_model("SVM_1")
            ############ output_data Output ############

               iterNum      loss       eta  bias
            0        3  2.012817  0.028868   0.0
            1        5  2.010455  0.022361   0.0
            2        6  2.009331  0.020412   0.0
            3        7  2.008276  0.018898   0.0
            4        9  2.006384  0.016667   0.0
            5       10  2.005518  0.015811   0.0
            6        8  2.007302  0.017678   0.0
            7        4  2.011636  0.025000   0.0
            8        2  2.014326  0.035355   0.0
            9        1  2.016398  0.050000   0.0

            ############ result Output ############

                                     predictor   estimate                value
            attribute
            -7                           Alpha   0.500000           Elasticnet
            -3          Number of Observations  55.000000                 None
             5                      Population   0.000000                 None
             0                     (Intercept)   0.000000                 None
            -17                   OneClass SVM        NaN                FALSE
            -16                         Kernel        NaN               LINEAR
            -1                   Loss Function        NaN  EPSILON_INSENSITIVE
             7                        Latitude  -0.076648                 None
            -9         Learning Rate (Initial)   0.050000                 None
            -14                        Epsilon   0.100000                 None

            >>> # View the best data ID, model ID, score and parameters.
            >>> print("Best data ID: ", rs_obj.best_data_id)
            Best data ID:  DF_0
            >>> print("Best model ID: ", rs_obj.best_model_id)
            Best model ID:  SVM_2
            >>> print("Best model score: ", rs_obj.best_score_)
            Best model score:  -3.6680912444156455
            >>> print("Best model parameters: ", rs_obj.best_params_)
            Best model parameters:  {'input_columns': ['MedInc', 'HouseAge', 'AveRooms',
            'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'],
            'response_column': 'MedHouseVal', 'model_type': 'regression',
            'batch_size': 50, 'iter_max': 301, 'lambda1': 0.1, 'alpha': 0.5,
            'iter_num_no_change': 60, 'tolerance': 0.01, 'intercept': False,
            'learning_rate': 'INVTIME', 'initial_data': 0.5, 'decay_rate': 0.5,
            'momentum': 0.6, 'nesterov': True, 'local_sgd_iterations': 1,
            'data': '"ALICE"."ml__select__1696595493985650"'}

            >>> # Update the default model.
            >>> rs_obj.set_model("SVM_1")

            >>> # Example 2: Non-model trainer function. Performing random search
            >>> # on the Antiselect non-model trainer function using the random
            >>> # search algorithm.

            >>> # Load the example dataset.
            >>> load_example_data("teradataml", "titanic")

            >>> # Create a teradataml DataFrame.
            >>> titanic = DataFrame.from_table("titanic")

            >>> # Define the non-model trainer function parameter space.
            >>> # Include the input data in the parameter space for the non-model
            >>> # trainer function.
            >>> # Note: These parameters create nine parameter combinations,
            >>> # one per "exclude" value.
            >>> params = {"data": titanic, "exclude": (['survived', 'age'], ['age'],
            ...                                        ['survived', 'name', 'age'],
            ...                                        ['ticket'], ['parch'], ['sex', 'age'],
            ...                                        ['survived'], ['ticket', 'parch'],
            ...                                        ["ticket", "parch", "sex", "age"])}

            >>> # Import the non-model trainer function and optimizer.
            >>> from teradataml import Antiselect, RandomSearch

            >>> # Initialize the random search optimizer with the non-model trainer
            >>> # function and the parameter space required for non-model training.
            >>> rs_obj = RandomSearch(func=Antiselect, params=params, n_iter=4)

            >>> # Perform execution of the Antiselect function.
            >>> rs_obj.fit()

            >>> # Note: Since it is a non-model trainer function, model ID, score
            >>> # and parameters are not applicable here.
            >>> # View the trained models.
            >>> rs_obj.models
                   MODEL_ID                                         PARAMETERS STATUS
            0  ANTISELECT_1  {'data': '"titanic"', 'exclude': ['survived', ...   PASS
            1  ANTISELECT_3  {'data': '"titanic"', 'exclude': ['ticket', 'p...   PASS
            2  ANTISELECT_2     {'data': '"titanic"', 'exclude': ['survived']}   PASS
            3  ANTISELECT_0   {'data': '"titanic"', 'exclude': ['sex', 'age']}   PASS

            >>> # Retrieve any trained model using "MODEL_ID".
            >>> rs_obj.get_model("ANTISELECT_0")
            ############ result Output ############

               passenger  survived  pclass                                                name  sibsp  parch             ticket      fare cabin embarked
            0        162         1       2  Watt, Mrs. James (Elizabeth "Bessie" Inglis Milne)      0      0         C.A. 33595   15.7500  None        S
            1        591         0       3                                Rintamaki, Mr. Matti      0      0  STON/O 2. 3101273    7.1250  None        S
            2        387         0       3                     Goodwin, Master. Sidney Leonard      5      2            CA 2144   46.9000  None        S
            3        469         0       3                                  Scanlan, Mr. James      0      0              36209    7.7250  None        Q
            4        326         1       1                            Young, Miss. Marie Grice      0      0           PC 17760  135.6333   C32        C
            5        265         0       3                                  Henry, Miss. Delia      0      0             382649    7.7500  None        Q
            6        530         0       2                         Hocking, Mr. Richard George      2      1              29104   11.5000  None        S
            7        244         0       3                       Maenpaa, Mr. Matti Alexanteri      0      0  STON/O 2. 3101275    7.1250  None        S
            8         61         0       3                               Sirayanian, Mr. Orsen      0      0               2669    7.2292  None        C
            9        122         0       3                          Moore, Mr. Leonard Charles      0      0          A4. 54510    8.0500  None        S
        """

        self.__params = params.copy()
        super().__init__(func=func, params=self.__params)
        # Validate argument 'n_iter'.
        awu_matrix = []
        awu_matrix.append(["n_iter", n_iter, True, int])
        _Validators._validate_positive_int(n_iter, "n_iter")
        self.set_parameter_grid()
        parameter_space = self.get_parameter_grid()
        # Validate that "n_iter" is greater than or equal to 1 and
        # less than or equal to the size of the parameter space.
        _Validators._validate_argument_range(n_iter, "n_iter", 1, len(parameter_space), True, True)
        self._n_iter = n_iter

    def __populate_params_grid(self):
        """
        DESCRIPTION:
            Populate the parameter grid based on the search algorithm. In random
            search, random selection is performed on the given hyperparameters.

        PARAMETERS:
            None.

        RETURNS:
            None

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> self.__populate_params_grid()
        """
        # Populate the parameter space with random, non-repetitive values.
        if self.discard_invalid_column_params:
            # Define an empty data_grouped_dict to group the parameters by data_id.
            data_grouped_dict = defaultdict(list)
            for parameter in self._parameter_grid:
                # Extract the data_id from the parameter.
                data_id = parameter['data_id']
                # Group the parameters based on data_id.
                data_grouped_dict[data_id].append(parameter)
            # Convert the grouped dictionary to a list of groups.
            data_grouped_dict = list(data_grouped_dict.values())
            parameter_grid = []
            for group in data_grouped_dict:
                # Randomly select n_iter parameters from the grouped data.
                tmp = random.sample(group, self._n_iter)
                parameter_grid.extend(tmp)

            # Set the parameter grid.
            self._parameter_grid = parameter_grid
        else:
            self._parameter_grid = random.sample(self.get_parameter_grid(), self._n_iter)

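    # Editorial note (illustrative): when "discard_invalid_column_params" is
    # set, the sampling above is done per data group rather than over the whole
    # grid. For example, if self._parameter_grid holds combinations for
    # data_ids "DF_0" and "DF_1" and n_iter=3, random.sample() draws 3
    # combinations from each group, leaving 6 entries in the final grid;
    # otherwise a single draw of n_iter combinations is taken from the full
    # parameter space.
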
    def fit(self,
            data=None,
            evaluation_metric=None,
            early_stop=None,
            frac=0.8,
            run_parallel=True,
            wait=True,
            verbose=0,
            stratify_column=None,
            sample_id_column=None,
            sample_seed=None,
            max_time=None,
            **kwargs):
        """
        DESCRIPTION:
            Function to perform hyperparameter tuning using the RandomSearch
            algorithm.
            Notes:
                * For a model trainer function, the best parameters are
                  selected based on training results.
                * For a non-model trainer function, the first executed
                  parameter set is selected as the best parameters.

        PARAMETERS:
            data:
                Optional Argument.
                Specifies the input teradataml DataFrame for the model trainer
                function.
                Notes:
                    * A DataFrame need not be passed in the fit() method when
                      "data" is passed as a model hyperparameter ("params").
                    * "data" is a required argument for model trainer functions.
                    * "data" is ignored for non-model trainer functions.
                    * "data" can contain a single DataFrame or multiple DataFrames.
                    * One can pass multiple DataFrames to "data". Hyperparameter
                      tuning is performed on all the DataFrames for every model
                      parameter.
                    * "data" can be either a dictionary, a tuple, or a DataFrame.
                        * If it is a dictionary, the key represents the label for
                          a DataFrame and the value represents the DataFrame.
                        * If it is a tuple, teradataml converts it to a dictionary
                          by generating the labels internally.
                        * If it is a DataFrame, teradataml labels it as "DF_0".
                Types: teradataml DataFrame, dict, tuple

            evaluation_metric:
                Optional Argument.
                Specifies the evaluation metric to be considered for model
                evaluation.
                Notes:
                    * "evaluation_metric" is applicable only for model trainer
                      functions.
                    * The best model is not selected when evaluation returns
                      non-finite values.
                    * MPD, MGD, RMSE, RMSLE are not supported for OpenSourceML
                      models.
                Permitted Values:
                    * Classification: Accuracy, Micro-Precision, Micro-Recall,
                                      Micro-F1, Macro-Precision, Macro-Recall,
                                      Macro-F1, Weighted-Precision,
                                      Weighted-Recall, Weighted-F1
                    * Regression: MAE, MSE, MSLE, MAPE, MPE, RMSE, RMSLE, ME,
                                  R2, EV, MPD, MGD
                Default Value:
                    * Classification: Accuracy
                    * Regression: MAE
                Types: str

            early_stop:
                Optional Argument.
                Specifies the early-stop threshold for model trainer functions.
                Hyperparameter tuning stops training models when a trained
                model's evaluation metric attains the "early_stop" value.
                Note:
                    * Early stopping is supported only when evaluation returns
                      a finite value.
                Types: int or float

            frac:
                Optional Argument.
                Specifies the fraction of rows to be sampled for the training
                and testing datasets. The "frac" argument value must be in the
                range (0, 1).
                Notes:
                    * The "frac" argument is not supported for non-model trainer
                      functions.
                    * The "frac" value is the train split percentage; the
                      remaining percentage is used for the test split.
                Default Value: 0.8
                Types: float

            run_parallel:
                Optional Argument.
                Specifies the parallel execution functionality of hyperparameter
                tuning. When "run_parallel" is set to True, model functions are
                executed concurrently. Otherwise, model functions are executed
                sequentially.
                Default Value: True
                Types: bool

            wait:
                Optional Argument.
                Specifies whether to wait for the completion of the
                hyperparameter tuning execution. When set to False,
                hyperparameter tuning is executed in the background and the
                user can use the is_running() method to check the status.
                Otherwise, it waits until the execution is complete before
                returning control to the user.
                Default Value: True
                Types: bool

            verbose:
                Optional Argument.
                Specifies whether to log model training information and display
                the logs. When set to 1, only the progress bar is logged in the
                console. When set to 2, the execution steps and execution time
                are logged in the console along with the progress bar. When set
                to 0, nothing is logged in the console.
                Note:
                    * "verbose" is not significant when "wait" is False.
                Default Value: 0
                Types: int

            sample_seed:
                Optional Argument.
                Specifies the seed value that controls the shuffling applied
                to the data before the train-test split. Pass an int for
                reproducible output across multiple function calls.
                Notes:
                    * When the argument is not specified, different
                      runs of the query generate different outputs.
                    * It must be in the range [0, 2147483647].
                    * The seed is supported for the stratify column.
                Types: int

            stratify_column:
                Optional Argument.
                Specifies the column name that contains the labels indicating
                which data needs to be stratified for the train-test split.
                Notes:
                    * "sample_seed" is supported for the stratify column.
                Types: str

            sample_id_column:
                Optional Argument.
                Specifies the input data column name that has the
                unique identifier for each row in the input.
                Note:
                    * Mandatory when "sample_seed" argument is present.
                Types: str

            max_time:
                Optional Argument.
                Specifies the maximum time, in seconds, for the completion of
                hyperparameter tuning execution.
                Default Value: None
                Types: int or float

            kwargs:
                Optional Argument.
                Specifies the keyword arguments. Accepts additional arguments
                required for the teradataml analytic function.

        RETURNS:
            None

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLES:
            >>> # Create an instance of the RandomSearch algorithm called "optimizer_obj".
            >>> optimizer_obj = RandomSearch(func=SVM, params=params, n_iter=3)

            >>> eval_params = {"id_column": "id",
            ...                "accumulate": "MedHouseVal"}
            >>> # Example 1: Passing a single DataFrame for the model trainer function.
            >>> optimizer_obj.fit(data=train_df,
            ...                   evaluation_metric="MAE",
            ...                   early_stop=70.9,
            ...                   **eval_params)

            >>> # Example 2: Passing multiple datasets as a tuple of DataFrames
            >>> # for the model trainer function.
            >>> optimizer_obj.fit(data=(train_df_1, train_df_2),
            ...                   evaluation_metric="MAE",
            ...                   early_stop=70.9,
            ...                   **eval_params)

            >>> # Example 3: Passing multiple datasets as a dictionary of DataFrames
            >>> # for the model trainer function.
            >>> optimizer_obj.fit(data={"Data-1": train_df_1, "Data-2": train_df_2},
            ...                   evaluation_metric="MAE",
            ...                   early_stop=70.9,
            ...                   **eval_params)

            >>> # Example 4: No "data" argument passed in the fit() method for a
            >>> # model trainer function.
            >>> # Note: The "data" argument must be passed as a model hyperparameter
            >>> # while creating the HPT object.

            >>> # Define the parameter space for model training with the "data" argument.
            >>> params = {"data": (df1, df2),
            ...           "input_columns": ['MedInc', 'HouseAge', 'AveRooms',
            ...                             'AveBedrms', 'Population', 'AveOccup',
            ...                             'Latitude', 'Longitude'],
            ...           "response_column": "MedHouseVal",
            ...           "model_type": "regression",
            ...           "batch_size": (11, 50, 75),
            ...           "iter_max": (100, 301),
            ...           "intercept": False,
            ...           "learning_rate": "INVTIME",
            ...           "nesterov": True,
            ...           "local_sgd_iterations": 1}

            >>> # Create "optimizer_obj" using the RandomSearch algorithm and call the
            >>> # fit() method without any "data" argument for the model trainer function.
            >>> optimizer_obj.fit(evaluation_metric="MAE",
            ...                   early_stop=70.9,
            ...                   **eval_params)

            >>> # Example 5: Do not pass the "data" argument in the fit() method for a
            >>> # non-model trainer function.
            >>> # Note: The "data" argument must be passed as a model hyperparameter
            >>> # while creating the HPT object.
            >>> optimizer_obj.fit()

            >>> # Example 6: Passing "verbose" argument value '1' in the fit() method
            >>> # to display the model log.
            >>> optimizer_obj.fit(data=train_df, evaluation_metric="R2",
            ...                   verbose=1, **eval_params)
            completed: |████████████████████████████████████████████████████████████| 100% - 6/6

            >>> # Example 7: "max_time" argument is passed in the fit() method.
            >>> # Model training parameters.
            >>> model_params = {"input_columns": ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
            ...                 "response_column": 'species',
            ...                 "max_depth": (5, 10, 15),
            ...                 "lambda1": (1000.0, 0.001),
            ...                 "model_type": "Classification",
            ...                 "seed": 32,
            ...                 "shrinkage_factor": 0.1,
            ...                 "iter_num": (5, 50)}
            >>>
            >>> eval_params = {"id_column": "id",
            ...                "accumulate": "species",
            ...                "model_type": 'Classification',
            ...                "object_order_column": ['task_index', 'tree_num', 'iter', 'class_num', 'tree_order']
            ...                }
            >>>
            >>> # Import the model trainer function and optimizer.
            >>> from teradataml import XGBoost, RandomSearch
            >>>
            >>> # Initialize the RandomSearch optimizer with the model trainer
            >>> # function and the parameter space required for model training.
            >>> rs_obj = RandomSearch(func=XGBoost, params=model_params, n_iter=5)
            >>>
            >>> # fit() method with the "max_time" argument (in seconds) for the model trainer function.
            >>> rs_obj.fit(data=data, max_time=30, verbose=2, **eval_params)
            Model_id:XGBOOST_3 - Run time:28.292s - Status:PASS - ACCURACY:0.8
            Model_id:XGBOOST_0 - Run time:28.291s - Status:PASS - ACCURACY:0.867
            Model_id:XGBOOST_2 - Run time:28.289s - Status:PASS - ACCURACY:0.867
            Model_id:XGBOOST_1 - Run time:28.291s - Status:PASS - ACCURACY:0.867
            Computing: |⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾| 80% - 4/5
            >>>
            >>> # Models that do not complete within "max_time" are marked with status 'SKIP'.
            >>> rs_obj.models
                MODEL_ID DATA_ID                                         PARAMETERS STATUS  ACCURACY
            0  XGBOOST_3    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.800000
            1  XGBOOST_4    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
            2  XGBOOST_0    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.866667
            3  XGBOOST_2    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.866667
            4  XGBOOST_1    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.866667
        """

        # Set the flag to discard invalid column parameters.
        self.discard_invalid_column_params = kwargs.get("discard_invalid_column_params", False)

        if self.discard_invalid_column_params:
            # Set the model trainer input data.
            super()._setting_model_trainer_data(data)
            # Map the data with the input columns.
            super()._data_mapping()
            # Replace the data-setup hooks with no-op lambdas so the base
            # class does not repeat the work done above.
            self._setting_model_trainer_data = lambda data: None
            self._BaseSearch__update_model_parameters = lambda: None

        # Populate the parameter grid.
        self.__populate_params_grid()

        # Call the _BaseSearch fit() method.
        super().fit(data, evaluation_metric, early_stop,
                    frac, run_parallel, wait, verbose,
                    stratify_column, sample_id_column,
                    sample_seed, max_time, **kwargs)
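
    # Editorial note (illustrative): the Example 7 progress bars above show the
    # practical difference between the two searches. GridSearch exhausts the
    # full Cartesian product of tuple-valued arguments, here max_depth (3
    # values) x lambda1 (2) x iter_num (2) = 12 candidate models, while
    # RandomSearch with n_iter=5 samples only 5 of those 12 combinations
    # before fitting.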