teradataml 20.0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +2762 -0
- teradataml/__init__.py +78 -0
- teradataml/_version.py +11 -0
- teradataml/analytics/Transformations.py +2996 -0
- teradataml/analytics/__init__.py +82 -0
- teradataml/analytics/analytic_function_executor.py +2416 -0
- teradataml/analytics/analytic_query_generator.py +1050 -0
- teradataml/analytics/byom/H2OPredict.py +514 -0
- teradataml/analytics/byom/PMMLPredict.py +437 -0
- teradataml/analytics/byom/__init__.py +16 -0
- teradataml/analytics/json_parser/__init__.py +133 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +1805 -0
- teradataml/analytics/json_parser/json_store.py +191 -0
- teradataml/analytics/json_parser/metadata.py +1666 -0
- teradataml/analytics/json_parser/utils.py +805 -0
- teradataml/analytics/meta_class.py +236 -0
- teradataml/analytics/sqle/DecisionTreePredict.py +456 -0
- teradataml/analytics/sqle/NaiveBayesPredict.py +420 -0
- teradataml/analytics/sqle/__init__.py +128 -0
- teradataml/analytics/sqle/json/decisiontreepredict_sqle.json +78 -0
- teradataml/analytics/sqle/json/naivebayespredict_sqle.json +62 -0
- teradataml/analytics/table_operator/__init__.py +11 -0
- teradataml/analytics/uaf/__init__.py +82 -0
- teradataml/analytics/utils.py +828 -0
- teradataml/analytics/valib.py +1617 -0
- teradataml/automl/__init__.py +5835 -0
- teradataml/automl/autodataprep/__init__.py +493 -0
- teradataml/automl/custom_json_utils.py +1625 -0
- teradataml/automl/data_preparation.py +1384 -0
- teradataml/automl/data_transformation.py +1254 -0
- teradataml/automl/feature_engineering.py +2273 -0
- teradataml/automl/feature_exploration.py +1873 -0
- teradataml/automl/model_evaluation.py +488 -0
- teradataml/automl/model_training.py +1407 -0
- teradataml/catalog/__init__.py +2 -0
- teradataml/catalog/byom.py +1759 -0
- teradataml/catalog/function_argument_mapper.py +859 -0
- teradataml/catalog/model_cataloging_utils.py +491 -0
- teradataml/clients/__init__.py +0 -0
- teradataml/clients/auth_client.py +137 -0
- teradataml/clients/keycloak_client.py +165 -0
- teradataml/clients/pkce_client.py +481 -0
- teradataml/common/__init__.py +1 -0
- teradataml/common/aed_utils.py +2078 -0
- teradataml/common/bulk_exposed_utils.py +113 -0
- teradataml/common/constants.py +1669 -0
- teradataml/common/deprecations.py +166 -0
- teradataml/common/exceptions.py +147 -0
- teradataml/common/formula.py +743 -0
- teradataml/common/garbagecollector.py +666 -0
- teradataml/common/logger.py +1261 -0
- teradataml/common/messagecodes.py +518 -0
- teradataml/common/messages.py +262 -0
- teradataml/common/pylogger.py +67 -0
- teradataml/common/sqlbundle.py +764 -0
- teradataml/common/td_coltype_code_to_tdtype.py +48 -0
- teradataml/common/utils.py +3166 -0
- teradataml/common/warnings.py +36 -0
- teradataml/common/wrapper_utils.py +625 -0
- teradataml/config/__init__.py +0 -0
- teradataml/config/dummy_file1.cfg +5 -0
- teradataml/config/dummy_file2.cfg +3 -0
- teradataml/config/sqlengine_alias_definitions_v1.0 +14 -0
- teradataml/config/sqlengine_alias_definitions_v1.1 +20 -0
- teradataml/config/sqlengine_alias_definitions_v1.3 +19 -0
- teradataml/context/__init__.py +0 -0
- teradataml/context/aed_context.py +223 -0
- teradataml/context/context.py +1462 -0
- teradataml/data/A_loan.csv +19 -0
- teradataml/data/BINARY_REALS_LEFT.csv +11 -0
- teradataml/data/BINARY_REALS_RIGHT.csv +11 -0
- teradataml/data/B_loan.csv +49 -0
- teradataml/data/BuoyData2.csv +17 -0
- teradataml/data/CONVOLVE2_COMPLEX_LEFT.csv +5 -0
- teradataml/data/CONVOLVE2_COMPLEX_RIGHT.csv +5 -0
- teradataml/data/Convolve2RealsLeft.csv +5 -0
- teradataml/data/Convolve2RealsRight.csv +5 -0
- teradataml/data/Convolve2ValidLeft.csv +11 -0
- teradataml/data/Convolve2ValidRight.csv +11 -0
- teradataml/data/DFFTConv_Real_8_8.csv +65 -0
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/Mall_customer_data.csv +201 -0
- teradataml/data/Orders1_12mf.csv +25 -0
- teradataml/data/Pi_loan.csv +7 -0
- teradataml/data/SMOOTHED_DATA.csv +7 -0
- teradataml/data/TestDFFT8.csv +9 -0
- teradataml/data/TestRiver.csv +109 -0
- teradataml/data/Traindata.csv +28 -0
- teradataml/data/__init__.py +0 -0
- teradataml/data/acf.csv +17 -0
- teradataml/data/adaboost_example.json +34 -0
- teradataml/data/adaboostpredict_example.json +24 -0
- teradataml/data/additional_table.csv +11 -0
- teradataml/data/admissions_test.csv +21 -0
- teradataml/data/admissions_train.csv +41 -0
- teradataml/data/admissions_train_nulls.csv +41 -0
- teradataml/data/advertising.csv +201 -0
- teradataml/data/ageandheight.csv +13 -0
- teradataml/data/ageandpressure.csv +31 -0
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/antiselect_example.json +36 -0
- teradataml/data/antiselect_input.csv +8 -0
- teradataml/data/antiselect_input_mixed_case.csv +8 -0
- teradataml/data/applicant_external.csv +7 -0
- teradataml/data/applicant_reference.csv +7 -0
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/arima_example.json +9 -0
- teradataml/data/assortedtext_input.csv +8 -0
- teradataml/data/attribution_example.json +34 -0
- teradataml/data/attribution_sample_table.csv +27 -0
- teradataml/data/attribution_sample_table1.csv +6 -0
- teradataml/data/attribution_sample_table2.csv +11 -0
- teradataml/data/bank_churn.csv +10001 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bank_web_clicks1.csv +43 -0
- teradataml/data/bank_web_clicks2.csv +91 -0
- teradataml/data/bank_web_url.csv +85 -0
- teradataml/data/barrier.csv +2 -0
- teradataml/data/barrier_new.csv +3 -0
- teradataml/data/betweenness_example.json +14 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/bin_breaks.csv +8 -0
- teradataml/data/bin_fit_ip.csv +4 -0
- teradataml/data/binary_complex_left.csv +11 -0
- teradataml/data/binary_complex_right.csv +11 -0
- teradataml/data/binary_matrix_complex_left.csv +21 -0
- teradataml/data/binary_matrix_complex_right.csv +21 -0
- teradataml/data/binary_matrix_real_left.csv +21 -0
- teradataml/data/binary_matrix_real_right.csv +21 -0
- teradataml/data/blood2ageandweight.csv +26 -0
- teradataml/data/bmi.csv +501 -0
- teradataml/data/boston.csv +507 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/buoydata_mix.csv +11 -0
- teradataml/data/burst_data.csv +5 -0
- teradataml/data/burst_example.json +21 -0
- teradataml/data/byom_example.json +34 -0
- teradataml/data/bytes_table.csv +4 -0
- teradataml/data/cal_housing_ex_raw.csv +70 -0
- teradataml/data/callers.csv +7 -0
- teradataml/data/calls.csv +10 -0
- teradataml/data/cars_hist.csv +33 -0
- teradataml/data/cat_table.csv +25 -0
- teradataml/data/ccm_example.json +32 -0
- teradataml/data/ccm_input.csv +91 -0
- teradataml/data/ccm_input2.csv +13 -0
- teradataml/data/ccmexample.csv +101 -0
- teradataml/data/ccmprepare_example.json +9 -0
- teradataml/data/ccmprepare_input.csv +91 -0
- teradataml/data/cfilter_example.json +12 -0
- teradataml/data/changepointdetection_example.json +18 -0
- teradataml/data/changepointdetectionrt_example.json +8 -0
- teradataml/data/chi_sq.csv +3 -0
- teradataml/data/churn_data.csv +14 -0
- teradataml/data/churn_emission.csv +35 -0
- teradataml/data/churn_initial.csv +3 -0
- teradataml/data/churn_state_transition.csv +5 -0
- teradataml/data/citedges_2.csv +745 -0
- teradataml/data/citvertices_2.csv +1210 -0
- teradataml/data/clicks2.csv +16 -0
- teradataml/data/clickstream.csv +13 -0
- teradataml/data/clickstream1.csv +11 -0
- teradataml/data/closeness_example.json +16 -0
- teradataml/data/complaints.csv +21 -0
- teradataml/data/complaints_mini.csv +3 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_testtoken.csv +224 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/complaints_tokens_test.csv +353 -0
- teradataml/data/complaints_traintoken.csv +472 -0
- teradataml/data/computers_category.csv +1001 -0
- teradataml/data/computers_test1.csv +1252 -0
- teradataml/data/computers_train1.csv +5009 -0
- teradataml/data/computers_train1_clustered.csv +5009 -0
- teradataml/data/confusionmatrix_example.json +9 -0
- teradataml/data/conversion_event_table.csv +3 -0
- teradataml/data/corr_input.csv +17 -0
- teradataml/data/correlation_example.json +11 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/coxhazardratio_example.json +39 -0
- teradataml/data/coxph_example.json +15 -0
- teradataml/data/coxsurvival_example.json +28 -0
- teradataml/data/cpt.csv +41 -0
- teradataml/data/credit_ex_merged.csv +45 -0
- teradataml/data/creditcard_data.csv +1001 -0
- teradataml/data/customer_loyalty.csv +301 -0
- teradataml/data/customer_loyalty_newseq.csv +31 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +173 -0
- teradataml/data/decisionforest_example.json +37 -0
- teradataml/data/decisionforestpredict_example.json +38 -0
- teradataml/data/decisiontree_example.json +21 -0
- teradataml/data/decisiontreepredict_example.json +45 -0
- teradataml/data/dfft2_size4_real.csv +17 -0
- teradataml/data/dfft2_test_matrix16.csv +17 -0
- teradataml/data/dfft2conv_real_4_4.csv +65 -0
- teradataml/data/diabetes.csv +443 -0
- teradataml/data/diabetes_test.csv +89 -0
- teradataml/data/dict_table.csv +5 -0
- teradataml/data/docperterm_table.csv +4 -0
- teradataml/data/docs/__init__.py +1 -0
- teradataml/data/docs/byom/__init__.py +0 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +180 -0
- teradataml/data/docs/byom/docs/DataikuPredict.py +217 -0
- teradataml/data/docs/byom/docs/H2OPredict.py +325 -0
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +283 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/byom/docs/PMMLPredict.py +278 -0
- teradataml/data/docs/byom/docs/__init__.py +0 -0
- teradataml/data/docs/sqle/__init__.py +0 -0
- teradataml/data/docs/sqle/docs_17_10/Antiselect.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/Attribution.py +200 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeFit.py +172 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeTransform.py +131 -0
- teradataml/data/docs/sqle/docs_17_10/CategoricalSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_10/ChiSq.py +90 -0
- teradataml/data/docs/sqle/docs_17_10/ColumnSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_10/ConvertTo.py +96 -0
- teradataml/data/docs/sqle/docs_17_10/DecisionForestPredict.py +139 -0
- teradataml/data/docs/sqle/docs_17_10/DecisionTreePredict.py +152 -0
- teradataml/data/docs/sqle/docs_17_10/FTest.py +161 -0
- teradataml/data/docs/sqle/docs_17_10/FillRowId.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/Fit.py +88 -0
- teradataml/data/docs/sqle/docs_17_10/GLMPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_10/GetRowsWithMissingValues.py +85 -0
- teradataml/data/docs/sqle/docs_17_10/GetRowsWithoutMissingValues.py +82 -0
- teradataml/data/docs/sqle/docs_17_10/Histogram.py +165 -0
- teradataml/data/docs/sqle/docs_17_10/MovingAverage.py +134 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +209 -0
- teradataml/data/docs/sqle/docs_17_10/NPath.py +266 -0
- teradataml/data/docs/sqle/docs_17_10/NaiveBayesPredict.py +116 -0
- teradataml/data/docs/sqle/docs_17_10/NaiveBayesTextClassifierPredict.py +176 -0
- teradataml/data/docs/sqle/docs_17_10/NumApply.py +147 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +135 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +109 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterFit.py +166 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/Pack.py +128 -0
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesFit.py +112 -0
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesTransform.py +102 -0
- teradataml/data/docs/sqle/docs_17_10/QQNorm.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/RoundColumns.py +110 -0
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeFit.py +118 -0
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeTransform.py +99 -0
- teradataml/data/docs/sqle/docs_17_10/SVMSparsePredict.py +153 -0
- teradataml/data/docs/sqle/docs_17_10/ScaleFit.py +197 -0
- teradataml/data/docs/sqle/docs_17_10/ScaleTransform.py +99 -0
- teradataml/data/docs/sqle/docs_17_10/Sessionize.py +114 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeFit.py +116 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeTransform.py +98 -0
- teradataml/data/docs/sqle/docs_17_10/StrApply.py +187 -0
- teradataml/data/docs/sqle/docs_17_10/StringSimilarity.py +146 -0
- teradataml/data/docs/sqle/docs_17_10/Transform.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/UnivariateStatistics.py +142 -0
- teradataml/data/docs/sqle/docs_17_10/Unpack.py +214 -0
- teradataml/data/docs/sqle/docs_17_10/WhichMax.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/WhichMin.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/ZTest.py +155 -0
- teradataml/data/docs/sqle/docs_17_10/__init__.py +0 -0
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +186 -0
- teradataml/data/docs/sqle/docs_17_20/Antiselect.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/Attribution.py +201 -0
- teradataml/data/docs/sqle/docs_17_20/BincodeFit.py +172 -0
- teradataml/data/docs/sqle/docs_17_20/BincodeTransform.py +139 -0
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/CategoricalSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_20/ChiSq.py +90 -0
- teradataml/data/docs/sqle/docs_17_20/ClassificationEvaluator.py +166 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +246 -0
- teradataml/data/docs/sqle/docs_17_20/ConvertTo.py +113 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionForest.py +280 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionForestPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionTreePredict.py +136 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +240 -0
- teradataml/data/docs/sqle/docs_17_20/FillRowId.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/Fit.py +88 -0
- teradataml/data/docs/sqle/docs_17_20/GLM.py +541 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPerSegment.py +415 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPredictPerSegment.py +233 -0
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +125 -0
- teradataml/data/docs/sqle/docs_17_20/GetRowsWithMissingValues.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/GetRowsWithoutMissingValues.py +106 -0
- teradataml/data/docs/sqle/docs_17_20/Histogram.py +224 -0
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +251 -0
- teradataml/data/docs/sqle/docs_17_20/KMeansPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/KNN.py +215 -0
- teradataml/data/docs/sqle/docs_17_20/MovingAverage.py +134 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +209 -0
- teradataml/data/docs/sqle/docs_17_20/NPath.py +266 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesPredict.py +116 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierPredict.py +177 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierTrainer.py +127 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineTransform.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/NumApply.py +147 -0
- teradataml/data/docs/sqle/docs_17_20/OneClassSVM.py +307 -0
- teradataml/data/docs/sqle/docs_17_20/OneClassSVMPredict.py +185 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +231 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/OrdinalEncodingFit.py +220 -0
- teradataml/data/docs/sqle/docs_17_20/OrdinalEncodingTransform.py +127 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +191 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterTransform.py +117 -0
- teradataml/data/docs/sqle/docs_17_20/Pack.py +128 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesFit.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesTransform.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/QQNorm.py +105 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +164 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionFit.py +155 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionMinComponents.py +106 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionTransform.py +120 -0
- teradataml/data/docs/sqle/docs_17_20/RegressionEvaluator.py +211 -0
- teradataml/data/docs/sqle/docs_17_20/RoundColumns.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeFit.py +118 -0
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeTransform.py +111 -0
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/SVM.py +414 -0
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +213 -0
- teradataml/data/docs/sqle/docs_17_20/SVMSparsePredict.py +153 -0
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +315 -0
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +202 -0
- teradataml/data/docs/sqle/docs_17_20/SentimentExtractor.py +206 -0
- teradataml/data/docs/sqle/docs_17_20/Sessionize.py +114 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +225 -0
- teradataml/data/docs/sqle/docs_17_20/Silhouette.py +153 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeFit.py +116 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeTransform.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/StrApply.py +187 -0
- teradataml/data/docs/sqle/docs_17_20/StringSimilarity.py +146 -0
- teradataml/data/docs/sqle/docs_17_20/TDDecisionForestPredict.py +207 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +333 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingFit.py +267 -0
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingTransform.py +141 -0
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +224 -0
- teradataml/data/docs/sqle/docs_17_20/TrainTestSplit.py +160 -0
- teradataml/data/docs/sqle/docs_17_20/Transform.py +123 -0
- teradataml/data/docs/sqle/docs_17_20/UnivariateStatistics.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpack.py +214 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/VectorDistance.py +169 -0
- teradataml/data/docs/sqle/docs_17_20/WhichMax.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/WhichMin.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/WordEmbeddings.py +237 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +362 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +281 -0
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +220 -0
- teradataml/data/docs/sqle/docs_17_20/__init__.py +0 -0
- teradataml/data/docs/tableoperator/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_00/ReadNOS.py +430 -0
- teradataml/data/docs/tableoperator/docs_17_00/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_05/ReadNOS.py +430 -0
- teradataml/data/docs/tableoperator/docs_17_05/WriteNOS.py +348 -0
- teradataml/data/docs/tableoperator/docs_17_05/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_10/ReadNOS.py +429 -0
- teradataml/data/docs/tableoperator/docs_17_10/WriteNOS.py +348 -0
- teradataml/data/docs/tableoperator/docs_17_10/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/tableoperator/docs_17_20/ReadNOS.py +440 -0
- teradataml/data/docs/tableoperator/docs_17_20/WriteNOS.py +387 -0
- teradataml/data/docs/tableoperator/docs_17_20/__init__.py +0 -0
- teradataml/data/docs/uaf/__init__.py +0 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +186 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +370 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +172 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +161 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BinaryMatrixOp.py +248 -0
- teradataml/data/docs/uaf/docs_17_20/BinarySeriesOp.py +252 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +178 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +175 -0
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +230 -0
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +218 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +185 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +204 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +216 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +216 -0
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +192 -0
- teradataml/data/docs/uaf/docs_17_20/DIFF.py +175 -0
- teradataml/data/docs/uaf/docs_17_20/DTW.py +180 -0
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +217 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +142 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +184 -0
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +185 -0
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/FitMetrics.py +172 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesFormula.py +206 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +143 -0
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +198 -0
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +260 -0
- teradataml/data/docs/uaf/docs_17_20/IDFFT.py +165 -0
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +191 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/InputValidator.py +121 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +156 -0
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +215 -0
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +174 -0
- teradataml/data/docs/uaf/docs_17_20/MInfo.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +191 -0
- teradataml/data/docs/uaf/docs_17_20/PACF.py +157 -0
- teradataml/data/docs/uaf/docs_17_20/Portman.py +217 -0
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +203 -0
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +155 -0
- teradataml/data/docs/uaf/docs_17_20/Resample.py +237 -0
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SInfo.py +123 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +173 -0
- teradataml/data/docs/uaf/docs_17_20/SelectionCriteria.py +174 -0
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/SignifResidmean.py +164 -0
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +180 -0
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +208 -0
- teradataml/data/docs/uaf/docs_17_20/TrackingOp.py +151 -0
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/Unnormalize.py +202 -0
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/docs/uaf/docs_17_20/__init__.py +0 -0
- teradataml/data/dtw_example.json +18 -0
- teradataml/data/dtw_t1.csv +11 -0
- teradataml/data/dtw_t2.csv +4 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt2d_example.json +16 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_example.json +15 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/dwt_filter_dim.csv +5 -0
- teradataml/data/emission.csv +9 -0
- teradataml/data/emp_table_by_dept.csv +19 -0
- teradataml/data/employee_info.csv +4 -0
- teradataml/data/employee_table.csv +6 -0
- teradataml/data/excluding_event_table.csv +2 -0
- teradataml/data/finance_data.csv +6 -0
- teradataml/data/finance_data2.csv +61 -0
- teradataml/data/finance_data3.csv +93 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/fish.csv +160 -0
- teradataml/data/fm_blood2ageandweight.csv +26 -0
- teradataml/data/fmeasure_example.json +12 -0
- teradataml/data/followers_leaders.csv +10 -0
- teradataml/data/fpgrowth_example.json +12 -0
- teradataml/data/frequentpaths_example.json +29 -0
- teradataml/data/friends.csv +9 -0
- teradataml/data/fs_input.csv +33 -0
- teradataml/data/fs_input1.csv +33 -0
- teradataml/data/genData.csv +513 -0
- teradataml/data/geodataframe_example.json +40 -0
- teradataml/data/glass_types.csv +215 -0
- teradataml/data/glm_admissions_model.csv +12 -0
- teradataml/data/glm_example.json +56 -0
- teradataml/data/glml1l2_example.json +28 -0
- teradataml/data/glml1l2predict_example.json +54 -0
- teradataml/data/glmpredict_example.json +54 -0
- teradataml/data/gq_t1.csv +21 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/hconvolve_complex_right.csv +5 -0
- teradataml/data/hconvolve_complex_rightmulti.csv +5 -0
- teradataml/data/histogram_example.json +12 -0
- teradataml/data/hmmdecoder_example.json +79 -0
- teradataml/data/hmmevaluator_example.json +25 -0
- teradataml/data/hmmsupervised_example.json +10 -0
- teradataml/data/hmmunsupervised_example.json +8 -0
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/house_values.csv +12 -0
- teradataml/data/house_values2.csv +13 -0
- teradataml/data/housing_cat.csv +7 -0
- teradataml/data/housing_data.csv +9 -0
- teradataml/data/housing_test.csv +47 -0
- teradataml/data/housing_test_binary.csv +47 -0
- teradataml/data/housing_train.csv +493 -0
- teradataml/data/housing_train_attribute.csv +5 -0
- teradataml/data/housing_train_binary.csv +437 -0
- teradataml/data/housing_train_parameter.csv +2 -0
- teradataml/data/housing_train_response.csv +493 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/ibm_stock.csv +370 -0
- teradataml/data/ibm_stock1.csv +370 -0
- teradataml/data/identitymatch_example.json +22 -0
- teradataml/data/idf_table.csv +4 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/impressions.csv +101 -0
- teradataml/data/inflation.csv +21 -0
- teradataml/data/initial.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/insect_sprays.csv +13 -0
- teradataml/data/insurance.csv +1339 -0
- teradataml/data/interpolator_example.json +13 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/iris_altinput.csv +481 -0
- teradataml/data/iris_attribute_output.csv +8 -0
- teradataml/data/iris_attribute_test.csv +121 -0
- teradataml/data/iris_attribute_train.csv +481 -0
- teradataml/data/iris_category_expect_predict.csv +31 -0
- teradataml/data/iris_data.csv +151 -0
- teradataml/data/iris_input.csv +151 -0
- teradataml/data/iris_response_train.csv +121 -0
- teradataml/data/iris_test.csv +31 -0
- teradataml/data/iris_train.csv +121 -0
- teradataml/data/join_table1.csv +4 -0
- teradataml/data/join_table2.csv +4 -0
- teradataml/data/jsons/anly_function_name.json +7 -0
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/byom/dataikupredict.json +148 -0
- teradataml/data/jsons/byom/datarobotpredict.json +147 -0
- teradataml/data/jsons/byom/h2opredict.json +195 -0
- teradataml/data/jsons/byom/onnxembeddings.json +267 -0
- teradataml/data/jsons/byom/onnxpredict.json +187 -0
- teradataml/data/jsons/byom/pmmlpredict.json +147 -0
- teradataml/data/jsons/paired_functions.json +450 -0
- teradataml/data/jsons/sqle/16.20/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/16.20/Attribution.json +249 -0
- teradataml/data/jsons/sqle/16.20/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/16.20/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/16.20/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/16.20/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/16.20/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/16.20/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/16.20/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/16.20/Pack.json +98 -0
- teradataml/data/jsons/sqle/16.20/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/16.20/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/16.20/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/16.20/Unpack.json +166 -0
- teradataml/data/jsons/sqle/16.20/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.00/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.00/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.00/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/17.00/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/17.00/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/17.00/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.00/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.00/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/17.00/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/17.00/Pack.json +98 -0
- teradataml/data/jsons/sqle/17.00/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/17.00/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.00/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.00/Unpack.json +166 -0
- teradataml/data/jsons/sqle/17.00/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.05/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.05/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.05/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/17.05/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/17.05/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/17.05/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.05/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.05/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/17.05/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/17.05/Pack.json +98 -0
- teradataml/data/jsons/sqle/17.05/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/17.05/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.05/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.05/Unpack.json +166 -0
- teradataml/data/jsons/sqle/17.05/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.10/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.10/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.10/DecisionForestPredict.json +185 -0
- teradataml/data/jsons/sqle/17.10/DecisionTreePredict.json +172 -0
- teradataml/data/jsons/sqle/17.10/GLMPredict.json +151 -0
- teradataml/data/jsons/sqle/17.10/MovingAverage.json +368 -0
- teradataml/data/jsons/sqle/17.10/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.10/NaiveBayesPredict.json +149 -0
- teradataml/data/jsons/sqle/17.10/NaiveBayesTextClassifierPredict.json +288 -0
- teradataml/data/jsons/sqle/17.10/Pack.json +133 -0
- teradataml/data/jsons/sqle/17.10/SVMSparsePredict.json +193 -0
- teradataml/data/jsons/sqle/17.10/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.10/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.10/TD_BinCodeFit.json +239 -0
- teradataml/data/jsons/sqle/17.10/TD_BinCodeTransform.json +70 -0
- teradataml/data/jsons/sqle/17.10/TD_CategoricalSummary.json +54 -0
- teradataml/data/jsons/sqle/17.10/TD_Chisq.json +68 -0
- teradataml/data/jsons/sqle/17.10/TD_ColumnSummary.json +54 -0
- teradataml/data/jsons/sqle/17.10/TD_ConvertTo.json +69 -0
- teradataml/data/jsons/sqle/17.10/TD_FTest.json +187 -0
- teradataml/data/jsons/sqle/17.10/TD_FillRowID.json +52 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionFit.json +46 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionTransform.json +72 -0
- teradataml/data/jsons/sqle/17.10/TD_GetRowsWithMissingValues.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_GetRowsWithoutMissingValues.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_Histogram.json +133 -0
- teradataml/data/jsons/sqle/17.10/TD_NumApply.json +147 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingFit.json +183 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingTransform.json +66 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterFit.json +197 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterTransform.json +48 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesFit.json +114 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesTransform.json +72 -0
- teradataml/data/jsons/sqle/17.10/TD_QQNorm.json +112 -0
- teradataml/data/jsons/sqle/17.10/TD_RoundColumns.json +93 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeFit.json +128 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleFit.json +157 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleTransform.json +71 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeFit.json +148 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeTransform.json +48 -0
- teradataml/data/jsons/sqle/17.10/TD_StrApply.json +240 -0
- teradataml/data/jsons/sqle/17.10/TD_UnivariateStatistics.json +119 -0
- teradataml/data/jsons/sqle/17.10/TD_WhichMax.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_WhichMin.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_ZTest.json +171 -0
- teradataml/data/jsons/sqle/17.10/Unpack.json +188 -0
- teradataml/data/jsons/sqle/17.10/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.20/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.20/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.20/DecisionForestPredict.json +185 -0
- teradataml/data/jsons/sqle/17.20/DecisionTreePredict.json +172 -0
- teradataml/data/jsons/sqle/17.20/GLMPredict.json +151 -0
- teradataml/data/jsons/sqle/17.20/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.20/NaiveBayesPredict.json +149 -0
- teradataml/data/jsons/sqle/17.20/NaiveBayesTextClassifierPredict.json +287 -0
- teradataml/data/jsons/sqle/17.20/Pack.json +133 -0
- teradataml/data/jsons/sqle/17.20/SVMSparsePredict.json +192 -0
- teradataml/data/jsons/sqle/17.20/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.20/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +149 -0
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_BinCodeFit.json +239 -0
- teradataml/data/jsons/sqle/17.20/TD_BinCodeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_CategoricalSummary.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_Chisq.json +68 -0
- teradataml/data/jsons/sqle/17.20/TD_ClassificationEvaluator.json +146 -0
- teradataml/data/jsons/sqle/17.20/TD_ColumnSummary.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_ColumnTransformer.json +218 -0
- teradataml/data/jsons/sqle/17.20/TD_ConvertTo.json +92 -0
- teradataml/data/jsons/sqle/17.20/TD_DecisionForest.json +260 -0
- teradataml/data/jsons/sqle/17.20/TD_DecisionForestPredict.json +139 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +269 -0
- teradataml/data/jsons/sqle/17.20/TD_FillRowID.json +52 -0
- teradataml/data/jsons/sqle/17.20/TD_FunctionFit.json +46 -0
- teradataml/data/jsons/sqle/17.20/TD_FunctionTransform.json +72 -0
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +507 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +168 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPerSegment.json +411 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPredictPerSegment.json +146 -0
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +93 -0
- teradataml/data/jsons/sqle/17.20/TD_GetRowsWithMissingValues.json +76 -0
- teradataml/data/jsons/sqle/17.20/TD_GetRowsWithoutMissingValues.json +76 -0
- teradataml/data/jsons/sqle/17.20/TD_Histogram.json +152 -0
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +232 -0
- teradataml/data/jsons/sqle/17.20/TD_KMeansPredict.json +87 -0
- teradataml/data/jsons/sqle/17.20/TD_KNN.json +262 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesTextClassifierTrainer.json +137 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +102 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_NumApply.json +147 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +316 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVMPredict.json +124 -0
- teradataml/data/jsons/sqle/17.20/TD_OneHotEncodingFit.json +271 -0
- teradataml/data/jsons/sqle/17.20/TD_OneHotEncodingTransform.json +65 -0
- teradataml/data/jsons/sqle/17.20/TD_OrdinalEncodingFit.json +229 -0
- teradataml/data/jsons/sqle/17.20/TD_OrdinalEncodingTransform.json +75 -0
- teradataml/data/jsons/sqle/17.20/TD_OutlierFilterFit.json +217 -0
- teradataml/data/jsons/sqle/17.20/TD_OutlierFilterTransform.json +48 -0
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_PolynomialFeaturesFit.json +114 -0
- teradataml/data/jsons/sqle/17.20/TD_PolynomialFeaturesTransform.json +72 -0
- teradataml/data/jsons/sqle/17.20/TD_QQNorm.json +111 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +179 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionFit.json +179 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionMinComponents.json +74 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionTransform.json +74 -0
- teradataml/data/jsons/sqle/17.20/TD_RegressionEvaluator.json +138 -0
- teradataml/data/jsons/sqle/17.20/TD_RoundColumns.json +93 -0
- teradataml/data/jsons/sqle/17.20/TD_RowNormalizeFit.json +128 -0
- teradataml/data/jsons/sqle/17.20/TD_RowNormalizeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +389 -0
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +142 -0
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +310 -0
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +120 -0
- teradataml/data/jsons/sqle/17.20/TD_SentimentExtractor.json +194 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +221 -0
- teradataml/data/jsons/sqle/17.20/TD_Silhouette.json +143 -0
- teradataml/data/jsons/sqle/17.20/TD_SimpleImputeFit.json +147 -0
- teradataml/data/jsons/sqle/17.20/TD_SimpleImputeTransform.json +48 -0
- teradataml/data/jsons/sqle/17.20/TD_StrApply.json +240 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_TargetEncodingFit.json +248 -0
- teradataml/data/jsons/sqle/17.20/TD_TargetEncodingTransform.json +75 -0
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +297 -0
- teradataml/data/jsons/sqle/17.20/TD_TrainTestSplit.json +142 -0
- teradataml/data/jsons/sqle/17.20/TD_UnivariateStatistics.json +117 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_VectorDistance.json +183 -0
- teradataml/data/jsons/sqle/17.20/TD_WhichMax.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_WhichMin.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_WordEmbeddings.json +241 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +330 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +195 -0
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +247 -0
- teradataml/data/jsons/sqle/17.20/Unpack.json +188 -0
- teradataml/data/jsons/sqle/17.20/nPath.json +269 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +370 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +460 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +385 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +400 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +401 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +384 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +384 -0
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/tableoperator/17.00/read_nos.json +198 -0
- teradataml/data/jsons/tableoperator/17.05/read_nos.json +198 -0
- teradataml/data/jsons/tableoperator/17.05/write_nos.json +195 -0
- teradataml/data/jsons/tableoperator/17.10/read_nos.json +184 -0
- teradataml/data/jsons/tableoperator/17.10/write_nos.json +195 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/tableoperator/17.20/read_nos.json +183 -0
- teradataml/data/jsons/tableoperator/17.20/write_nos.json +224 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +132 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +396 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +77 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +153 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +107 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +106 -0
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +89 -0
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +104 -0
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +66 -0
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +87 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +134 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +144 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +108 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +108 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_DIFF.json +92 -0
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +114 -0
- teradataml/data/jsons/uaf/17.20/TD_DURBIN_WATSON.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_EXTRACT_RESULTS.json +39 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_GENSERIES4FORMULA.json +85 -0
- teradataml/data/jsons/uaf/17.20/TD_GENSERIES4SINUSOIDS.json +71 -0
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +139 -0
- teradataml/data/jsons/uaf/17.20/TD_HOLT_WINTERS_FORECASTER.json +313 -0
- teradataml/data/jsons/uaf/17.20/TD_IDFFT.json +58 -0
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +81 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_INPUTVALIDATOR.json +64 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +182 -0
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +103 -0
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +181 -0
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_MATRIXMULTIPLY.json +68 -0
- teradataml/data/jsons/uaf/17.20/TD_MINFO.json +67 -0
- teradataml/data/jsons/uaf/17.20/TD_MULTIVAR_REGR.json +179 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +114 -0
- teradataml/data/jsons/uaf/17.20/TD_PORTMAN.json +119 -0
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +175 -0
- teradataml/data/jsons/uaf/17.20/TD_POWERTRANSFORM.json +98 -0
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +194 -0
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +143 -0
- teradataml/data/jsons/uaf/17.20/TD_SELECTION_CRITERIA.json +90 -0
- teradataml/data/jsons/uaf/17.20/TD_SIGNIF_PERIODICITIES.json +80 -0
- teradataml/data/jsons/uaf/17.20/TD_SIGNIF_RESIDMEAN.json +68 -0
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +184 -0
- teradataml/data/jsons/uaf/17.20/TD_SINFO.json +58 -0
- teradataml/data/jsons/uaf/17.20/TD_SMOOTHMA.json +163 -0
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +112 -0
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +95 -0
- teradataml/data/jsons/uaf/17.20/TD_WHITES_GENERAL.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
- teradataml/data/kmeans_example.json +23 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/kmeans_us_arrests_data.csv +51 -0
- teradataml/data/knn_example.json +19 -0
- teradataml/data/knnrecommender_example.json +7 -0
- teradataml/data/knnrecommenderpredict_example.json +12 -0
- teradataml/data/lar_example.json +17 -0
- teradataml/data/larpredict_example.json +30 -0
- teradataml/data/lc_new_predictors.csv +5 -0
- teradataml/data/lc_new_reference.csv +9 -0
- teradataml/data/lda_example.json +9 -0
- teradataml/data/ldainference_example.json +15 -0
- teradataml/data/ldatopicsummary_example.json +9 -0
- teradataml/data/levendist_input.csv +13 -0
- teradataml/data/levenshteindistance_example.json +10 -0
- teradataml/data/linreg_example.json +10 -0
- teradataml/data/load_example_data.py +350 -0
- teradataml/data/loan_prediction.csv +295 -0
- teradataml/data/lungcancer.csv +138 -0
- teradataml/data/mappingdata.csv +12 -0
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/milk_timeseries.csv +157 -0
- teradataml/data/min_max_titanic.csv +4 -0
- teradataml/data/minhash_example.json +6 -0
- teradataml/data/ml_ratings.csv +7547 -0
- teradataml/data/ml_ratings_10.csv +2445 -0
- teradataml/data/mobile_data.csv +13 -0
- teradataml/data/model1_table.csv +5 -0
- teradataml/data/model2_table.csv +5 -0
- teradataml/data/models/License_file.txt +1 -0
- teradataml/data/models/License_file_empty.txt +0 -0
- teradataml/data/models/dataiku_iris_data_ann_thin +0 -0
- teradataml/data/models/dr_iris_rf +0 -0
- teradataml/data/models/iris_db_dt_model_sklearn.onnx +0 -0
- teradataml/data/models/iris_db_dt_model_sklearn_floattensor.onnx +0 -0
- teradataml/data/models/iris_db_glm_model.pmml +57 -0
- teradataml/data/models/iris_db_xgb_model.pmml +4471 -0
- teradataml/data/models/iris_kmeans_model +0 -0
- teradataml/data/models/iris_mojo_glm_h2o_model +0 -0
- teradataml/data/models/iris_mojo_xgb_h2o_model +0 -0
- teradataml/data/modularity_example.json +12 -0
- teradataml/data/movavg_example.json +8 -0
- teradataml/data/mtx1.csv +7 -0
- teradataml/data/mtx2.csv +13 -0
- teradataml/data/multi_model_classification.csv +401 -0
- teradataml/data/multi_model_regression.csv +401 -0
- teradataml/data/mvdfft8.csv +9 -0
- teradataml/data/naivebayes_example.json +10 -0
- teradataml/data/naivebayespredict_example.json +19 -0
- teradataml/data/naivebayestextclassifier2_example.json +7 -0
- teradataml/data/naivebayestextclassifier_example.json +8 -0
- teradataml/data/naivebayestextclassifierpredict_example.json +32 -0
- teradataml/data/name_Find_configure.csv +10 -0
- teradataml/data/namedentityfinder_example.json +14 -0
- teradataml/data/namedentityfinderevaluator_example.json +10 -0
- teradataml/data/namedentityfindertrainer_example.json +6 -0
- teradataml/data/nb_iris_input_test.csv +31 -0
- teradataml/data/nb_iris_input_train.csv +121 -0
- teradataml/data/nbp_iris_model.csv +13 -0
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_extractor_text.csv +2 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/ner_sports_test2.csv +29 -0
- teradataml/data/ner_sports_train.csv +501 -0
- teradataml/data/nerevaluator_example.json +6 -0
- teradataml/data/nerextractor_example.json +18 -0
- teradataml/data/nermem_sports_test.csv +18 -0
- teradataml/data/nermem_sports_train.csv +51 -0
- teradataml/data/nertrainer_example.json +7 -0
- teradataml/data/ngrams_example.json +7 -0
- teradataml/data/notebooks/__init__.py +0 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Aggregate Functions using SQLAlchemy.ipynb +1455 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Arithmetic Functions Using SQLAlchemy.ipynb +1993 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Bit-Byte Manipulation Functions using SQLAlchemy.ipynb +1492 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Built-in functions using SQLAlchemy.ipynb +536 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Regular Expressions Using SQLAlchemy.ipynb +570 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage String Functions Using SQLAlchemy.ipynb +2559 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Window Aggregate Functions using SQLAlchemy.ipynb +2911 -0
- teradataml/data/notebooks/sqlalchemy/Using Generic SQLAlchemy ClauseElements teradataml DataFrame assign method.ipynb +698 -0
- teradataml/data/notebooks/sqlalchemy/__init__.py +0 -0
- teradataml/data/notebooks/sqlalchemy/teradataml filtering using SQLAlchemy ClauseElements.ipynb +784 -0
- teradataml/data/npath_example.json +23 -0
- teradataml/data/ntree_example.json +14 -0
- teradataml/data/numeric_strings.csv +5 -0
- teradataml/data/numerics.csv +4 -0
- teradataml/data/ocean_buoy.csv +17 -0
- teradataml/data/ocean_buoy2.csv +17 -0
- teradataml/data/ocean_buoys.csv +28 -0
- teradataml/data/ocean_buoys2.csv +10 -0
- teradataml/data/ocean_buoys_nonpti.csv +28 -0
- teradataml/data/ocean_buoys_seq.csv +29 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +92 -0
- teradataml/data/optional_event_table.csv +4 -0
- teradataml/data/orders1.csv +11 -0
- teradataml/data/orders1_12.csv +13 -0
- teradataml/data/orders_ex.csv +4 -0
- teradataml/data/pack_example.json +9 -0
- teradataml/data/package_tracking.csv +19 -0
- teradataml/data/package_tracking_pti.csv +19 -0
- teradataml/data/pagerank_example.json +13 -0
- teradataml/data/paragraphs_input.csv +6 -0
- teradataml/data/pathanalyzer_example.json +8 -0
- teradataml/data/pathgenerator_example.json +8 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/pattern_matching_data.csv +11 -0
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/phrases.csv +7 -0
- teradataml/data/pivot_example.json +9 -0
- teradataml/data/pivot_input.csv +22 -0
- teradataml/data/playerRating.csv +31 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/postagger_example.json +7 -0
- teradataml/data/posttagger_output.csv +44 -0
- teradataml/data/production_data.csv +17 -0
- teradataml/data/production_data2.csv +7 -0
- teradataml/data/randomsample_example.json +32 -0
- teradataml/data/randomwalksample_example.json +9 -0
- teradataml/data/rank_table.csv +6 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/ref_mobile_data.csv +4 -0
- teradataml/data/ref_mobile_data_dense.csv +2 -0
- teradataml/data/ref_url.csv +17 -0
- teradataml/data/restaurant_reviews.csv +7 -0
- teradataml/data/retail_churn_table.csv +27772 -0
- teradataml/data/river_data.csv +145 -0
- teradataml/data/roc_example.json +8 -0
- teradataml/data/roc_input.csv +101 -0
- teradataml/data/rule_inputs.csv +6 -0
- teradataml/data/rule_table.csv +2 -0
- teradataml/data/sales.csv +7 -0
- teradataml/data/sales_transaction.csv +501 -0
- teradataml/data/salesdata.csv +342 -0
- teradataml/data/sample_cities.csv +3 -0
- teradataml/data/sample_shapes.csv +11 -0
- teradataml/data/sample_streets.csv +3 -0
- teradataml/data/sampling_example.json +16 -0
- teradataml/data/sax_example.json +17 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +74 -0
- teradataml/data/scale_housing.csv +11 -0
- teradataml/data/scale_housing_test.csv +6 -0
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scale_stat.csv +11 -0
- teradataml/data/scalebypartition_example.json +13 -0
- teradataml/data/scalemap_example.json +13 -0
- teradataml/data/scalesummary_example.json +12 -0
- teradataml/data/score_category.csv +101 -0
- teradataml/data/score_summary.csv +4 -0
- teradataml/data/script_example.json +10 -0
- teradataml/data/scripts/deploy_script.py +84 -0
- teradataml/data/scripts/lightgbm/dataset.template +175 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +264 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +234 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +177 -0
- teradataml/data/scripts/mapper.R +20 -0
- teradataml/data/scripts/mapper.py +16 -0
- teradataml/data/scripts/mapper_replace.py +16 -0
- teradataml/data/scripts/sklearn/__init__.py +0 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +205 -0
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +148 -0
- teradataml/data/scripts/sklearn/sklearn_function.template +144 -0
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +166 -0
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +161 -0
- teradataml/data/scripts/sklearn/sklearn_score.py +145 -0
- teradataml/data/scripts/sklearn/sklearn_transform.py +327 -0
- teradataml/data/sdk/modelops/modelops_spec.json +101737 -0
- teradataml/data/seeds.csv +10 -0
- teradataml/data/sentenceextractor_example.json +7 -0
- teradataml/data/sentiment_extract_input.csv +11 -0
- teradataml/data/sentiment_train.csv +16 -0
- teradataml/data/sentiment_word.csv +20 -0
- teradataml/data/sentiment_word_input.csv +20 -0
- teradataml/data/sentimentextractor_example.json +24 -0
- teradataml/data/sentimenttrainer_example.json +8 -0
- teradataml/data/sequence_table.csv +10 -0
- teradataml/data/seriessplitter_example.json +8 -0
- teradataml/data/sessionize_example.json +17 -0
- teradataml/data/sessionize_table.csv +116 -0
- teradataml/data/setop_test1.csv +24 -0
- teradataml/data/setop_test2.csv +22 -0
- teradataml/data/soc_nw_edges.csv +11 -0
- teradataml/data/soc_nw_vertices.csv +8 -0
- teradataml/data/souvenir_timeseries.csv +168 -0
- teradataml/data/sparse_iris_attribute.csv +5 -0
- teradataml/data/sparse_iris_test.csv +121 -0
- teradataml/data/sparse_iris_train.csv +601 -0
- teradataml/data/star1.csv +6 -0
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/state_transition.csv +5 -0
- teradataml/data/stock_data.csv +53 -0
- teradataml/data/stock_movement.csv +11 -0
- teradataml/data/stock_vol.csv +76 -0
- teradataml/data/stop_words.csv +8 -0
- teradataml/data/store_sales.csv +37 -0
- teradataml/data/stringsimilarity_example.json +8 -0
- teradataml/data/strsimilarity_input.csv +13 -0
- teradataml/data/students.csv +101 -0
- teradataml/data/svm_iris_input_test.csv +121 -0
- teradataml/data/svm_iris_input_train.csv +481 -0
- teradataml/data/svm_iris_model.csv +7 -0
- teradataml/data/svmdense_example.json +10 -0
- teradataml/data/svmdensepredict_example.json +19 -0
- teradataml/data/svmsparse_example.json +8 -0
- teradataml/data/svmsparsepredict_example.json +14 -0
- teradataml/data/svmsparsesummary_example.json +8 -0
- teradataml/data/target_mobile_data.csv +13 -0
- teradataml/data/target_mobile_data_dense.csv +5 -0
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/templatedata.csv +1201 -0
- teradataml/data/templates/open_source_ml.json +11 -0
- teradataml/data/teradata_icon.ico +0 -0
- teradataml/data/teradataml_example.json +1473 -0
- teradataml/data/test_classification.csv +101 -0
- teradataml/data/test_loan_prediction.csv +53 -0
- teradataml/data/test_pacf_12.csv +37 -0
- teradataml/data/test_prediction.csv +101 -0
- teradataml/data/test_regression.csv +101 -0
- teradataml/data/test_river2.csv +109 -0
- teradataml/data/text_inputs.csv +6 -0
- teradataml/data/textchunker_example.json +8 -0
- teradataml/data/textclassifier_example.json +7 -0
- teradataml/data/textclassifier_input.csv +7 -0
- teradataml/data/textclassifiertrainer_example.json +7 -0
- teradataml/data/textmorph_example.json +11 -0
- teradataml/data/textparser_example.json +15 -0
- teradataml/data/texttagger_example.json +12 -0
- teradataml/data/texttokenizer_example.json +7 -0
- teradataml/data/texttrainer_input.csv +11 -0
- teradataml/data/tf_example.json +7 -0
- teradataml/data/tfidf_example.json +14 -0
- teradataml/data/tfidf_input1.csv +201 -0
- teradataml/data/tfidf_train.csv +6 -0
- teradataml/data/time_table1.csv +535 -0
- teradataml/data/time_table2.csv +14 -0
- teradataml/data/timeseriesdata.csv +1601 -0
- teradataml/data/timeseriesdatasetsd4.csv +105 -0
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic.csv +892 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/token_table.csv +696 -0
- teradataml/data/train_multiclass.csv +101 -0
- teradataml/data/train_regression.csv +101 -0
- teradataml/data/train_regression_multiple_labels.csv +101 -0
- teradataml/data/train_tracking.csv +28 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/transformation_table.csv +6 -0
- teradataml/data/transformation_table_new.csv +2 -0
- teradataml/data/tv_spots.csv +16 -0
- teradataml/data/twod_climate_data.csv +117 -0
- teradataml/data/uaf_example.json +529 -0
- teradataml/data/univariatestatistics_example.json +9 -0
- teradataml/data/unpack_example.json +10 -0
- teradataml/data/unpivot_example.json +25 -0
- teradataml/data/unpivot_input.csv +8 -0
- teradataml/data/url_data.csv +10 -0
- teradataml/data/us_air_pass.csv +37 -0
- teradataml/data/us_population.csv +624 -0
- teradataml/data/us_states_shapes.csv +52 -0
- teradataml/data/varmax_example.json +18 -0
- teradataml/data/vectordistance_example.json +30 -0
- teradataml/data/ville_climatedata.csv +121 -0
- teradataml/data/ville_tempdata.csv +12 -0
- teradataml/data/ville_tempdata1.csv +12 -0
- teradataml/data/ville_temperature.csv +11 -0
- teradataml/data/waveletTable.csv +1605 -0
- teradataml/data/waveletTable2.csv +1605 -0
- teradataml/data/weightedmovavg_example.json +9 -0
- teradataml/data/wft_testing.csv +5 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/wine_data.csv +1600 -0
- teradataml/data/word_embed_input_table1.csv +6 -0
- teradataml/data/word_embed_input_table2.csv +5 -0
- teradataml/data/word_embed_model.csv +23 -0
- teradataml/data/words_input.csv +13 -0
- teradataml/data/xconvolve_complex_left.csv +6 -0
- teradataml/data/xconvolve_complex_leftmulti.csv +6 -0
- teradataml/data/xgboost_example.json +36 -0
- teradataml/data/xgboostpredict_example.json +32 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/__init__.py +0 -0
- teradataml/dataframe/copy_to.py +2446 -0
- teradataml/dataframe/data_transfer.py +2840 -0
- teradataml/dataframe/dataframe.py +20908 -0
- teradataml/dataframe/dataframe_utils.py +2114 -0
- teradataml/dataframe/fastload.py +794 -0
- teradataml/dataframe/functions.py +2110 -0
- teradataml/dataframe/indexer.py +424 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +1171 -0
- teradataml/dataframe/sql.py +10904 -0
- teradataml/dataframe/sql_function_parameters.py +440 -0
- teradataml/dataframe/sql_functions.py +652 -0
- teradataml/dataframe/sql_interfaces.py +220 -0
- teradataml/dataframe/vantage_function_types.py +675 -0
- teradataml/dataframe/window.py +694 -0
- teradataml/dbutils/__init__.py +3 -0
- teradataml/dbutils/dbutils.py +2871 -0
- teradataml/dbutils/filemgr.py +318 -0
- teradataml/gen_ai/__init__.py +2 -0
- teradataml/gen_ai/convAI.py +473 -0
- teradataml/geospatial/__init__.py +4 -0
- teradataml/geospatial/geodataframe.py +1105 -0
- teradataml/geospatial/geodataframecolumn.py +392 -0
- teradataml/geospatial/geometry_types.py +926 -0
- teradataml/hyperparameter_tuner/__init__.py +1 -0
- teradataml/hyperparameter_tuner/optimizer.py +4115 -0
- teradataml/hyperparameter_tuner/utils.py +303 -0
- teradataml/lib/__init__.py +0 -0
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/lib/libaed_0_1_ppc64le.so +0 -0
- teradataml/opensource/__init__.py +1 -0
- teradataml/opensource/_base.py +1321 -0
- teradataml/opensource/_class.py +464 -0
- teradataml/opensource/_constants.py +61 -0
- teradataml/opensource/_lightgbm.py +949 -0
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +267 -0
- teradataml/options/__init__.py +148 -0
- teradataml/options/configure.py +489 -0
- teradataml/options/display.py +187 -0
- teradataml/plot/__init__.py +3 -0
- teradataml/plot/axis.py +1427 -0
- teradataml/plot/constants.py +15 -0
- teradataml/plot/figure.py +431 -0
- teradataml/plot/plot.py +810 -0
- teradataml/plot/query_generator.py +83 -0
- teradataml/plot/subplot.py +216 -0
- teradataml/scriptmgmt/UserEnv.py +4273 -0
- teradataml/scriptmgmt/__init__.py +3 -0
- teradataml/scriptmgmt/lls_utils.py +2157 -0
- teradataml/sdk/README.md +79 -0
- teradataml/sdk/__init__.py +4 -0
- teradataml/sdk/_auth_modes.py +422 -0
- teradataml/sdk/_func_params.py +487 -0
- teradataml/sdk/_json_parser.py +453 -0
- teradataml/sdk/_openapi_spec_constants.py +249 -0
- teradataml/sdk/_utils.py +236 -0
- teradataml/sdk/api_client.py +900 -0
- teradataml/sdk/constants.py +62 -0
- teradataml/sdk/modelops/__init__.py +98 -0
- teradataml/sdk/modelops/_client.py +409 -0
- teradataml/sdk/modelops/_constants.py +304 -0
- teradataml/sdk/modelops/models.py +2308 -0
- teradataml/sdk/spinner.py +107 -0
- teradataml/series/__init__.py +0 -0
- teradataml/series/series.py +537 -0
- teradataml/series/series_utils.py +71 -0
- teradataml/store/__init__.py +12 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +658 -0
- teradataml/store/feature_store/feature_store.py +4814 -0
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +7330 -0
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/Apply.py +979 -0
- teradataml/table_operators/Script.py +1739 -0
- teradataml/table_operators/TableOperator.py +1343 -0
- teradataml/table_operators/__init__.py +2 -0
- teradataml/table_operators/apply_query_generator.py +262 -0
- teradataml/table_operators/query_generator.py +493 -0
- teradataml/table_operators/table_operator_query_generator.py +462 -0
- teradataml/table_operators/table_operator_util.py +726 -0
- teradataml/table_operators/templates/dataframe_apply.template +184 -0
- teradataml/table_operators/templates/dataframe_map.template +176 -0
- teradataml/table_operators/templates/dataframe_register.template +73 -0
- teradataml/table_operators/templates/dataframe_udf.template +67 -0
- teradataml/table_operators/templates/script_executor.template +170 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +53 -0
- teradataml/utils/__init__.py +0 -0
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +943 -0
- teradataml/utils/internal_buffer.py +122 -0
- teradataml/utils/print_versions.py +206 -0
- teradataml/utils/utils.py +451 -0
- teradataml/utils/validators.py +3305 -0
- teradataml-20.0.0.8.dist-info/METADATA +2804 -0
- teradataml-20.0.0.8.dist-info/RECORD +1208 -0
- teradataml-20.0.0.8.dist-info/WHEEL +5 -0
- teradataml-20.0.0.8.dist-info/top_level.txt +1 -0
- teradataml-20.0.0.8.dist-info/zip-safe +1 -0
@@ -0,0 +1,2273 @@
# ##################################################################
#
# Copyright 2025 Teradata. All rights reserved.
# TERADATA CONFIDENTIAL AND TRADE SECRET
#
# Primary Owner: Sweta Shaw
# Email Id: Sweta.Shaw@Teradata.com
#
# Secondary Owner: Akhil Bisht
# Email Id: AKHIL.BISHT@Teradata.com
#
# Version: 1.1
# Function Version: 1.0
# ##################################################################

# Python libraries
import pandas as pd
import time
import json
import re

# Teradata libraries
from teradataml.dataframe.dataframe import DataFrame
from teradataml.dataframe.copy_to import copy_to_sql
from teradataml import Antiselect
from teradataml import BincodeFit, BincodeTransform
from teradataml import CategoricalSummary, ColumnSummary, ConvertTo, GetFutileColumns, FillRowId
from teradataml import Fit, Transform
from teradataml import NonLinearCombineFit, NonLinearCombineTransform
from teradataml import NumApply
from teradataml import OneHotEncodingFit, OneHotEncodingTransform
from teradataml import OrdinalEncodingFit, OrdinalEncodingTransform
from teradataml import SimpleImputeFit, SimpleImputeTransform
from teradataml import StrApply
from teradataml import TargetEncodingFit, TargetEncodingTransform
from sqlalchemy import literal_column
from teradatasqlalchemy import INTEGER
from teradataml import display
from teradataml.common.garbagecollector import GarbageCollector
from teradataml.dataframe.sql_functions import case
from teradataml.hyperparameter_tuner.utils import _ProgressBar
from teradataml.utils.validators import _Validators
from teradataml.common.utils import UtilFuncs
from teradataml.common.constants import TeradataConstants
from teradataml.options.configure import configure


class _FeatureEngineering:

    def __init__(self,
                 data,
                 target_column,
                 id_column,
                 model_list,
                 verbose=0,
                 task_type="Regression",
                 custom_data=None,
                 **kwargs):
        """
        DESCRIPTION:
            Function initializes the data, target column and columns datatypes
            for feature engineering.

        PARAMETERS:
            data:
                Required Argument.
                Specifies the input teradataml DataFrame for feature engineering.
                Types: teradataml DataFrame

            target_column:
                Required Argument.
                Specifies the name of the target column in "data".
                Types: str

            id_column:
                Required Argument.
                Specifies the name of the unique identifier column in "data".
                Types: str

            model_list:
                Required Argument.
                Specifies the list of models to be used for model training.
                Types: list

            verbose:
                Optional Argument.
                Specifies the detailed execution steps based on verbose level.
                Default Value: 0
                Permitted Values:
                    * 0: prints the progress bar and leaderboard.
                    * 1: prints the execution steps of AutoML.
                    * 2: prints the intermediate data between the execution of each step of AutoML.
                Types: int

            task_type:
                Required Argument.
                Specifies the task type for AutoML, whether to apply regression OR classification OR clustering
                on the provided dataset.
                Default Value: "Regression"
                Permitted Values: "Regression", "Classification", "Clustering"
                Types: str

            custom_data:
                Optional Argument.
                Specifies json object containing user customized input.
                Types: json object

            **kwargs:
                Specifies the additional arguments for feature engineering. Below
                are the additional arguments:
                    volatile:
                        Optional Argument.
                        Specifies whether to put the interim results of the
                        functions in a volatile table or not. When set to
                        True, results are stored in a volatile table,
                        otherwise not.
                        Default Value: False
                        Types: bool

                    persist:
                        Optional Argument.
                        Specifies whether to persist the interim results of the
                        functions in a table or not. When set to True,
                        results are persisted in a table; otherwise,
                        results are garbage collected at the end of the
                        session.
                        Default Value: False
                        Types: bool

                    cluster:
                        Optional Argument.
                        Specifies whether to apply clustering techniques.
                        Default Value: False
                        Types: bool

                    progress_prefix:
                        Optional Argument.
                        Specifies the prefix for the progress bar messages.
                        Default Value: None
                        Types: str

                    automl_phases:
                        Optional Argument.
                        Specifies the phase of AutoML to be executed.
                        Default Value: None
                        Types: str or list of str

                    auto_dataprep:
                        Optional Argument.
                        Specifies whether to run the AutoDataPrep workflow.
                        Default Value: False
                        Types: bool

                    enable_lasso:
                        Optional Argument.
                        Specifies whether to use lasso regression for feature selection.
                        By default, only RFE and PCA are used for feature selection.
                        Default Value: False
                        Types: bool

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> _FeatureEngineering(data=df, target_column="target",
            ...                     id_column="id", model_list=["xgboost"], verbose=1)
        """
        # Instance variables
        self.data = data
        self.target_column = target_column
        self.id_column = id_column
        self.model_list = model_list
        self.verbose = verbose
        self.task_type = task_type
        self.custom_data = custom_data
        self.excluded_cols = []
        self.data_types = {key: value for key, value in self.data._column_names_and_types}
        self.target_label = None

        self.one_hot_obj_count = 0
        self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
        self.persist = kwargs.get('persist', False)
        self.volatile = kwargs.get('volatile', False) or (configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE and self.persist is False)
        self.cluster = kwargs.get('cluster', False)

        self.data_mapping = {}
        self.progress_prefix = kwargs.get('progress_prefix', None)
        self.aml_phases = kwargs.get('automl_phases', None)
        self.auto_dataprep = kwargs.get('auto_dataprep', False)
        self.enable_lasso = kwargs.get('enable_lasso', False)

    # Method for doing feature engineering on data -> adding id, removing futile col, imputation, encoding(one hot)
    def feature_engineering(self,
                            auto=True):
        """
        DESCRIPTION:
            Function performs the following operations:
                1. Removes futile columns/features from dataset.
                2. Detects the columns with missing values.
                3. Performs imputation on these columns with missing values.
                4. Detects categorical columns and performs encoding on those columns.

        PARAMETERS:
            auto:
                Optional Argument.
                Specifies whether to run AutoML in custom mode or auto mode.
                When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
                Default Value: True
                Types: bool

        RETURNS:
            tuple containing teradataml DataFrame, list of excluded columns,
            target label information, data transformation dictionary, and data mapping dictionary.

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> data, excluded_cols, target_label, transform_dict, data_mapping = self.feature_engineering(auto=True)
        """
        # Assigning number of base jobs for progress bar.
        if self.cluster:
            base_jobs = 11 if auto else 15
        else:
            # Base jobs for supervised learning: add extra job when lasso selection is enabled
            base_jobs = (12 if self.enable_lasso else 11) if auto else (17 if self.enable_lasso else 16)

        # Updating model list based on distinct value of target column for classification type
        if self.is_classification_type():
            if self.data.drop_duplicate(self.target_column).size > 2:
                unsupported_models = ['svm', 'glm']  # Models that don't support multiclass
                for model in unsupported_models:
                    if model in self.model_list:
                        self._display_msg(inline_msg="Multi-class classification is "
                                          "not supported by {} model. Skipping {} model."
                                          .format(model, model))
                self.model_list = [model for model in self.model_list if model not in unsupported_models]

        # After filtering models like glm/svm due to multiclass
        if not self.auto_dataprep:
            _Validators._validate_non_empty_list_or_valid_selection(self.model_list, "List of models")

        # Updating number of jobs for progress bar based on number of models.
        jobs = base_jobs + len(self.model_list)
        self.progress_bar = _ProgressBar(jobs=jobs,
                                         verbose=2,
                                         prefix=self.progress_prefix)

        self._display_heading(phase=1,
                              progress_bar=self.progress_bar,
                              automl_phases=self.aml_phases)

        self._display_msg(msg='Feature Engineering started ...',
                          progress_bar=self.progress_bar)

        # Storing target column to data transform dictionary
        # Setting target column for supervised learning, for clustering it will be None.
        if not self.cluster:
            self.data_transform_dict['data_target_column'] = self.target_column
        else:
            self.data_transform_dict['data_target_column'] = None

        # Storing target column encoding indicator to data transform dictionary
        if "target_col_encode_ind" not in self.data_transform_dict:
            self.data_transform_dict["target_col_encode_ind"] = False

        # Storing task type to data transform dictionary
        if not self.cluster:
            self.data_transform_dict['classification_type'] = self.is_classification_type()
        else:
            self.data_transform_dict['classification_type'] = False
        # Storing params for performing one hot encoding
        self.data_transform_dict['one_hot_encoding_fit_obj'] = {}
        self.data_transform_dict['one_hot_encoding_drop_list'] = []

        if auto:
            self._remove_duplicate_rows()
            self.progress_bar.update()

            self._remove_futile_columns()
            self.progress_bar.update()

            self._handle_date_columns()
            self.progress_bar.update()

            self._handling_missing_value()
            self.progress_bar.update()

            self._impute_missing_value()
            self.progress_bar.update()

            self._encoding_categorical_columns()
            self.progress_bar.update()

        else:
            self._remove_duplicate_rows()
            self.progress_bar.update()

            self._anti_select_columns()
            self.progress_bar.update()

            self._remove_futile_columns()
            self.progress_bar.update()

            self._handle_date_columns()
            self.progress_bar.update()

            self._custom_handling_missing_value()
            self.progress_bar.update()

            self._bin_code_transformation()
            self.progress_bar.update()

            self._string_manipulation()
            self.progress_bar.update()

            self._custom_categorical_encoding()
            self.progress_bar.update()

            self._mathematical_transformation()
            self.progress_bar.update()

            self._non_linear_transformation()
            self.progress_bar.update()

        return self.data, self.excluded_cols, self.target_label, self.data_transform_dict, self.data_mapping

    def _extract_list(self,
                      list1,
                      list2):
        """
        DESCRIPTION:
            Function to extract elements from list1 which are not present in list2.

        PARAMETERS:
            list1:
                Required Argument.
                Specifies the first list for extracting elements from.
                Types: list

            list2:
                Required Argument.
                Specifies the second list whose elements are excluded from the first list while extracting.
                Types: list

        RETURNS:
            list containing extracted elements.

        RAISES:
            None

        EXAMPLES:
            >>> result = self._extract_list(list1=["a", "b", "c"], list2=["b"])
        """
        # Ensure list1 and list2 are lists, default to empty list if None
        if list1 is None:
            list1 = []
        if list2 is None:
            list2 = []
        new_lst = list(set(list1) - set(list2))
        return new_lst

    def _remove_duplicate_rows(self):
        """
        DESCRIPTION:
            Function to handle duplicate rows present in the dataset.

        PARAMETERS:
            None

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> self._remove_duplicate_rows()
        """
        self._display_msg(msg="Handling duplicate records present in dataset ...",
                          progress_bar=self.progress_bar,
                          show_data=True)
        start_time = time.time()
        rows = self.data.shape[0]
        self.data = self.data.drop_duplicate(self.data.columns)
        if rows != self.data.shape[0]:
            self._display_msg(msg=f'Updated dataset sample after removing {rows-self.data.shape[0]} duplicate records:',
                              data=self.data,
                              progress_bar=self.progress_bar)
            self._display_msg(inline_msg=f"Remaining Rows in the data: {self.data.shape[0]}\n"
                              f"Remaining Columns in the data: {self.data.shape[1]}",
                              progress_bar=self.progress_bar)
        else:
            self._display_msg(inline_msg="Analysis completed. No action taken.",
                              progress_bar=self.progress_bar)

        end_time = time.time()
        self._display_msg(msg="Total time to handle duplicate records: {:.2f} sec ".format(end_time - start_time),
                          progress_bar=self.progress_bar,
                          show_data=True)

    def _get_distinct_count(self):
        """
        DESCRIPTION:
            Function to get distinct count for all features and store it in dictionary for further use.

        PARAMETERS:
            None

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> self._get_distinct_count()
        """
        # Count of distinct value in each column
        counts = self.data.select(self.data.columns).count(distinct=True)

        # Dict containing distinct value in each column
        self.counts_dict = next(counts.itertuples())._asdict()

    def _preprocess_data(self):
        """
        DESCRIPTION:
            Function replaces the existing id column or adds the new id column and
            removes columns with single value/same values in the dataset.

        PARAMETERS:
            None

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> self._preprocess_data()
        """
        # Get distinct value in each column
        self._get_distinct_count()
        # Columns to be removed if count of distinct value = 1
        columns_to_be_removed = [col for col in self.data.columns if self.counts_dict[f'count_{col}'] == 1]
        # Removing irrelevant columns
        if len(columns_to_be_removed) != 0:
            self.data = self.data.drop(columns_to_be_removed, axis=1)
            # Storing irrelevant column list in data transform dictionary
            self.data_transform_dict['drop_irrelevant_columns'] = columns_to_be_removed

        if self.id_column == 'automl_id':
            # Adding id columns
            obj = FillRowId(data=self.data, row_id_column='automl_id')
            self.data = obj.result

        # Storing id column to data transform dictionary
        self.data_transform_dict['data_id_column'] = self.id_column

    def _remove_futile_columns(self):
        """
        DESCRIPTION:
            Function removes the futile columns from dataset.

        PARAMETERS:
            None

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> self._remove_futile_columns()
        """
        self._display_msg(msg="Handling less significant features from data ...",
                          progress_bar=self.progress_bar,
                          show_data=True)
        start_time = time.time()

        self._preprocess_data()

        # Handling string type target column in classification
        # Performing Ordinal Encoding
        if not self.cluster:
            if self.data_types[self.target_column] in ['str']:
                self._ordinal_encoding([self.target_column])

        # Detecting categorical columns
        categorical_columns = [col for col, d_type in self.data._column_names_and_types if d_type == 'str']

        # Detecting and removing futile columns, if categorical_column exists
        if len(categorical_columns) != 0:

            obj = CategoricalSummary(data=self.data,
                                     target_columns=categorical_columns,
                                     volatile=self.volatile,
                                     persist=self.persist)

            gfc_out = GetFutileColumns(data=self.data,
                                       object=obj,
                                       category_summary_column="ColumnName",
                                       threshold_value=0.7,
                                       volatile=self.volatile,
                                       persist=self.persist)

            # Extracting Futile columns
            f_cols = [row[0] for row in gfc_out.result.itertuples()]

            self.data_mapping['categorical_summary'] = obj.result._table_name
            self.data_mapping['futile_columns'] = gfc_out.result._table_name

            if len(f_cols) == 0:
                self._display_msg(inline_msg="Analysis indicates all categorical columns are significant. No action Needed.",
                                  progress_bar=self.progress_bar)
            else:

                self.data = self.data.drop(f_cols, axis=1)
                # Storing futile column list in data transform dictionary
                self.data_transform_dict['futile_columns'] = f_cols

                if self.persist:
                    table_name = UtilFuncs._generate_temp_table_name(table_type=TeradataConstants.TERADATA_TABLE,
                                                                     gc_on_quit=False)
                    self.data.to_sql(table_name)
                else:
                    self.data.materialize()

                self.data_mapping['data_without_futile_columns'] = self.data._table_name
                self._display_msg(msg='Removing Futile columns:',
                                  col_lst=f_cols,
                                  progress_bar=self.progress_bar)
                self._display_msg(msg='Sample of Data after removing Futile columns:',
                                  data=self.data,
                                  progress_bar=self.progress_bar)
        end_time = time.time()
        self._display_msg(msg="Total time to handle less significant features: {:.2f} sec ".format(end_time - start_time),
                          progress_bar=self.progress_bar,
                          show_data=True)

    def _fetch_date_component(self):
        """
        DESCRIPTION:
            Function to fetch day of week, week of month, month of quarter, quarter of year
            component from date column. Generate weekend and month half details from day of week and
            week of month columns respectively. Convert quarter of year and month of quarter
            component columns to VARCHAR.

        PARAMETERS:
            None

        RETURNS:
            list of newly generated date component features.

        RAISES:
            None

        EXAMPLES:
            >>> new_features = self._fetch_date_component()
        """
        # List for storing newly generated date component features
        new_date_components = []
        # Extracting weekend, month, quarter details information from date columns
        date_component_param = {}
        for col in self.date_column_list:
            # Generating new column names for extracted date components
            weekend_col = f'{col}_weekend'
            month_half_col = f'{col}_month_half'
            month_of_quarter_col = f'{col}_month_of_quarter'
            quarter_of_year_col = f'{col}_quarter_of_year'

            date_component_param = {
                **date_component_param,
                weekend_col: case([(self.data[col].day_of_week().isin([1, 7]), 'yes')], else_='no'),
                month_half_col: case([(self.data[col].week_of_month().isin([1, 2]), 'first_half')], else_='second_half'),
                month_of_quarter_col: self.data[col].month_of_quarter(),
                quarter_of_year_col: self.data[col].quarter_of_year()
            }
            # Storing newly generated date component month and quarter columns.
            # Skipping day of week and week of month columns as they will be used
            # later for extracting weekend and month part details.
            new_date_components.extend([weekend_col, month_half_col, month_of_quarter_col, quarter_of_year_col])
        # Adding new date component columns to dataset
        self.data = self.data.assign(**date_component_param)
        # Dropping date columns as different component columns are extracted.
        self.data = self.data.drop(self.date_column_list, axis=1)

        # Converting remaining component columns to VARCHAR
        # So that it will be treated as categorical columns
        remaining_component_columns = [col for col in self.data.columns if re.search('month_of_quarter|quarter_of_year' + "$", col)]
        accumulate_columns = self._extract_list(self.data.columns, remaining_component_columns)
        convertto_params = {
            "data": self.data,
            "target_columns": remaining_component_columns,
            "target_datatype": ["VARCHAR(charlen=20,charset=UNICODE,casespecific=NO)"],
            "accumulate": accumulate_columns,
            "persist": True
        }
        # Disabling display table name if persist is True by default
        if not self.volatile and not self.persist:
            convertto_params["display_table_name"] = False

        # Setting persist to False if volatile is True
        if self.volatile:
            convertto_params["persist"] = False
            convertto_params["volatile"] = True

        # returning dataset after performing string manipulation
        self.data = ConvertTo(**convertto_params).result

        # IF volatile is False and persist is False
        if not self.volatile and not self.persist:
            # Adding transformed data containing table to garbage collector
            GarbageCollector._add_to_garbagecollector(self.data._table_name)
        return new_date_components

    def _handle_date_columns_helper(self):
        """
        DESCRIPTION:
            Function for dropping irrelevant date features. Perform extraction of different
            components from relevant date features and transform them.

        PARAMETERS:
            None

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> self._handle_date_columns_helper()
        """

        # Dropping missing value for all date columns
        self._display_msg(msg="Dropping missing values for:",
                          col_lst=self.date_column_list,
                          progress_bar=self.progress_bar)

        self.data = self.data.dropna(subset=self.date_column_list)

        # Date columns list eligible for dropping from dataset
        drop_date_cols = []

        # Checking for unique valued date columns
        for col in self.date_column_list:
            if self.data.drop_duplicate(col).size == self.data.shape[0]:
                drop_date_cols.append(col)

        if len(drop_date_cols) != 0:
            self.data = self.data.drop(drop_date_cols, axis=1)
            # Storing unique date column list in data transform dictionary
            self.data_transform_dict['drop_unique_date_columns'] = drop_date_cols
            self._display_msg(msg='Dropping date features with all unique value:',
                              col_lst=drop_date_cols,
                              progress_bar=self.progress_bar)
        # Updated date column list after dropping irrelevant date columns
        self.date_column_list = [item for item in self.date_column_list if item not in drop_date_cols]

        if len(self.date_column_list) != 0:

            # List for storing newly generated date component features
            new_columns = self._fetch_date_component()
            self._display_msg(msg='List of newly generated features from existing date features:',
                              col_lst=new_columns,
                              progress_bar=self.progress_bar)
            # Dropping columns with all unique values or single value
            drop_cols = []
            for col in new_columns:
                distinct_rows = self.data.drop_duplicate(col).size
                if distinct_rows == self.data.shape[0]:
                    drop_cols.append(col)
                    self._display_msg(msg='Dropping features with all unique values:',
                                      col_lst=col,
                                      progress_bar=self.progress_bar)

                elif distinct_rows == 1:
                    drop_cols.append(col)
                    self._display_msg(msg='Dropping features with single value:',
                                      col_lst=col,
                                      progress_bar=self.progress_bar)

            # Dropping columns from drop_cols list
            if len(drop_cols) != 0:
                self.data = self.data.drop(drop_cols, axis=1)
                # Storing extract date component list for drop in data transform dictionary
                self.data_transform_dict['drop_extract_date_columns'] = drop_cols
                # Extracting all newly generated columns
                new_columns = [item for item in new_columns if item not in drop_cols]

            self._display_msg(msg='Updated list of newly generated features from existing date features:',
                              col_lst=new_columns,
                              progress_bar=self.progress_bar)

            self._display_msg(msg='Updated dataset sample after handling date features:',
                              data=self.data,
                              progress_bar=self.progress_bar)
        else:
            self._display_msg(inline_msg="No useful date feature found",
                              progress_bar=self.progress_bar)

    def _handle_date_columns(self):
        """
        DESCRIPTION:
            Function to handle date columns in dataset if any.
            Perform relevant transformation by extracting different components, i.e., Day, Month and Year.

        PARAMETERS:
            None

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> self._handle_date_columns()
        """
        self._display_msg(msg="Handling Date Features ...",
                          progress_bar=self.progress_bar,
                          show_data=True)
        start_time = time.time()

        self.date_column_list = [col for col, d_type in self.data._column_names_and_types
                                 if d_type in ["datetime.date", "datetime.datetime"]]

        if len(self.date_column_list) == 0:
            self._display_msg(inline_msg="Analysis Completed. Dataset does not contain any feature related to dates. No action needed.",
                              progress_bar=self.progress_bar)
        else:
            # Storing date column list in data transform dictionary
            self.data_transform_dict['date_columns'] = self.date_column_list
            self._handle_date_columns_helper()
            if self.persist:
                table_name = UtilFuncs._generate_temp_table_name(table_type=TeradataConstants.TERADATA_TABLE,
                                                                 gc_on_quit=False)
                self.data.to_sql(table_name)
            else:
                self.data.materialize()
            self.data_mapping['data_after_date_handling'] = self.data._table_name

        end_time = time.time()
        self._display_msg(msg="Total time to handle date features: {:.2f} sec".format(end_time - start_time),
                          progress_bar=self.progress_bar,
                          show_data=True)

    def _missing_count_per_column(self):
        """
        DESCRIPTION:
            Function finds and returns a dictionary containing list of columns
            with missing values.

        PARAMETERS:
            None

        RETURNS:
            dict, keys represent column names and
            values represent the missing value count for corresponding column.

        RAISES:
            None

        EXAMPLES:
            >>> missing_cols = self._missing_count_per_column()
        """

        # Removing rows with missing target column value
        if not self.cluster:
            self.data = self.data.dropna(subset=[self.target_column])

        params = {
            "data": self.data,
            "target_columns": self.data.columns,
            "persist": True,
            "display_table_name": False
        }

        obj = ColumnSummary(**params)

        # Adding transformed data containing table to garbage collector
        GarbageCollector._add_to_garbagecollector(obj.result._table_name)

        cols_miss_val = {}
        # Iterating over each row in the column summary result
        for row in obj.result.itertuples():
            # Checking if the third element of the row (missing values count) is greater than 0
            if row[3] > 0:
                # If so, add an entry to the 'cols_miss_val' dictionary
                # Key: column name (first element of the row)
                # Value: count of missing values in the column (third element of the row)
                cols_miss_val[row[0]] = row[3]

        return cols_miss_val

    def _handling_missing_value(self):
        """
        DESCRIPTION:
            Function detects the missing values in each feature of the dataset,
            then performs these operations based on condition:
                1. deleting rows from columns/feature
                2. dropping columns from dataset

        PARAMETERS:
            None

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> self._handling_missing_value()
        """
        self._display_msg(msg="Checking Missing values in dataset ...",
                          progress_bar=self.progress_bar,
                          show_data=True)
        start_time = time.time()

        # Flag for missing values
        msg_val_found = 0

        # num of rows
        d_size = self.data.shape[0]

        delete_rows = []
        drop_cols = []
        self.imputation_cols = {}

        cols_miss_val = self._missing_count_per_column()

        if len(cols_miss_val) != 0:
            self._display_msg(msg="Columns with their missing values:",
                              col_lst=cols_miss_val,
                              progress_bar=self.progress_bar)

            # Get distinct value in each column
            self._get_distinct_count()

            # Iterating over columns with missing values
            for col, val in cols_miss_val.items():

                # Drop col, if count of missing value > 60%
                if val > .6 * d_size:
                    drop_cols.append(col)
                    continue

                # For clustering tasks, all columns with missing values are sent directly to imputation
                if self.cluster:
                    self.imputation_cols[col] = val
                    continue

                if self.data_types[col] in ['float', 'int']:
                    corr_df = self.data[col].corr(self.data[self.target_column])
                    corr_val = self.data.assign(True, corr_=corr_df)
                    related = next(corr_val.itertuples())[0]

                    # Delete row, if count of missing value < 2% and
                    # Relation b/w target column and numeric column <= .25
                    if val < .02 * d_size and related <= .25:
                        delete_rows.append(col)
                        continue

                elif self.data_types[col] in ['str']:
                    # Delete row, if count of missing value < 4%
                    if val < .04 * d_size:
                        delete_rows.append(col)
                        continue
                    # Drop col, if unique count of column > 75%
                    elif self.counts_dict[f'count_{col}'] > .75 * (d_size - val):
                        drop_cols.append(col)
                        continue

                # Remaining column for imputation
                self.imputation_cols[col] = val
            # Storing columns with missing value for imputation in data transform dictionary
            self.data_transform_dict['imputation_columns'] = self.imputation_cols

            if len(delete_rows) != 0:
                rows = self.data.shape[0]
                self.data = self.data.dropna(subset=delete_rows)
                msg_val_found = 1
                self._display_msg(msg='Deleting rows of these columns for handling missing values:',
                                  col_lst=delete_rows,
                                  progress_bar=self.progress_bar)
                self._display_msg(msg=f'Sample of dataset after removing {rows-self.data.shape[0]} rows:',
                                  data=self.data,
                                  progress_bar=self.progress_bar)

            if len(drop_cols) != 0:
                self.data = self.data.drop(drop_cols, axis=1)
                msg_val_found = 1
                # Storing columns with missing value for drop in data transform dictionary
                self.data_transform_dict['drop_missing_columns'] = drop_cols
                self._display_msg(msg='Dropping these columns for handling missing values:',
                                  col_lst=drop_cols,
                                  progress_bar=self.progress_bar)
                self._display_msg(msg=f'Sample of dataset after removing {len(drop_cols)} columns:',
                                  data=self.data,
                                  progress_bar=self.progress_bar)

        if len(self.imputation_cols) == 0 and msg_val_found == 0:
            self._display_msg(inline_msg="Analysis Completed. No Missing Values Detected.",
                              progress_bar=self.progress_bar)

        end_time = time.time()
        self._display_msg(msg="Total time to find missing values in data: {:.2f} sec ".format(end_time - start_time),
                          progress_bar=self.progress_bar,
                          show_data=True)

    def _impute_helper(self):
        """
        DESCRIPTION:
            Function decides the imputation methods [mean/ median/ mode] for columns with missing values
            on the basis of skewness of column in the dataset.

        PARAMETERS:
            None

        RETURNS:
            A tuple containing,
                col_stat (name of columns with missing value)
                stat (imputation method for respective columns)

        RAISES:
            None

        EXAMPLES:
            >>> col_stat, stat = self._impute_helper()
        """
        col_stat = []
        stat = []

        # Converting o/p of skew() into dictionary with key as column name and value as skewness value
        df = self.data.skew()
        skew_data = next(df.itertuples())._asdict()

        # Iterating over columns with missing value
        for key, val in self.imputation_cols.items():

            col_stat.append(key)
            if self.data_types[key] in ['float', 'int', 'decimal.Decimal']:
                val = skew_data[f'skew_{key}']
                # Median imputation method, if abs(skewness value) > 1
                if abs(val) > 1:
                    stat.append('median')
                # Mean imputation method, if abs(skewness value) <= 1
                else:
                    stat.append('mean')
            # Mode imputation method, if categorical column
            elif self.data_types[key] in ['str']:
                stat.append('mode')

        self._display_msg(msg="Columns with their imputation method:",
                          col_lst=dict(zip(col_stat, stat)),
                          progress_bar=self.progress_bar)

        return col_stat, stat

    def _impute_missing_value(self):
        """
        DESCRIPTION:
            Function performs the imputation on columns/features with missing values in the dataset.

        PARAMETERS:
            None

        RETURNS:
            None

        RAISES:
            None

        EXAMPLES:
            >>> self._impute_missing_value()
        """

        start_time = time.time()
        self._display_msg(msg="Imputing Missing Values ...",
                          progress_bar=self.progress_bar,
                          show_data=True)

        if len(self.imputation_cols) != 0:

            # List of columns and imputation Method
            col_stat, stat = self._impute_helper()

            fit_obj = SimpleImputeFit(data=self.data,
                                      stats_columns=col_stat,
                                      stats=stat,
                                      volatile=self.volatile,
                                      persist=self.persist)

            # Storing fit object for imputation in data transform dictionary
            self.data_transform_dict['imputation_fit_object'] = fit_obj.output
            sm = SimpleImputeTransform(data=self.data,
                                       object=fit_obj,
                                       volatile=self.volatile,
                                       persist=self.persist)

            self.data = sm.result
            self.data_mapping['fit_simpleimpute_output'] = fit_obj.output_data._table_name
            self.data_mapping['fit_simpleimpute_result'] = fit_obj.output._table_name
            self.data_mapping['data_without_missing_values'] = self.data._table_name
            self._display_msg(msg="Sample of dataset after Imputation:",
                              data=self.data,
                              progress_bar=self.progress_bar)
        else:
            self._display_msg(inline_msg="Analysis completed. No imputation required.",
                              progress_bar=self.progress_bar)

        end_time = time.time()
        self._display_msg(msg="Time taken to perform imputation: {:.2f} sec ".format(end_time - start_time),
                          progress_bar=self.progress_bar,
                          show_data=True)

def _custom_handling_missing_value(self):
|
|
1026
|
+
"""
|
|
1027
|
+
DESCRIPTION:
|
|
1028
|
+
Function to perform customized missing value handling for features based on user input.
|
|
1029
|
+
|
|
1030
|
+
PARAMETERS:
|
|
1031
|
+
None
|
|
1032
|
+
|
|
1033
|
+
RETURNS:
|
|
1034
|
+
None
|
|
1035
|
+
|
|
1036
|
+
RAISES:
|
|
1037
|
+
TeradataMlException
|
|
1038
|
+
|
|
1039
|
+
EXAMPLES:
|
|
1040
|
+
>>> self._custom_handling_missing_value()
|
|
1041
|
+
"""
|
|
1042
|
+
# Fetching user input for performing missing value handling
|
|
1043
|
+
missing_handling_input = self.custom_data.get("MissingValueHandlingIndicator", False)
|
|
1044
|
+
|
|
1045
|
+
if missing_handling_input:
|
|
1046
|
+
# Fetching parameters required for performing
|
|
1047
|
+
missing_handling_param = self.custom_data.get("MissingValueHandlingParam", None)
|
|
1048
|
+
if missing_handling_param:
|
|
1049
|
+
# Fetching user input for different methods missing value handling
|
|
1050
|
+
drop_col_ind = missing_handling_param.get("DroppingColumnIndicator", False)
|
|
1051
|
+
drop_row_ind = missing_handling_param.get("DroppingRowIndicator", False)
|
|
1052
|
+
impute_ind = missing_handling_param.get("ImputeMissingIndicator", False)
|
|
1053
|
+
volatile = missing_handling_param.pop("volatile", False)
|
|
1054
|
+
persist = missing_handling_param.pop("persist", False)
|
|
1055
|
+
# Checking for user input if all methods indicator are false or not
|
|
1056
|
+
if not any([drop_col_ind, drop_row_ind, impute_ind]):
|
|
1057
|
+
self._display_msg(inline_msg="No method information provided for performing customized missing value handling. \
|
|
1058
|
+
AutoML will proceed with default missing value handling method.",
|
|
1059
|
+
progress_bar=self.progress_bar)
|
|
1060
|
+
|
|
1061
|
+
else:
|
|
1062
|
+
# Checking user input for dropping missing value columns
|
|
1063
|
+
if drop_col_ind:
|
|
1064
|
+
drop_col_list = missing_handling_param.get("DroppingColumnList", [])
|
|
1065
|
+
# Storing customcolumns with missing value for drop in data transform dictionary
|
|
1066
|
+
self.data_transform_dict["custom_drop_missing_columns"] = drop_col_list
|
|
1067
|
+
if len(drop_col_list):
|
|
1068
|
+
# Checking for column present in dataset or not
|
|
1069
|
+
_Validators._validate_dataframe_has_argument_columns(drop_col_list, "DroppingColumnList", self.data, "df")
|
|
1070
|
+
|
|
1071
|
+
self._display_msg(msg="Dropping these columns for handling customized missing value:",
|
|
1072
|
+
col_lst=drop_col_list,
|
|
1073
|
+
progress_bar=self.progress_bar)
|
|
1074
|
+
self.data = self.data.drop(drop_col_list, axis=1)
|
|
1075
|
+
else:
|
|
1076
|
+
self._display_msg(inline_msg="No information provided for dropping missing value containing columns.",
|
|
1077
|
+
progress_bar=self.progress_bar)
|
|
1078
|
+
|
|
1079
|
+
# Checking user input for dropping missing value rows
|
|
1080
|
+
if drop_row_ind:
|
|
1081
|
+
drop_row_list = missing_handling_param.get("DroppingRowList", [])
|
|
1082
|
+
if len(drop_row_list):
|
|
1083
|
+
# Checking for column present in dataset or not
|
|
1084
|
+
_Validators._validate_dataframe_has_argument_columns(drop_row_list, "DroppingRowList", self.data, "df")
|
|
1085
|
+
|
|
1086
|
+
self._display_msg(msg="Dropping missing rows in these columns for handling customized missing value:",
|
|
1087
|
+
col_lst=drop_row_list,
|
|
1088
|
+
progress_bar=self.progress_bar)
|
|
1089
|
+
self.data = self.data.dropna(subset = drop_row_list)
|
|
1090
|
+
else:
|
|
1091
|
+
self._display_msg(inline_msg="No information provided for dropping missing value containing rows.",
|
|
1092
|
+
progress_bar=self.progress_bar)
|
|
1093
|
+
# Checking user input for missing value imputation
|
|
1094
|
+
if impute_ind:
|
|
1095
|
+
stat_list = missing_handling_param.get("StatImputeList", None)
|
|
1096
|
+
stat_method = missing_handling_param.get("StatImputeMethod", None)
|
|
1097
|
+
literal_list = missing_handling_param.get("LiteralImputeList", None)
|
|
1098
|
+
literal_value = missing_handling_param.get("LiteralImputeValue", None)
|
|
1099
|
+
|
|
1100
|
+
# Checking for column present in dataset or not
|
|
1101
|
+
_Validators._validate_dataframe_has_argument_columns(stat_list, "StatImputeList", self.data, "df")
|
|
1102
|
+
|
|
1103
|
+
_Validators._validate_dataframe_has_argument_columns(literal_list, "LiteralImputeList", self.data, "df")
|
|
1104
|
+
|
|
1105
|
+
# Creating fit params
|
|
1106
|
+
fit_param = {
|
|
1107
|
+
"data" : self.data,
|
|
1108
|
+
"stats_columns" : stat_list,
|
|
1109
|
+
"stats" : stat_method,
|
|
1110
|
+
"literals_columns" : literal_list,
|
|
1111
|
+
"literals" : literal_value,
|
|
1112
|
+
"volatile" : volatile,
|
|
1113
|
+
"persist" : persist
|
|
1114
|
+
}
|
|
1115
|
+
# Fitting on dataset
|
|
1116
|
+
fit_obj = SimpleImputeFit(**fit_param)
|
|
1117
|
+
# Storing custom fit object for imputation in data transform dictionary
|
|
1118
|
+
self.data_transform_dict["custom_imputation_ind"] = True
|
|
1119
|
+
self.data_transform_dict["custom_imputation_fit_object"] = fit_obj.output
|
|
1120
|
+
# Creating transform params
|
|
1121
|
+
transform_param = {
|
|
1122
|
+
"data" : self.data,
|
|
1123
|
+
"object" : fit_obj.output,
|
|
1124
|
+
"persist" : True
|
|
1125
|
+
}
|
|
1126
|
+
# Disabling display table name if persist is True by default
|
|
1127
|
+
if not volatile and not persist:
|
|
1128
|
+
transform_param["display_table_name"] = False
|
|
1129
|
+
|
|
1130
|
+
if volatile:
|
|
1131
|
+
transform_param["volatile"] = True
|
|
1132
|
+
transform_param["persist"] = False
|
|
1133
|
+
# Updating dataset with transform result
|
|
1134
|
+
self.data = SimpleImputeTransform(**transform_param).result
|
|
1135
|
+
|
|
1136
|
+
self.data_mapping['fit_simpleimpute_output'] = fit_obj.output_data._table_name
|
|
1137
|
+
self.data_mapping['fit_simpleimpute_result'] = fit_obj.output._table_name
|
|
1138
|
+
self.data_mapping['data_without_missing_values'] = self.data._table_name
|
|
1139
|
+
|
|
1140
|
+
if not volatile and not persist:
|
|
1141
|
+
# Adding transformed data containing table to garbage collector
|
|
1142
|
+
GarbageCollector._add_to_garbagecollector(self.data._table_name)
|
|
1143
|
+
self._display_msg(msg="Updated dataset sample after performing customized missing value imputation:",
|
|
1144
|
+
data=self.data,
|
|
1145
|
+
progress_bar=self.progress_bar)
|
|
1146
|
+
else:
|
|
1147
|
+
self._display_msg(inline_msg="No information provided for performing customized missing value handling. \
|
|
1148
|
+
AutoML will proceed with default missing value handling method.",
|
|
1149
|
+
progress_bar=self.progress_bar)
|
|
1150
|
+
else:
|
|
1151
|
+
self._display_msg(inline_msg="Proceeding with default option for missing value imputation.",
|
|
1152
|
+
progress_bar=self.progress_bar)
|
|
1153
|
+
|
|
1154
|
+
# Proceeding with default method for handling remaining missing values
|
|
1155
|
+
self._display_msg(inline_msg="Proceeding with default option for handling remaining missing values.",
|
|
1156
|
+
progress_bar=self.progress_bar)
|
|
1157
|
+
self._handling_missing_value()
|
|
1158
|
+
self._impute_missing_value()
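# --- Illustrative sketch (not part of this class) ---
# A minimal, standalone example of the SimpleImputeFit/SimpleImputeTransform flow used
# above for customized missing value imputation. The table name "titanic" and the
# column names are assumptions for illustration; an active teradataml connection
# (create_context) is presumed.
from teradataml import DataFrame, SimpleImputeFit, SimpleImputeTransform

df = DataFrame("titanic")
fit = SimpleImputeFit(data=df,
                      stats_columns="age",          # impute with a statistic
                      stats="median",
                      literals_columns="embarked",  # impute with a literal value
                      literals="S")
imputed = SimpleImputeTransform(data=df, object=fit.output).result
# --- End of illustrative sketch ---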
|
|
1159
|
+
|
|
1160
|
+
def _bin_code_transformation(self):
|
|
1161
|
+
"""
|
|
1162
|
+
DESCRIPTION:
|
|
1163
|
+
Function to perform customized binning on features based on user input.
|
|
1164
|
+
|
|
1165
|
+
PARAMETERS:
|
|
1166
|
+
None
|
|
1167
|
+
|
|
1168
|
+
RETURNS:
|
|
1169
|
+
None
|
|
1170
|
+
|
|
1171
|
+
RAISES:
|
|
1172
|
+
TeradataMlException
|
|
1173
|
+
|
|
1174
|
+
EXAMPLES:
|
|
1175
|
+
>>> self._bin_code_transformation()
|
|
1176
|
+
"""
|
|
1177
|
+
# Fetching user input for performing bin code transformation.
|
|
1178
|
+
bin_code_input = self.custom_data.get("BincodeIndicator", False)
|
|
1179
|
+
|
|
1180
|
+
if bin_code_input:
|
|
1181
|
+
# Storing custom bin code transformation indicator in data transform dictionary
|
|
1182
|
+
self.data_transform_dict['custom_bincode_ind'] = True
|
|
1183
|
+
# Fetching list required for performing transformation.
|
|
1184
|
+
extracted_col = self.custom_data.get("BincodeParam", None)
|
|
1185
|
+
if not extracted_col:
|
|
1186
|
+
self._display_msg(inline_msg="BincodeParam is empty. Skipping customized bincode transformation.",
|
|
1187
|
+
progress_bar=self.progress_bar)
|
|
1188
|
+
else:
|
|
1189
|
+
# Creating lists for storing column and binning information for performing transformation
|
|
1190
|
+
equal_width_bin_list = []
|
|
1191
|
+
equal_width_bin_columns = []
|
|
1192
|
+
var_width_bin_list = []
|
|
1193
|
+
var_width_bin_columns = []
|
|
1194
|
+
volatile = extracted_col.pop("volatile", False)
|
|
1195
|
+
persist = extracted_col.pop("persist", False)
|
|
1196
|
+
|
|
1197
|
+
# Checking whether the columns are present in the dataset
|
|
1198
|
+
_Validators._validate_dataframe_has_argument_columns(list(extracted_col.keys()), "BincodeParam", self.data, "df")
|
|
1199
|
+
|
|
1200
|
+
for col,transform_val in extracted_col.items():
|
|
1201
|
+
# Fetching type of binning to be performed
|
|
1202
|
+
bin_trans_type = transform_val["Type"]
|
|
1203
|
+
# Fetching number of bins to be created
|
|
1204
|
+
num_bin = transform_val["NumOfBins"]
|
|
1205
|
+
# Checking for bin types and adding details into lists for binning
|
|
1206
|
+
if bin_trans_type == "Equal-Width":
|
|
1207
|
+
bins = num_bin
|
|
1208
|
+
equal_width_bin_list.append(bins)
|
|
1209
|
+
equal_width_bin_columns.append(col)
|
|
1210
|
+
elif bin_trans_type == "Variable-Width":
|
|
1211
|
+
var_width_bin_columns.append(col)
|
|
1212
|
+
bins = num_bin
|
|
1213
|
+
for i in range(1, bins+1):
|
|
1214
|
+
# Forming binning name as per expected input
|
|
1215
|
+
temp = "Bin_" + str(i)
|
|
1216
|
+
# Fetching required details for variable type binning
|
|
1217
|
+
minval = transform_val[temp]["min_value"]
|
|
1218
|
+
maxval = transform_val[temp]["max_value"]
|
|
1219
|
+
label = transform_val[temp]["label"]
|
|
1220
|
+
# Appending information of each bin
|
|
1221
|
+
var_width_bin_list.append({"ColumnName": col, "MinValue": minval, "MaxValue": maxval, "Label": label})
|
|
1222
|
+
# Checking column list for performing binning with Equal-Width.
|
|
1223
|
+
if len(equal_width_bin_columns) != 0:
|
|
1224
|
+
# Adding fit parameter for performing binning with Equal-Width.
|
|
1225
|
+
fit_params={
|
|
1226
|
+
"data" : self.data,
|
|
1227
|
+
"target_columns": equal_width_bin_columns,
|
|
1228
|
+
"method_type" : "Equal-Width",
|
|
1229
|
+
"nbins" : bins,
|
|
1230
|
+
"volatile" : volatile,
|
|
1231
|
+
"persist" : persist
|
|
1232
|
+
}
|
|
1233
|
+
eql_bin_code_fit = BincodeFit(**fit_params)
|
|
1234
|
+
# Storing fit object and column list for Equal-Width binning in data transform dictionary
|
|
1235
|
+
self.data_transform_dict['custom_eql_bincode_col'] = equal_width_bin_columns
|
|
1236
|
+
self.data_transform_dict['custom_eql_bincode_fit_object'] = eql_bin_code_fit.output
|
|
1237
|
+
# Extracting accumulate columns
|
|
1238
|
+
accumulate_columns = self._extract_list(self.data.columns, equal_width_bin_columns)
|
|
1239
|
+
# Adding transform parameters for performing binning with Equal-Width.
|
|
1240
|
+
eql_transform_params = {
|
|
1241
|
+
"data" : self.data,
|
|
1242
|
+
"object" : eql_bin_code_fit.output,
|
|
1243
|
+
"accumulate" : accumulate_columns,
|
|
1244
|
+
"persist" : True
|
|
1245
|
+
}
|
|
1246
|
+
# Disabling display table name if persist is True by default
|
|
1247
|
+
if not volatile and not persist:
|
|
1248
|
+
eql_transform_params["display_table_name"] = False
|
|
1249
|
+
|
|
1250
|
+
if volatile:
|
|
1251
|
+
eql_transform_params["volatile"] = True
|
|
1252
|
+
eql_transform_params["persist"] = False
|
|
1253
|
+
self.data = BincodeTransform(**eql_transform_params).result
|
|
1254
|
+
if not volatile and not persist:
|
|
1255
|
+
# Adding transformed data containing table to garbage collector
|
|
1256
|
+
GarbageCollector._add_to_garbagecollector(self.data._table_name)
|
|
1257
|
+
|
|
1258
|
+
self.data_mapping['fit_eql_width'] = eql_bin_code_fit.output._table_name
|
|
1259
|
+
self.data_mapping['eql_width_bincoded_data'] = self.data._table_name
|
|
1260
|
+
|
|
1261
|
+
self._display_msg(msg="Updated dataset sample after performing Equal-Width binning :-",
|
|
1262
|
+
data=self.data,
|
|
1263
|
+
progress_bar=self.progress_bar)
|
|
1264
|
+
else:
|
|
1265
|
+
self._display_msg(inline_msg="No information provided for Equal-Width Transformation.",
|
|
1266
|
+
progress_bar=self.progress_bar)
|
|
1267
|
+
|
|
1268
|
+
if len(var_width_bin_columns) != 0:
|
|
1269
|
+
# Creating pandas dataframe and then teradata dataframe for storing binning information
|
|
1270
|
+
var_bin_table = pd.DataFrame(var_width_bin_list, columns=["ColumnName", "MinValue", "MaxValue", "Label"])
|
|
1271
|
+
self._display_msg(msg="Variable-Width binning information:-",
|
|
1272
|
+
data=var_bin_table,
|
|
1273
|
+
progress_bar=self.progress_bar)
|
|
1274
|
+
copy_to_sql(df=var_bin_table, table_name="automl_bincode_var_fit", temporary=True)
|
|
1275
|
+
var_fit_input = DataFrame.from_table("automl_bincode_var_fit")
|
|
1276
|
+
fit_params = {
|
|
1277
|
+
"data" : self.data,
|
|
1278
|
+
"fit_data": var_fit_input,
|
|
1279
|
+
"fit_data_order_column" : ["MinValue", "MaxValue"],
|
|
1280
|
+
"target_columns": var_width_bin_columns,
|
|
1281
|
+
"minvalue_column" : "MinValue",
|
|
1282
|
+
"maxvalue_column" : "MaxValue",
|
|
1283
|
+
"label_column" : "Label",
|
|
1284
|
+
"method_type" : "Variable-Width",
|
|
1285
|
+
"label_prefix" : "label_prefix",
|
|
1286
|
+
"volatile" : volatile,
|
|
1287
|
+
"persist" : persist
|
|
1288
|
+
}
|
|
1289
|
+
var_bin_code_fit = BincodeFit(**fit_params)
|
|
1290
|
+
# Storing fit object and column list for Variable-Width binning in data transform dictionary
|
|
1291
|
+
self.data_transform_dict['custom_var_bincode_col'] = var_width_bin_columns
|
|
1292
|
+
self.data_transform_dict['custom_var_bincode_fit_object'] = var_bin_code_fit.output
|
|
1293
|
+
accumulate_columns = self._extract_list(self.data.columns, var_width_bin_columns)
|
|
1294
|
+
var_transform_params = {
|
|
1295
|
+
"data" : self.data,
|
|
1296
|
+
"object" : var_bin_code_fit.output,
|
|
1297
|
+
"object_order_column" : "TD_MinValue_BINFIT",
|
|
1298
|
+
"accumulate" : accumulate_columns,
|
|
1299
|
+
"persist" : True
|
|
1300
|
+
}
|
|
1301
|
+
# Disabling display table name if persist is True by default
|
|
1302
|
+
if not volatile and not persist:
|
|
1303
|
+
var_transform_params["display_table_name"] = False
|
|
1304
|
+
|
|
1305
|
+
if volatile:
|
|
1306
|
+
var_transform_params["volatile"] = True
|
|
1307
|
+
var_transform_params["persist"] = False
|
|
1308
|
+
self.data = BincodeTransform(**var_transform_params).result
|
|
1309
|
+
self.data_mapping['fit_var_width'] = var_bin_code_fit.output._table_name
|
|
1310
|
+
self.data_mapping['var_width_bincoded_data'] = self.data._table_name
|
|
1311
|
+
if not volatile and not persist:
|
|
1312
|
+
# Adding transformed data containing table to garbage collector
|
|
1313
|
+
GarbageCollector._add_to_garbagecollector(self.data._table_name)
|
|
1314
|
+
self._display_msg(msg="Updated dataset sample after performing Variable-Width binning:",
|
|
1315
|
+
data=self.data,
|
|
1316
|
+
progress_bar=self.progress_bar)
|
|
1317
|
+
else:
|
|
1318
|
+
self._display_msg(inline_msg="No information provided for Variable-Width Transformation.",
|
|
1319
|
+
progress_bar=self.progress_bar)
|
|
1320
|
+
else:
|
|
1321
|
+
self._display_msg(inline_msg="No information provided for Variable-Width Transformation.",
|
|
1322
|
+
progress_bar=self.progress_bar)
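# --- Illustrative sketch (not part of this class) ---
# Standalone example of the Equal-Width BincodeFit/BincodeTransform flow used in
# _bin_code_transformation above. The table name "titanic" and the column "age"
# are assumptions; an active teradataml connection is presumed.
from teradataml import DataFrame, BincodeFit, BincodeTransform

df = DataFrame("titanic")
fit = BincodeFit(data=df,
                 target_columns="age",
                 method_type="Equal-Width",
                 nbins=4)                        # split "age" into 4 equal-width bins
binned = BincodeTransform(data=df,
                          object=fit.output,
                          accumulate=[c for c in df.columns if c != "age"]).result
# --- End of illustrative sketch ---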
|
|
1323
|
+
|
|
1324
|
+
def _string_manipulation(self):
|
|
1325
|
+
"""
|
|
1326
|
+
DESCRIPTION:
|
|
1327
|
+
Function to perform customized string manipulations on categorical features based on user input.
|
|
1328
|
+
|
|
1329
|
+
PARAMETERS:
|
|
1330
|
+
None
|
|
1331
|
+
|
|
1332
|
+
RETURNS:
|
|
1333
|
+
None
|
|
1334
|
+
|
|
1335
|
+
RAISES:
|
|
1336
|
+
TeradataMlException
|
|
1337
|
+
|
|
1338
|
+
EXAMPLES:
|
|
1339
|
+
>>> self._string_manipulation()
|
|
1340
|
+
"""
|
|
1341
|
+
# Fetching user input for performing string manipulation.
|
|
1342
|
+
str_mnpl_input = self.custom_data.get("StringManipulationIndicator", False)
|
|
1343
|
+
# Checking user input for string manipulation on categorical features.
|
|
1344
|
+
if str_mnpl_input:
|
|
1345
|
+
# Storing custom string manipulation indicator in data transform dictionary
|
|
1346
|
+
self.data_transform_dict['custom_string_manipulation_ind'] = True
|
|
1347
|
+
# Fetching list required for performing operation.
|
|
1348
|
+
extracted_col = self.custom_data.get("StringManipulationParam", None).copy()
|
|
1349
|
+
if not extracted_col:
|
|
1350
|
+
self._display_msg(inline_msg="No information provided for performing string manipulation.",
|
|
1351
|
+
progress_bar=self.progress_bar)
|
|
1352
|
+
else:
|
|
1353
|
+
volatile = extracted_col.pop("volatile", False)
|
|
1354
|
+
persist = extracted_col.pop("persist", False)
|
|
1355
|
+
# Checking whether the columns are present in the dataset
|
|
1356
|
+
_Validators._validate_dataframe_has_argument_columns(list(extracted_col.keys()), "StringManipulationParam", self.data, "df")
|
|
1357
|
+
|
|
1358
|
+
for target_col,transform_val in extracted_col.items():
|
|
1359
|
+
self.data = self._str_method_mapping(target_col, transform_val)
|
|
1360
|
+
# Storing custom string manipulation parameters in data transform dictionary
|
|
1361
|
+
self.data_transform_dict['custom_string_manipulation_param'] = extracted_col
|
|
1362
|
+
|
|
1363
|
+
self._display_msg(msg="Updated dataset sample after performing string manipulation:",
|
|
1364
|
+
data=self.data,
|
|
1365
|
+
progress_bar=self.progress_bar)
|
|
1366
|
+
else:
|
|
1367
|
+
self._display_msg(inline_msg="Skipping customized string manipulation.",
|
|
1368
|
+
progress_bar=self.progress_bar)
|
|
1369
|
+
|
|
1370
|
+
def _str_method_mapping(self,
|
|
1371
|
+
target_col,
|
|
1372
|
+
transform_val):
|
|
1373
|
+
"""
|
|
1374
|
+
DESCRIPTION:
|
|
1375
|
+
Function to map customized parameters according to passed method and
|
|
1376
|
+
perform string manipulation on categorical features.
|
|
1377
|
+
|
|
1378
|
+
PARAMETERS:
|
|
1379
|
+
target_col:
|
|
1380
|
+
Required Argument.
|
|
1381
|
+
Specifies feature for applying string manipulation.
|
|
1382
|
+
Types: str
|
|
1383
|
+
|
|
1384
|
+
transform_val:
|
|
1385
|
+
Required Argument.
|
|
1386
|
+
Specifies different parameters required for applying string manipulation.
|
|
1387
|
+
Types: dict
|
|
1388
|
+
|
|
1389
|
+
RETURNS:
|
|
1390
|
+
DataFrame containing transformed data after applying string manipulation.
|
|
1391
|
+
|
|
1392
|
+
RAISES:
|
|
1393
|
+
None
|
|
1394
|
+
|
|
1395
|
+
EXAMPLES:
|
|
1396
|
+
>>> transform_val = {"StringOperation": "upper"}
|
|
1397
|
+
>>> self._str_method_mapping(target_col="text_col", transform_val=transform_val)
|
|
1398
|
+
"""
|
|
1399
|
+
# Creating list of features for accumulating while performing string manipulation on certain features
|
|
1400
|
+
accumulate_columns = self._extract_list(self.data.columns, [target_col])
|
|
1401
|
+
|
|
1402
|
+
# Fetching required parameters from json object
|
|
1403
|
+
string_operation = transform_val["StringOperation"]
|
|
1404
|
+
|
|
1405
|
+
# Setting volatile and persist parameters for performing string manipulation
|
|
1406
|
+
volatile, persist = self._get_generic_parameters(func_indicator="StringManipulationIndicator",
|
|
1407
|
+
param_name="StringManipulationParam")
|
|
1408
|
+
|
|
1409
|
+
# Storing general parameters for performing string transformation
|
|
1410
|
+
fit_params = {
|
|
1411
|
+
"data" : self.data,
|
|
1412
|
+
"target_columns" : target_col,
|
|
1413
|
+
"string_operation" : string_operation,
|
|
1414
|
+
"accumulate" : accumulate_columns,
|
|
1415
|
+
"inplace" : True,
|
|
1416
|
+
"persist" : True
|
|
1417
|
+
}
|
|
1418
|
+
# Disabling display table name if persist is True by default
|
|
1419
|
+
if not volatile and not persist:
|
|
1420
|
+
fit_params["display_table_name"] = False
|
|
1421
|
+
|
|
1422
|
+
if volatile:
|
|
1423
|
+
fit_params["volatile"] = True
|
|
1424
|
+
fit_params["persist"] = False
|
|
1425
|
+
|
|
1426
|
+
# Adding additional parameters based on string operation type
|
|
1427
|
+
if string_operation in ["StringCon", "StringTrim"]:
|
|
1428
|
+
string_argument = transform_val["String"]
|
|
1429
|
+
fit_params = {**fit_params,
|
|
1430
|
+
"string" : string_argument}
|
|
1431
|
+
elif string_operation == "StringPad":
|
|
1432
|
+
string_argument = transform_val["String"]
|
|
1433
|
+
string_length = transform_val["StringLength"]
|
|
1434
|
+
fit_params = {**fit_params,
|
|
1435
|
+
"string" : string_argument,
|
|
1436
|
+
"string_length" : string_length}
|
|
1437
|
+
elif string_operation == "Substring":
|
|
1438
|
+
string_index = transform_val["StartIndex"]
|
|
1439
|
+
string_length = transform_val["StringLength"]
|
|
1440
|
+
fit_params = {**fit_params,
|
|
1441
|
+
"start_index" : string_index,
|
|
1442
|
+
"string_length" : string_length}
|
|
1443
|
+
|
|
1444
|
+
# returning dataset after performing string manipulation
|
|
1445
|
+
transform_output = StrApply(**fit_params).result
|
|
1446
|
+
if not volatile and not persist:
|
|
1447
|
+
# Adding transformed data containing table to garbage collector
|
|
1448
|
+
GarbageCollector._add_to_garbagecollector(transform_output._table_name)
|
|
1449
|
+
self.data_mapping['string_manipulated_data'] = transform_output._table_name
|
|
1450
|
+
return transform_output
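# --- Illustrative sketch (not part of this class) ---
# Standalone example of the StrApply call assembled by _str_method_mapping above.
# The table name "titanic", the column "name" and the "StringTrim" operation with
# its "string" argument are assumptions; an active teradataml connection is presumed.
from teradataml import DataFrame, StrApply

df = DataFrame("titanic")
trimmed = StrApply(data=df,
                   target_columns="name",
                   string_operation="StringTrim",
                   string=" ",                   # characters to trim (assumed)
                   accumulate=[c for c in df.columns if c != "name"],
                   inplace=True).result
# --- End of illustrative sketch ---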
|
|
1451
|
+
|
|
1452
|
+
def _one_hot_encoding(self,
|
|
1453
|
+
one_hot_columns,
|
|
1454
|
+
unique_counts):
|
|
1455
|
+
"""
|
|
1456
|
+
DESCRIPTION:
|
|
1457
|
+
Function performs one hot encoding on categorical columns/features in the dataset.
|
|
1458
|
+
|
|
1459
|
+
PARAMETERS:
|
|
1460
|
+
one_hot_columns:
|
|
1461
|
+
Required Argument.
|
|
1462
|
+
Specifies the categorical columns for which one hot encoding will be performed.
|
|
1463
|
+
Types: str or list of strings (str)
|
|
1464
|
+
|
|
1465
|
+
unique_counts:
|
|
1466
|
+
Required Argument.
|
|
1467
|
+
Specifies the unique counts in the categorical columns.
|
|
1468
|
+
Types: int or list of integer (int)
|
|
1469
|
+
|
|
1470
|
+
RETURNS:
|
|
1471
|
+
None
|
|
1472
|
+
|
|
1473
|
+
RAISES:
|
|
1474
|
+
None
|
|
1475
|
+
|
|
1476
|
+
EXAMPLES:
|
|
1477
|
+
>>> self._one_hot_encoding(one_hot_columns=["category1"], unique_counts=[5])
|
|
1478
|
+
"""
|
|
1479
|
+
# TD function will add an extra <column>_other column during one hot encoding, so
|
|
1480
|
+
# initializing this list to remove those extra columns
|
|
1481
|
+
drop_lst = [ele + "_other" for ele in one_hot_columns]
|
|
1482
|
+
|
|
1483
|
+
# Setting volatile and persist parameters for performing encoding
|
|
1484
|
+
volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
|
|
1485
|
+
param_name="CategoricalEncodingParam")
|
|
1486
|
+
|
|
1487
|
+
# Adding fit parameters for performing encoding
|
|
1488
|
+
fit_params = {
|
|
1489
|
+
"data" : self.data,
|
|
1490
|
+
"approach" : "auto",
|
|
1491
|
+
"is_input_dense" : True,
|
|
1492
|
+
"target_column" : one_hot_columns,
|
|
1493
|
+
"category_counts" : unique_counts,
|
|
1494
|
+
"other_column" : "other",
|
|
1495
|
+
"volatile" : volatile,
|
|
1496
|
+
"persist" : persist
|
|
1497
|
+
}
|
|
1498
|
+
# Performing one hot encoding fit on target columns
|
|
1499
|
+
fit_obj = OneHotEncodingFit(**fit_params)
|
|
1500
|
+
# Storing indicator, fit object and column drop list for one hot encoding in data transform dictionary
|
|
1501
|
+
self.data_transform_dict['one_hot_encoding_ind'] = True
|
|
1502
|
+
self.data_transform_dict['one_hot_encoding_fit_obj'].update({self.one_hot_obj_count : fit_obj.result})
|
|
1503
|
+
self.data_transform_dict['one_hot_encoding_drop_list'].extend(drop_lst)
|
|
1504
|
+
self.one_hot_obj_count = self.one_hot_obj_count + 1
|
|
1505
|
+
# Adding transform parameters for performing encoding
|
|
1506
|
+
transform_params = {
|
|
1507
|
+
"data" : self.data,
|
|
1508
|
+
"object" : fit_obj.result,
|
|
1509
|
+
"is_input_dense" : True,
|
|
1510
|
+
"persist" : True
|
|
1511
|
+
}
|
|
1512
|
+
# Disabling display table name if persist is True by default
|
|
1513
|
+
if not volatile and not persist:
|
|
1514
|
+
transform_params["display_table_name"] = False
|
|
1515
|
+
|
|
1516
|
+
# Setting persist to False if volatile is True
|
|
1517
|
+
if volatile:
|
|
1518
|
+
transform_params["volatile"] = True
|
|
1519
|
+
transform_params["persist"] = False
|
|
1520
|
+
|
|
1521
|
+
# Performing one hot encoding transformation
|
|
1522
|
+
transform_output = OneHotEncodingTransform(**transform_params).result
|
|
1523
|
+
|
|
1524
|
+
if not volatile and not persist:
|
|
1525
|
+
# Adding transformed data containing table to garbage collector
|
|
1526
|
+
GarbageCollector._add_to_garbagecollector(transform_output._table_name)
|
|
1527
|
+
self.data = transform_output.drop(drop_lst, axis=1)
|
|
1528
|
+
self.data.materialize()
|
|
1529
|
+
self.data_mapping['one_hot_encoded_data'] = transform_output._table_name
|
|
1530
|
+
self.data_mapping['fit_ohe_result'] = fit_obj.result._table_name
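# --- Illustrative sketch (not part of this class) ---
# Standalone example of the OneHotEncodingFit/OneHotEncodingTransform flow used in
# _one_hot_encoding above, including the drop of the generated "<column>_other"
# column. The table name "titanic" and the column "sex" are assumptions.
from teradataml import DataFrame, OneHotEncodingFit, OneHotEncodingTransform

df = DataFrame("titanic")
fit = OneHotEncodingFit(data=df,
                        approach="auto",
                        is_input_dense=True,
                        target_column="sex",
                        category_counts=2,
                        other_column="other")
encoded = OneHotEncodingTransform(data=df,
                                  object=fit.result,
                                  is_input_dense=True).result
encoded = encoded.drop(["sex_other"], axis=1)    # remove the extra "_other" column
# --- End of illustrative sketch ---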
|
|
1531
|
+
|
|
1532
|
+
def _ordinal_encoding(self,
|
|
1533
|
+
ordinal_columns):
|
|
1534
|
+
"""
|
|
1535
|
+
DESCRIPTION:
|
|
1536
|
+
Function performs ordinal encoding on categorical columns or features in the dataset.
|
|
1537
|
+
|
|
1538
|
+
PARAMETERS:
|
|
1539
|
+
ordinal_columns:
|
|
1540
|
+
Required Argument.
|
|
1541
|
+
Specifies the categorical columns for which ordinal encoding will be performed.
|
|
1542
|
+
Types: str or list of strings (str)
|
|
1543
|
+
|
|
1544
|
+
RETURNS:
|
|
1545
|
+
None
|
|
1546
|
+
|
|
1547
|
+
RAISES:
|
|
1548
|
+
None
|
|
1549
|
+
|
|
1550
|
+
EXAMPLES:
|
|
1551
|
+
>>> self._ordinal_encoding(ordinal_columns=["category1", "category2"])
|
|
1552
|
+
"""
|
|
1553
|
+
# Setting volatile and persist parameters for performing encoding
|
|
1554
|
+
volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
|
|
1555
|
+
param_name="CategoricalEncodingParam")
|
|
1556
|
+
|
|
1557
|
+
# Adding fit parameters for performing encoding
|
|
1558
|
+
fit_params = {
|
|
1559
|
+
"data" : self.data,
|
|
1560
|
+
"target_column" : ordinal_columns,
|
|
1561
|
+
"volatile" : volatile,
|
|
1562
|
+
"persist" : persist
|
|
1563
|
+
}
|
|
1564
|
+
# Performing ordinal encoding fit on target columns
|
|
1565
|
+
ord_fit_obj = OrdinalEncodingFit(**fit_params)
|
|
1566
|
+
# Storing fit object and column list for ordinal encoding in data transform dictionary
|
|
1567
|
+
if ordinal_columns[0] != self.target_column:
|
|
1568
|
+
self.data_transform_dict["custom_ord_encoding_fit_obj"] = ord_fit_obj.result
|
|
1569
|
+
self.data_transform_dict['custom_ord_encoding_col'] = ordinal_columns
|
|
1570
|
+
else:
|
|
1571
|
+
self.data_transform_dict['target_col_encode_ind'] = True
|
|
1572
|
+
self.data_transform_dict['target_col_ord_encoding_fit_obj'] = ord_fit_obj.result
|
|
1573
|
+
# Extracting accumulate columns
|
|
1574
|
+
accumulate_columns = self._extract_list(self.data.columns, ordinal_columns)
|
|
1575
|
+
# Adding transform parameters for performing encoding
|
|
1576
|
+
transform_params = {
|
|
1577
|
+
"data" : self.data,
|
|
1578
|
+
"object" : ord_fit_obj.result,
|
|
1579
|
+
"accumulate" : accumulate_columns,
|
|
1580
|
+
"persist" : True
|
|
1581
|
+
}
|
|
1582
|
+
# Disabling display table name if persist is True by default
|
|
1583
|
+
if not volatile and not persist:
|
|
1584
|
+
transform_params["display_table_name"] = False
|
|
1585
|
+
|
|
1586
|
+
# Setting persist to False if volatile is True
|
|
1587
|
+
if volatile:
|
|
1588
|
+
transform_params["volatile"] = True
|
|
1589
|
+
transform_params["persist"] = False
|
|
1590
|
+
# Performing ordinal encoding transformation
|
|
1591
|
+
self.data = OrdinalEncodingTransform(**transform_params).result
|
|
1592
|
+
|
|
1593
|
+
if not volatile and not persist:
|
|
1594
|
+
# Adding transformed data containing table to garbage collector
|
|
1595
|
+
GarbageCollector._add_to_garbagecollector(self.data._table_name)
|
|
1596
|
+
|
|
1597
|
+
self.data_mapping['fit_ordinal_output'] = ord_fit_obj.output_data._table_name
|
|
1598
|
+
self.data_mapping['fit_ordinal_result'] = ord_fit_obj.result._table_name
|
|
1599
|
+
self.data_mapping['ordinal_encoded_data'] = self.data._table_name
|
|
1600
|
+
|
|
1601
|
+
if len(ordinal_columns) == 1 and ordinal_columns[0] == self.target_column:
|
|
1602
|
+
self.target_label = ord_fit_obj
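# --- Illustrative sketch (not part of this class) ---
# Standalone example of the OrdinalEncodingFit/OrdinalEncodingTransform flow used in
# _ordinal_encoding above. The table name "titanic" and the column "cabin" are
# assumptions; an active teradataml connection is presumed.
from teradataml import DataFrame, OrdinalEncodingFit, OrdinalEncodingTransform

df = DataFrame("titanic")
fit = OrdinalEncodingFit(data=df, target_column="cabin")
encoded = OrdinalEncodingTransform(data=df,
                                   object=fit.result,
                                   accumulate=[c for c in df.columns if c != "cabin"]).result
# --- End of illustrative sketch ---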
|
|
1603
|
+
|
|
1604
|
+
def _target_encoding(self,
|
|
1605
|
+
target_encoding_list):
|
|
1606
|
+
"""
|
|
1607
|
+
DESCRIPTION:
|
|
1608
|
+
Function performs target encoding on categorical columns/features in the dataset.
|
|
1609
|
+
|
|
1610
|
+
PARAMETERS:
|
|
1611
|
+
target_encoding_list:
|
|
1612
|
+
Required Argument.
|
|
1613
|
+
Specifies the categorical columns for which target encoding will be performed.
|
|
1614
|
+
Types: str or list of strings (str)
|
|
1615
|
+
|
|
1616
|
+
RETURNS:
|
|
1617
|
+
None
|
|
1618
|
+
|
|
1619
|
+
RAISES:
|
|
1620
|
+
TeradataMlException
|
|
1621
|
+
|
|
1622
|
+
EXAMPLES:
|
|
1623
|
+
>>> target_dict = {"category": {"encoder_method": "mean", "response_column": "target"}}
|
|
1624
|
+
>>> self._target_encoding(target_encoding_list=target_dict)
|
|
1625
|
+
"""
|
|
1626
|
+
# Fetching all columns on which target encoding will be performed.
|
|
1627
|
+
target_columns = list(target_encoding_list.keys())
|
|
1628
|
+
# Checking whether the columns are present in the dataset
|
|
1629
|
+
_Validators._validate_dataframe_has_argument_columns(target_columns, "TargetEncodingList", self.data, "df")
|
|
1630
|
+
# Finding distinct values and counts for columns.
|
|
1631
|
+
cat_sum = CategoricalSummary(data=self.data,
|
|
1632
|
+
target_columns=target_columns)
|
|
1633
|
+
category_data = cat_sum.result.groupby("ColumnName").count()
|
|
1634
|
+
category_data = category_data.assign(drop_columns=True,
|
|
1635
|
+
ColumnName=category_data.ColumnName,
|
|
1636
|
+
CategoryCount=category_data.count_DistinctValue)
|
|
1637
|
+
# Storing indicator and fit object for target encoding in data transform dictionary
|
|
1638
|
+
self.data_transform_dict["custom_target_encoding_ind"] = True
|
|
1639
|
+
self.data_transform_dict["custom_target_encoding_fit_obj"] = {}
|
|
1640
|
+
|
|
1641
|
+
# Setting volatile and persist parameters for performing encoding
|
|
1642
|
+
volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
|
|
1643
|
+
param_name="CategoricalEncodingParam")
|
|
1644
|
+
|
|
1645
|
+
# Fetching required argument for performing target encoding
|
|
1646
|
+
for col,transform_val in target_encoding_list.items():
|
|
1647
|
+
encoder_method = transform_val["encoder_method"]
|
|
1648
|
+
response_column = transform_val["response_column"]
|
|
1649
|
+
# Adding fit parameters for performing encoding
|
|
1650
|
+
fit_params = {
|
|
1651
|
+
"data" : self.data,
|
|
1652
|
+
"category_data" : category_data,
|
|
1653
|
+
"encoder_method" : encoder_method,
|
|
1654
|
+
"target_columns" : col,
|
|
1655
|
+
"response_column" : response_column,
|
|
1656
|
+
"default_values": -1,
|
|
1657
|
+
"volatile" : volatile,
|
|
1658
|
+
"persist" : persist
|
|
1659
|
+
}
|
|
1660
|
+
if encoder_method == "CBM_DIRICHLET":
|
|
1661
|
+
num_distinct_responses=transform_val["num_distinct_responses"]
|
|
1662
|
+
fit_params = {**fit_params,
|
|
1663
|
+
"num_distinct_responses" : num_distinct_responses}
|
|
1664
|
+
# Performing target encoding fit on target columns
|
|
1665
|
+
tar_fit_obj = TargetEncodingFit(**fit_params)
|
|
1666
|
+
# Storing each column fit object for target encoding in data transform dictionary
|
|
1667
|
+
self.data_transform_dict["custom_target_encoding_fit_obj"].update({col : tar_fit_obj.result})
|
|
1668
|
+
# Extracting accumulate columns
|
|
1669
|
+
accumulate_columns = self._extract_list(self.data.columns, [col])
|
|
1670
|
+
# Adding transform parameters for performing encoding
|
|
1671
|
+
transform_params = {
|
|
1672
|
+
"data" : self.data,
|
|
1673
|
+
"object" : tar_fit_obj,
|
|
1674
|
+
"accumulate" : accumulate_columns,
|
|
1675
|
+
"persist" : True
|
|
1676
|
+
}
|
|
1677
|
+
|
|
1678
|
+
# Disabling display table name if persist is True by default
|
|
1679
|
+
if not volatile and not persist:
|
|
1680
|
+
transform_params["display_table_name"] = False
|
|
1681
|
+
|
|
1682
|
+
if volatile:
|
|
1683
|
+
transform_params["volatile"] = True
|
|
1684
|
+
transform_params["persist"] = False
|
|
1685
|
+
# Performing target encoding transformation
|
|
1686
|
+
self.data = TargetEncodingTransform(**transform_params).result
|
|
1687
|
+
if not volatile and not persist:
|
|
1688
|
+
# Adding transformed data containing table to garbage collector
|
|
1689
|
+
GarbageCollector._add_to_garbagecollector(self.data._table_name)
|
|
1690
|
+
self.data_mapping[f'fit_{col}_target_output'] = tar_fit_obj.output_data._table_name
|
|
1691
|
+
self.data_mapping[f'fit_{col}_target_result'] = tar_fit_obj.result._table_name
|
|
1692
|
+
self.data_mapping[f'{col}_target_encoded_data'] = self.data._table_name
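# --- Illustrative sketch (not part of this class) ---
# Standalone example of the TargetEncodingFit/TargetEncodingTransform flow used in
# _target_encoding above, including the CategoricalSummary-derived category counts.
# The table name "titanic", the columns and the "CBM_BETA" encoder method are assumptions.
from teradataml import (DataFrame, CategoricalSummary,
                        TargetEncodingFit, TargetEncodingTransform)

df = DataFrame("titanic")
cat_sum = CategoricalSummary(data=df, target_columns="cabin")
category_data = cat_sum.result.groupby("ColumnName").count()
category_data = category_data.assign(drop_columns=True,
                                     ColumnName=category_data.ColumnName,
                                     CategoryCount=category_data.count_DistinctValue)
fit = TargetEncodingFit(data=df,
                        category_data=category_data,
                        encoder_method="CBM_BETA",
                        target_columns="cabin",
                        response_column="survived",
                        default_values=-1)
encoded = TargetEncodingTransform(data=df,
                                  object=fit,
                                  accumulate=[c for c in df.columns if c != "cabin"]).result
# --- End of illustrative sketch ---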
|
|
1693
|
+
|
|
1694
|
+
def _encoding_categorical_columns(self):
|
|
1695
|
+
"""
|
|
1696
|
+
DESCRIPTION:
|
|
1697
|
+
Function detects the categorical columns in the dataset and performs encoding on them.
|
|
1698
|
+
|
|
1699
|
+
PARAMETERS:
|
|
1700
|
+
None
|
|
1701
|
+
|
|
1702
|
+
RETURNS:
|
|
1703
|
+
None
|
|
1704
|
+
|
|
1705
|
+
RAISES:
|
|
1706
|
+
None
|
|
1707
|
+
|
|
1708
|
+
EXAMPLES:
|
|
1709
|
+
>>> self._encoding_categorical_columns()
|
|
1710
|
+
"""
|
|
1711
|
+
self._display_msg(msg="Performing encoding for categorical columns ...",
|
|
1712
|
+
progress_bar=self.progress_bar,
|
|
1713
|
+
show_data=True)
|
|
1714
|
+
start_time = time.time()
|
|
1715
|
+
|
|
1716
|
+
ohe_col = []
|
|
1717
|
+
unique_count = []
|
|
1718
|
+
|
|
1719
|
+
# List of columns before one hot
|
|
1720
|
+
col_bf_ohe = self.data.columns
|
|
1721
|
+
|
|
1722
|
+
# Get distinct value in each column
|
|
1723
|
+
self._get_distinct_count()
|
|
1724
|
+
|
|
1725
|
+
# Detecting categorical columns with their unique counts
|
|
1726
|
+
for col, d_type in self.data._column_names_and_types:
|
|
1727
|
+
if d_type in ['str']:
|
|
1728
|
+
ohe_col.append(col)
|
|
1729
|
+
unique_count.append(self.counts_dict[f'count_{col}'])
|
|
1730
|
+
|
|
1731
|
+
if len(ohe_col) != 0:
|
|
1732
|
+
self._one_hot_encoding(ohe_col, unique_count)
|
|
1733
|
+
|
|
1734
|
+
self._display_msg(msg="ONE HOT Encoding these Columns:",
|
|
1735
|
+
col_lst=ohe_col,
|
|
1736
|
+
progress_bar=self.progress_bar)
|
|
1737
|
+
self._display_msg(msg="Sample of dataset after performing one hot encoding:",
|
|
1738
|
+
data=self.data,
|
|
1739
|
+
progress_bar=self.progress_bar)
|
|
1740
|
+
else:
|
|
1741
|
+
self._display_msg(inline_msg="Analysis completed. No categorical columns were found.",
|
|
1742
|
+
progress_bar=self.progress_bar)
|
|
1743
|
+
|
|
1744
|
+
# List of columns after one hot
|
|
1745
|
+
col_af_ohe = self.data.columns
|
|
1746
|
+
|
|
1747
|
+
# List of excluded columns from outlier processing and scaling
|
|
1748
|
+
self.excluded_cols= self._extract_list(col_af_ohe, col_bf_ohe)
|
|
1749
|
+
|
|
1750
|
+
end_time = time.time()
|
|
1751
|
+
self._display_msg(msg="Time taken to encode the columns: {:.2f} sec".format( end_time - start_time),
|
|
1752
|
+
progress_bar=self.progress_bar,
|
|
1753
|
+
show_data=True)
|
|
1754
|
+
|
|
1755
|
+
def _custom_categorical_encoding(self):
|
|
1756
|
+
"""
|
|
1757
|
+
DESCRIPTION:
|
|
1758
|
+
Function to perform specific encoding on the categorical columns based on user input.
|
|
1759
|
+
If validation fails, default encoding is performed on all remaining categorical columns.
|
|
1760
|
+
|
|
1761
|
+
PARAMETERS:
|
|
1762
|
+
None
|
|
1763
|
+
|
|
1764
|
+
RETURNS:
|
|
1765
|
+
None
|
|
1766
|
+
|
|
1767
|
+
RAISES:
|
|
1768
|
+
TeradataMlException
|
|
1769
|
+
|
|
1770
|
+
EXAMPLES:
|
|
1771
|
+
>>> self._custom_categorical_encoding()
|
|
1772
|
+
"""
|
|
1773
|
+
self._display_msg(msg="Starting Customized Categorical Feature Encoding ...",
|
|
1774
|
+
progress_bar=self.progress_bar)
|
|
1775
|
+
cat_end_input = self.custom_data.get("CategoricalEncodingIndicator", False)
|
|
1776
|
+
# Checking user input for categorical encoding
|
|
1777
|
+
if cat_end_input:
|
|
1778
|
+
# Storing custom categorical encoding indicator in data transform dictionary
|
|
1779
|
+
self.data_transform_dict["custom_categorical_encoding_ind"] = True
|
|
1780
|
+
# Fetching user input list for performing categorical encoding
|
|
1781
|
+
encoding_list = self.custom_data.get("CategoricalEncodingParam", None).copy()
|
|
1782
|
+
if encoding_list:
|
|
1783
|
+
volatile = encoding_list.pop("volatile", False)
|
|
1784
|
+
persist = encoding_list.pop("persist", False)
|
|
1785
|
+
onehot_encode_ind = encoding_list.get("OneHotEncodingIndicator", False)
|
|
1786
|
+
ordinal_encode_ind = encoding_list.get("OrdinalEncodingIndicator", False)
|
|
1787
|
+
target_encode_ind = encoding_list.get("TargetEncodingIndicator", False)
|
|
1788
|
+
# Checking if any of categorical encoding technique indicator
|
|
1789
|
+
if not any([onehot_encode_ind, ordinal_encode_ind, target_encode_ind]):
|
|
1790
|
+
self._display_msg(inline_msg="No information provided for any type of customized categorical encoding techniques. AutoML will proceed with default encoding technique.",
|
|
1791
|
+
progress_bar=self.progress_bar)
|
|
1792
|
+
else:
|
|
1793
|
+
if onehot_encode_ind:
|
|
1794
|
+
unique_count = []
|
|
1795
|
+
ohe_list = encoding_list.get("OneHotEncodingList", None)
|
|
1796
|
+
# Checking for empty list
|
|
1797
|
+
if not ohe_list:
|
|
1798
|
+
self._display_msg(inline_msg="No information provided for customized one hot encoding technique.",
|
|
1799
|
+
progress_bar=self.progress_bar)
|
|
1800
|
+
else:
|
|
1801
|
+
# Checking whether the columns are present in the dataset
|
|
1802
|
+
_Validators._validate_dataframe_has_argument_columns(ohe_list, "OneHotEncodingList", self.data, "df")
|
|
1803
|
+
|
|
1804
|
+
# Keeping track of existing columns before applying one hot encoding
|
|
1805
|
+
col_bf_ohe = self.data.columns
|
|
1806
|
+
# Detecting categorical columns with their unique counts
|
|
1807
|
+
for col in ohe_list:
|
|
1808
|
+
unique_count.append(self.data.drop_duplicate(col).size)
|
|
1809
|
+
# Performing one hot encoding
|
|
1810
|
+
self._one_hot_encoding(ohe_list, unique_count)
|
|
1811
|
+
# Keeping track of new columns after applying one hot encoding
|
|
1812
|
+
col_af_ohe = self.data.columns
|
|
1813
|
+
# Fetching list of columns on which outlier processing should not be applied
|
|
1814
|
+
self.excluded_cols.extend(self._extract_list(col_af_ohe, col_bf_ohe))
|
|
1815
|
+
|
|
1816
|
+
self._display_msg(msg="Updated dataset sample after performing one hot encoding:",
|
|
1817
|
+
data=self.data,
|
|
1818
|
+
progress_bar=self.progress_bar)
|
|
1819
|
+
|
|
1820
|
+
if ordinal_encode_ind:
|
|
1821
|
+
ord_list = encoding_list.get("OrdinalEncodingList", None)
|
|
1822
|
+
# Checking for empty list
|
|
1823
|
+
if not ord_list:
|
|
1824
|
+
self._display_msg(inline_msg="No information provided for customized ordinal encoding technique.",
|
|
1825
|
+
progress_bar=self.progress_bar)
|
|
1826
|
+
else:
|
|
1827
|
+
# Checking whether the columns are present in the dataset
|
|
1828
|
+
_Validators._validate_dataframe_has_argument_columns(ord_list, "OrdinalEncodingList", self.data, "df")
|
|
1829
|
+
|
|
1830
|
+
# Performing ordinal encoding
|
|
1831
|
+
self._ordinal_encoding(ord_list)
|
|
1832
|
+
self._display_msg(msg="Updated dataset sample after performing ordinal encoding:",
|
|
1833
|
+
data=self.data,
|
|
1834
|
+
progress_bar=self.progress_bar)
|
|
1835
|
+
|
|
1836
|
+
if target_encode_ind:
|
|
1837
|
+
if self.cluster:
|
|
1838
|
+
self._display_msg(inline_msg="Target Encoding is not applicable for clustering. Skipping it.",
|
|
1839
|
+
progress_bar=self.progress_bar)
|
|
1840
|
+
else:
|
|
1841
|
+
tar_list = encoding_list.get("TargetEncodingList", None)
|
|
1842
|
+
if not tar_list:
|
|
1843
|
+
self._display_msg(inline_msg="No information provided for customized target encoding technique.",
|
|
1844
|
+
progress_bar=self.progress_bar)
|
|
1845
|
+
else:
|
|
1846
|
+
# Performing target encoding
|
|
1847
|
+
self._target_encoding(tar_list)
|
|
1848
|
+
self._display_msg(msg="Updated dataset sample after performing target encoding:",
|
|
1849
|
+
data=self.data,
|
|
1850
|
+
progress_bar=self.progress_bar)
|
|
1851
|
+
else:
|
|
1852
|
+
self._display_msg(inline_msg="No input provided for performing customized categorical encoding. AutoML will proceed with default encoding technique.",
|
|
1853
|
+
progress_bar=self.progress_bar)
|
|
1854
|
+
else:
|
|
1855
|
+
self._display_msg(inline_msg="AutoML will proceed with default encoding technique.",
|
|
1856
|
+
progress_bar=self.progress_bar)
|
|
1857
|
+
|
|
1858
|
+
# Performing default encoding on remaining categorical columns
|
|
1859
|
+
self._encoding_categorical_columns()
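# --- Illustrative sketch (not part of this class) ---
# A hypothetical shape of the custom JSON entries consumed by
# _custom_categorical_encoding above; the column names and the encoder method are
# assumptions, while the keys mirror the ones read in the method.
custom_data_example = {
    "CategoricalEncodingIndicator": True,
    "CategoricalEncodingParam": {
        "OneHotEncodingIndicator": True,
        "OneHotEncodingList": ["sex", "embarked"],
        "OrdinalEncodingIndicator": True,
        "OrdinalEncodingList": ["cabin"],
        "TargetEncodingIndicator": True,
        "TargetEncodingList": {
            "ticket": {"encoder_method": "CBM_BETA", "response_column": "survived"}
        },
        "volatile": False,
        "persist": False
    }
}
# --- End of illustrative sketch ---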
|
|
1860
|
+
|
|
1861
|
+
def _numapply_transformation(self, target_col, transform_val):
|
|
1862
|
+
"""
|
|
1863
|
+
DESCRIPTION:
|
|
1864
|
+
Function to perform different numerical transformations using NumApply on numerical features based on user input.
|
|
1865
|
+
|
|
1866
|
+
PARAMETERS:
|
|
1867
|
+
target_col:
|
|
1868
|
+
Required Argument.
|
|
1869
|
+
Specifies the numerical column for which transformation will be performed.
|
|
1870
|
+
Types: str
|
|
1871
|
+
|
|
1872
|
+
transform_val:
|
|
1873
|
+
Required Argument.
|
|
1874
|
+
Specifies different parameters required for applying numerical transformation.
|
|
1875
|
+
Types: dict
|
|
1876
|
+
|
|
1877
|
+
RETURNS:
|
|
1878
|
+
NumApply result object containing the transformed data.
|
|
1879
|
+
|
|
1880
|
+
RAISES:
|
|
1881
|
+
None
|
|
1882
|
+
|
|
1883
|
+
EXAMPLES:
|
|
1884
|
+
>>> transform_val = {"apply_method": "sqrt"}
|
|
1885
|
+
>>> result = self._numapply_transformation(target_col="numeric_col", transform_val=transform_val)
|
|
1886
|
+
"""
|
|
1887
|
+
# Fetching columns for accumulation
|
|
1888
|
+
accumulate_columns = self._extract_list(self.data.columns, [target_col])
|
|
1889
|
+
apply_method = transform_val["apply_method"]
|
|
1890
|
+
|
|
1891
|
+
# Setting volatile and persist parameters for performing transformation
|
|
1892
|
+
volatile, persist = self._get_generic_parameters(func_indicator="MathameticalTransformationIndicator",
|
|
1893
|
+
param_name="MathameticalTransformationParam")
|
|
1894
|
+
# Adding fit parameters for performing transformation
|
|
1895
|
+
fit_params={
|
|
1896
|
+
"data": self.data,
|
|
1897
|
+
"target_columns" : target_col,
|
|
1898
|
+
"apply_method" : apply_method,
|
|
1899
|
+
"inplace" : True,
|
|
1900
|
+
"persist" :True,
|
|
1901
|
+
"accumulate" : accumulate_columns
|
|
1902
|
+
}
|
|
1903
|
+
# Disabling display table name if persist is True by default
|
|
1904
|
+
if not volatile and not persist:
|
|
1905
|
+
fit_params["display_table_name"] = False
|
|
1906
|
+
|
|
1907
|
+
if volatile:
|
|
1908
|
+
fit_params["volatile"] = True
|
|
1909
|
+
fit_params["persist"] = False
|
|
1910
|
+
# Adding additional details to fit parameters in case of sigmoid transformation
|
|
1911
|
+
if apply_method == "sigmoid":
|
|
1912
|
+
sigmoid_style=transform_val["sigmoid_style"]
|
|
1913
|
+
fit_params = {**fit_params, "sigmoid_style" : sigmoid_style}
|
|
1914
|
+
# Performing transformation on target columns
|
|
1915
|
+
transform_output = NumApply(**fit_params).result
|
|
1916
|
+
if not volatile and not persist:
|
|
1917
|
+
# Adding transformed data containing table to garbage collector
|
|
1918
|
+
GarbageCollector._add_to_garbagecollector(transform_output._table_name)
|
|
1919
|
+
return transform_output
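# --- Illustrative sketch (not part of this class) ---
# Standalone example of the NumApply call built by _numapply_transformation above for
# a sigmoid transformation. The table name "titanic", the column "fare" and the
# "logit" sigmoid style are assumptions; an active teradataml connection is presumed.
from teradataml import DataFrame, NumApply

df = DataFrame("titanic")
transformed = NumApply(data=df,
                       target_columns="fare",
                       apply_method="sigmoid",
                       sigmoid_style="logit",
                       inplace=True,
                       accumulate=[c for c in df.columns if c != "fare"]).result
# --- End of illustrative sketch ---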
|
|
1920
|
+
|
|
1921
|
+
def _numerical_transformation(self, target_columns, num_transform_data, volatile, persist):
|
|
1922
|
+
"""
|
|
1923
|
+
DESCRIPTION:
|
|
1924
|
+
Function to perform different numerical transformations using Fit and Transform on numerical features based on user input.
|
|
1925
|
+
|
|
1926
|
+
PARAMETERS:
|
|
1927
|
+
target_columns:
|
|
1928
|
+
Required Argument.
|
|
1929
|
+
Specifies the target columns for numerical transformation.
|
|
1930
|
+
Types: list
|
|
1931
|
+
|
|
1932
|
+
num_transform_data:
|
|
1933
|
+
Required Argument.
|
|
1934
|
+
Specifies the numerical transformation data object.
|
|
1935
|
+
Types: object
|
|
1936
|
+
|
|
1937
|
+
volatile:
|
|
1938
|
+
Required Argument.
|
|
1939
|
+
Specifies whether to use volatile tables.
|
|
1940
|
+
Types: bool
|
|
1941
|
+
|
|
1942
|
+
persist:
|
|
1943
|
+
Required Argument.
|
|
1944
|
+
Specifies whether to persist results.
|
|
1945
|
+
Types: bool
|
|
1946
|
+
|
|
1947
|
+
RETURNS:
|
|
1948
|
+
None
|
|
1949
|
+
|
|
1950
|
+
RAISES:
|
|
1951
|
+
None
|
|
1952
|
+
|
|
1953
|
+
EXAMPLES:
|
|
1954
|
+
>>> self._numerical_transformation(target_columns=["col1"], num_transform_data=transform_obj, volatile=False, persist=True)
|
|
1955
|
+
"""
|
|
1956
|
+
# Adding fit parameters for transformation
|
|
1957
|
+
fit_params={
|
|
1958
|
+
"data" : self.data,
|
|
1959
|
+
"object" : num_transform_data,
|
|
1960
|
+
"object_order_column" : "TargetColumn",
|
|
1961
|
+
"volatile" : volatile,
|
|
1962
|
+
"persist" : persist
|
|
1963
|
+
}
|
|
1964
|
+
# Performing fit with all arguments.
|
|
1965
|
+
num_fit_obj = Fit(**fit_params)
|
|
1966
|
+
# Fetching all numerical columns
|
|
1967
|
+
numerical_columns = [col for col, d_type in self.data._column_names_and_types if d_type in ["int","float"]]
|
|
1968
|
+
# Extracting id columns, i.e., numerical columns that the transformation should leave untouched
|
|
1969
|
+
id_columns = self._extract_list(numerical_columns,target_columns)
|
|
1970
|
+
# Storing fit object and id column list for numerical transformation in data transform dictionary
|
|
1971
|
+
self.data_transform_dict['custom_numerical_transformation_fit_object'] = num_fit_obj.result
|
|
1972
|
+
self.data_transform_dict['custom_numerical_transformation_id_columns'] = id_columns
|
|
1973
|
+
# Adding transform parameters for transformation
|
|
1974
|
+
transform_params={
|
|
1975
|
+
"data" : self.data,
|
|
1976
|
+
"object" : num_fit_obj.result,
|
|
1977
|
+
"id_columns" : id_columns,
|
|
1978
|
+
"persist" :True
|
|
1979
|
+
}
|
|
1980
|
+
# Disabling display table name if persist is True by default
|
|
1981
|
+
if not volatile and not persist:
|
|
1982
|
+
transform_params["display_table_name"] = False
|
|
1983
|
+
|
|
1984
|
+
if volatile:
|
|
1985
|
+
transform_params["volatile"] = True
|
|
1986
|
+
transform_params["persist"] = False
|
|
1987
|
+
# Performing transformation on target columns
|
|
1988
|
+
self.data = Transform(**transform_params).result
|
|
1989
|
+
if not volatile and not persist:
|
|
1990
|
+
# Adding transformed data containing table to garbage collector
|
|
1991
|
+
GarbageCollector._add_to_garbagecollector(self.data._table_name)
|
|
1992
|
+
|
|
1993
|
+
self.data_mapping['fit_numerical_result'] = num_fit_obj.result._table_name
|
|
1994
|
+
self.data_mapping['numerical_transformed_data'] = self.data._table_name
|
|
1995
|
+
self._display_msg(msg="Updated dataset sample after applying numerical transformation:",
|
|
1996
|
+
data=self.data,
|
|
1997
|
+
progress_bar=self.progress_bar)
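# --- Illustrative sketch (not part of this class) ---
# Standalone example of the Fit/Transform flow used in _numerical_transformation above,
# driven by a transformation specification table shaped like the one built in
# _mathematical_transformation. Table, column and id-column names are assumptions.
import pandas as pd
from teradataml import DataFrame, Fit, Transform, copy_to_sql

df = DataFrame("titanic")
spec = pd.DataFrame([{"TargetColumn": "fare", "DefaultValue": 1,
                      "Transformation": "log", "Parameters": '{"base": 10}'}],
                    columns=["TargetColumn", "DefaultValue", "Transformation", "Parameters"])
copy_to_sql(df=spec, table_name="num_transform_spec", temporary=True)
fit = Fit(data=df,
          object=DataFrame.from_table("num_transform_spec"),
          object_order_column="TargetColumn")
transformed = Transform(data=df,
                        object=fit.result,
                        id_columns=["passenger"]).result   # non-transformed numeric columns
# --- End of illustrative sketch ---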
|
|
1998
|
+
|
|
1999
|
+
def _mathematical_transformation(self):
|
|
2000
|
+
"""
|
|
2001
|
+
DESCRIPTION:
|
|
2002
|
+
Function to perform different mathematical transformations (i.e., log, pow,
|
|
2003
|
+
exp, sininv, sigmoid) on numerical features based on user input.
|
|
2004
|
+
|
|
2005
|
+
PARAMETERS:
|
|
2006
|
+
None
|
|
2007
|
+
|
|
2008
|
+
RETURNS:
|
|
2009
|
+
None
|
|
2010
|
+
|
|
2011
|
+
RAISES:
|
|
2012
|
+
TeradataMlException
|
|
2013
|
+
|
|
2014
|
+
EXAMPLES:
|
|
2015
|
+
>>> self._mathematical_transformation()
|
|
2016
|
+
"""
|
|
2017
|
+
self._display_msg(msg="Starting customized mathematical transformation ...",
|
|
2018
|
+
progress_bar=self.progress_bar,
|
|
2019
|
+
show_data=True)
|
|
2020
|
+
|
|
2021
|
+
mat_transform_input = self.custom_data.get("MathameticalTransformationIndicator", False)
|
|
2022
|
+
# Checking user input for mathematical transformations
|
|
2023
|
+
if mat_transform_input:
|
|
2024
|
+
# Extracting list required for mathematical transformations
|
|
2025
|
+
mat_transform_list = self.custom_data.get("MathameticalTransformationParam", None).copy()
|
|
2026
|
+
|
|
2027
|
+
if mat_transform_list:
|
|
2028
|
+
volatile = mat_transform_list.pop("volatile", False)
|
|
2029
|
+
persist = mat_transform_list.pop("persist", False)
|
|
2030
|
+
# Checking whether the columns are present in the dataset
|
|
2031
|
+
_Validators._validate_dataframe_has_argument_columns(list(mat_transform_list.keys()),
|
|
2032
|
+
"MathameticalTransformationParam", self.data, "df")
|
|
2033
|
+
|
|
2034
|
+
# List of storing target columns and mathematical transformation information
|
|
2035
|
+
transform_data=[]
|
|
2036
|
+
target_columns=[]
|
|
2037
|
+
# Storing custom mathematical transformation indicator in data transform dictionary
|
|
2038
|
+
self.data_transform_dict['custom_mathematical_transformation_ind'] = True
|
|
2039
|
+
# Storing custom numapply transformation parameters in data transform dictionary
|
|
2040
|
+
self.data_transform_dict['custom_numapply_transformation_param'] = {}
|
|
2041
|
+
|
|
2042
|
+
for col, transform_val in mat_transform_list.items():
|
|
2043
|
+
apply_method=transform_val["apply_method"]
|
|
2044
|
+
if apply_method in (["sininv","sigmoid"]):
|
|
2045
|
+
# Applying numapply transformation
|
|
2046
|
+
self.data = self._numapply_transformation(col,transform_val)
|
|
2047
|
+
self.data_mapping[f'{apply_method}_transformed_data'] = self.data._table_name
|
|
2048
|
+
self._display_msg(msg="Updated dataset sample after applying numapply transformation:",
|
|
2049
|
+
data=self.data,
|
|
2050
|
+
progress_bar=self.progress_bar)
|
|
2051
|
+
# Updating parameter details for each column
|
|
2052
|
+
self.data_transform_dict['custom_numapply_transformation_param'].update({col:transform_val})
|
|
2053
|
+
else:
|
|
2054
|
+
# Handling specific scenarios for log and pow transformation
|
|
2055
|
+
parameters=""
|
|
2056
|
+
if apply_method == "log":
|
|
2057
|
+
base = transform_val["base"]
|
|
2058
|
+
parameters = json.dumps({"base":base})
|
|
2059
|
+
elif apply_method == "pow":
|
|
2060
|
+
exponent = transform_val["exponent"]
|
|
2061
|
+
parameters = json.dumps({"exponent":exponent})
|
|
2062
|
+
target_columns.append(col)
|
|
2063
|
+
transform_data.append({"TargetColumn":col, "DefaultValue":1, "Transformation":apply_method, "Parameters":parameters})
|
|
2064
|
+
# Checking for transformation data
|
|
2065
|
+
if len(transform_data):
|
|
2066
|
+
# Converting into pandas and then Teradata DataFrame for performing further operations
|
|
2067
|
+
transform_data = pd.DataFrame(transform_data, columns=["TargetColumn", "DefaultValue", "Transformation", "Parameters"])
|
|
2068
|
+
self._display_msg(msg="Numerical transformation information :-",
|
|
2069
|
+
data=transform_data,
|
|
2070
|
+
progress_bar=self.progress_bar)
|
|
2071
|
+
copy_to_sql(df=transform_data, table_name="automl_num_transform_data", temporary=True)
|
|
2072
|
+
num_transform_data = DataFrame.from_table("automl_num_transform_data")
|
|
2073
|
+
# Applying transformation using Fit/Transform functions
|
|
2074
|
+
self._numerical_transformation(target_columns, num_transform_data, volatile, persist)
|
|
2075
|
+
# Storing custom numerical transformation parameters and column list in data transform dictionary
|
|
2076
|
+
self.data_transform_dict['custom_numerical_transformation_col'] = target_columns
|
|
2077
|
+
self.data_transform_dict['custom_numerical_transformation_params'] = num_transform_data
|
|
2078
|
+
else:
|
|
2079
|
+
self._display_msg(inline_msg="No input provided for performing customized mathematical transformation.",
|
|
2080
|
+
progress_bar=self.progress_bar)
|
|
2081
|
+
else:
|
|
2082
|
+
self._display_msg(inline_msg="Skipping customized mathematical transformation.",
|
|
2083
|
+
progress_bar=self.progress_bar)
|
|
2084
|
+
|
|
2085
|
+
def _non_linear_transformation(self):
|
|
2086
|
+
"""
|
|
2087
|
+
DESCRIPTION:
|
|
2088
|
+
Function to perform customized non-linear transformation on numerical features based on user input.
|
|
2089
|
+
|
|
2090
|
+
PARAMETERS:
|
|
2091
|
+
None
|
|
2092
|
+
|
|
2093
|
+
RETURNS:
|
|
2094
|
+
None
|
|
2095
|
+
|
|
2096
|
+
RAISES:
|
|
2097
|
+
TeradataMlException
|
|
2098
|
+
|
|
2099
|
+
EXAMPLES:
|
|
2100
|
+
>>> self._non_linear_transformation()
|
|
2101
|
+
"""
|
|
2102
|
+
self._display_msg(msg="Starting customized non-linear transformation ...",
|
|
2103
|
+
progress_bar=self.progress_bar,
|
|
2104
|
+
show_data=True)
|
|
2105
|
+
nl_transform_input = self.custom_data.get("NonLinearTransformationIndicator", False)
|
|
2106
|
+
# Checking user input for non-linear transformation
|
|
2107
|
+
if nl_transform_input:
|
|
2108
|
+
nl_transform_list = self.custom_data.get("NonLinearTransformationParam", None)
|
|
2109
|
+
# Extracting list required for non-linear transformation
|
|
2110
|
+
if nl_transform_list:
|
|
2111
|
+
volatile = nl_transform_list.pop("volatile", False)
|
|
2112
|
+
persist = nl_transform_list.pop("persist", False)
|
|
2113
|
+
total_combination = len(nl_transform_list)
|
|
2114
|
+
# Generating all possible combination names
|
|
2115
|
+
possible_combination = ["Combination_"+str(counter) for counter in range(1,total_combination+1)]
|
|
2116
|
+
self._display_msg(msg="Possible combination :",
|
|
2117
|
+
col_lst=possible_combination,
|
|
2118
|
+
progress_bar=self.progress_bar)
|
|
2119
|
+
# Storing custom non-linear transformation indicator in data transform dictionary
|
|
2120
|
+
self.data_transform_dict['custom_non_linear_transformation_ind'] = True
|
|
2121
|
+
# Storing custom non-linear transformation fit object in data transform dictionary
|
|
2122
|
+
self.data_transform_dict['custom_non_linear_transformation_fit_object'] = {}
|
|
2123
|
+
# print("Possible combination :",possible_combination)
|
|
2124
|
+
# Performing transformation for each combination
|
|
2125
|
+
for comb, transform_val in nl_transform_list.items():
|
|
2126
|
+
if comb in possible_combination:
|
|
2127
|
+
target_columns = transform_val["target_columns"]
|
|
2128
|
+
# Checking whether the columns are present in the dataset
|
|
2129
|
+
_Validators._validate_dataframe_has_argument_columns(target_columns,
|
|
2130
|
+
"target_columns", self.data, "df")
|
|
2131
|
+
|
|
2132
|
+
formula = transform_val["formula"]
|
|
2133
|
+
result_column = transform_val["result_column"]
|
|
2134
|
+
# Adding fit params for transformation
|
|
2135
|
+
fit_param = {
|
|
2136
|
+
"data" : self.data,
|
|
2137
|
+
"target_columns" : target_columns,
|
|
2138
|
+
"formula" : formula,
|
|
2139
|
+
"result_column" : result_column,
|
|
2140
|
+
"volatile" : volatile,
|
|
2141
|
+
"persist" : persist
|
|
2142
|
+
}
|
|
2143
|
+
# Performing fit on dataset
|
|
2144
|
+
fit_obj = NonLinearCombineFit(**fit_param)
|
|
2145
|
+
# Updating it for each non-linear combination
|
|
2146
|
+
self.data_transform_dict['custom_non_linear_transformation_fit_object'].update({comb:fit_obj.result})
|
|
2147
|
+
# Adding transform params for transformation
|
|
2148
|
+
transform_params = {
|
|
2149
|
+
"data" : self.data,
|
|
2150
|
+
"object" : fit_obj,
|
|
2151
|
+
"accumulate" : self.data.columns,
|
|
2152
|
+
"persist" : True
|
|
2153
|
+
}
|
|
2154
|
+
# Disabling display table name if persist is True by default
|
|
2155
|
+
if not volatile and not persist:
|
|
2156
|
+
transform_params["display_table_name"] = False
|
|
2157
|
+
|
|
2158
|
+
if volatile:
|
|
2159
|
+
transform_params["volatile"] = True
|
|
2160
|
+
transform_params["persist"] = False
|
|
2161
|
+
self.data = NonLinearCombineTransform(**transform_params).result
|
|
2162
|
+
|
|
2163
|
+
self.data_mapping[f'fit_nonlinear_{comb}_output'] = fit_obj.output_data._table_name
|
|
2164
|
+
self.data_mapping[f'fit_nonlinear_{comb}_result'] = fit_obj.result._table_name
|
|
2165
|
+
self.data_mapping['non_linear_transformed_data'] = self.data._table_name
|
|
2166
|
+
|
|
2167
|
+
if not volatile and not persist:
|
|
2168
|
+
# Adding transformed data containing table to garbage collector
|
|
2169
|
+
GarbageCollector._add_to_garbagecollector(self.data._table_name)
|
|
2170
|
+
else:
|
|
2171
|
+
self._display_msg(inline_msg="Combinations are not as per expectation.",
|
|
2172
|
+
progress_bar=self.progress_bar)
|
|
2173
|
+
self._display_msg(msg="Updated dataset sample after performing non-liner transformation:",
|
|
2174
|
+
data=self.data,
|
|
2175
|
+
progress_bar=self.progress_bar)
|
|
2176
|
+
else:
|
|
2177
|
+
self._display_msg(inline_msg="No information provided for performing customized non-linear transformation.",
|
|
2178
|
+
progress_bar=self.progress_bar)
|
|
2179
|
+
else:
|
|
2180
|
+
self._display_msg(inline_msg="Skipping customized non-linear transformation.",
|
|
2181
|
+
progress_bar=self.progress_bar)
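# --- Illustrative sketch (not part of this class) ---
# Standalone example of the NonLinearCombineFit/NonLinearCombineTransform flow used in
# _non_linear_transformation above. The table name "titanic", the columns and the
# formula are assumptions; an active teradataml connection is presumed.
from teradataml import DataFrame, NonLinearCombineFit, NonLinearCombineTransform

df = DataFrame("titanic")
fit = NonLinearCombineFit(data=df,
                          target_columns=["sibsp", "parch"],
                          formula="Y = X0 + X1",       # X0/X1 refer to target_columns
                          result_column="family_size")
combined = NonLinearCombineTransform(data=df,
                                     object=fit,
                                     accumulate=df.columns).result
# --- End of illustrative sketch ---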
|
|
2182
|
+
|
|
2183
|
+
def _anti_select_columns(self):
|
|
2184
|
+
"""
|
|
2185
|
+
DESCRIPTION:
|
|
2186
|
+
Function to remove specific features from dataset based on user input.
|
|
2187
|
+
|
|
2188
|
+
PARAMETERS:
|
|
2189
|
+
None
|
|
2190
|
+
|
|
2191
|
+
RETURNS:
|
|
2192
|
+
None
|
|
2193
|
+
|
|
2194
|
+
RAISES:
|
|
2195
|
+
None
|
|
2196
|
+
|
|
2197
|
+
EXAMPLES:
|
|
2198
|
+
>>> self._anti_select_columns()
|
|
2199
|
+
"""
|
|
2200
|
+
self._display_msg(msg="Starting customized anti-select columns ...",
|
|
2201
|
+
progress_bar=self.progress_bar,
|
|
2202
|
+
show_data=True)
|
|
2203
|
+
anti_select_input = self.custom_data.get("AntiselectIndicator", False)
|
|
2204
|
+
# Checking user input for anti-select columns
|
|
2205
|
+
if anti_select_input:
|
|
2206
|
+
anti_select_params = self.custom_data.get("AntiselectParam", None)
|
|
2207
|
+
if anti_select_params:
|
|
2208
|
+
# Extracting list required for anti-select columns
|
|
2209
|
+
anti_select_list = anti_select_params.get("excluded_columns", None)
|
|
2210
|
+
volatile = anti_select_params.get("volatile", False)
|
|
2211
|
+
persist = anti_select_params.get("persist", False)
|
|
2212
|
+
if(anti_select_list):
|
|
2213
|
+
if all(item in self.data.columns for item in anti_select_list):
|
|
2214
|
+
# Storing custom anti-select columns indicator and column list in data transform dictionary
|
|
2215
|
+
self.data_transform_dict['custom_anti_select_columns_ind'] = True
|
|
2216
|
+
self.data_transform_dict['custom_anti_select_columns'] = anti_select_list
|
|
2217
|
+
fit_params = {
|
|
2218
|
+
"data" : self.data,
|
|
2219
|
+
"exclude" : anti_select_list,
|
|
2220
|
+
"volatile" : volatile,
|
|
2221
|
+
"persist" : persist
|
|
2222
|
+
}
|
|
2223
|
+
# Performing transformation for given user input
|
|
2224
|
+
self.data = Antiselect(**fit_params).result
|
|
2225
|
+
self._display_msg(msg="Updated dataset sample after performing anti-select columns:",
|
|
2226
|
+
data=self.data,
|
|
2227
|
+
progress_bar=self.progress_bar)
|
|
2228
|
+
else:
|
|
2229
|
+
self._display_msg(msg="Columns provided in list are not present in dataset:",
|
|
2230
|
+
col_lst=anti_select_list,
|
|
2231
|
+
progress_bar=self.progress_bar)
|
|
2232
|
+
else:
|
|
2233
|
+
self._display_msg(inline_msg="No information provided for performing anti-select columns operation.",
|
|
2234
|
+
progress_bar=self.progress_bar)
|
|
2235
|
+
else:
|
|
2236
|
+
self._display_msg(inline_msg="Skipping customized anti-select columns.",
|
|
2237
|
+
progress_bar=self.progress_bar)
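# --- Illustrative sketch (not part of this class) ---
# Standalone example of the Antiselect call used in _anti_select_columns above.
# The table name "titanic" and the excluded columns are assumptions.
from teradataml import DataFrame, Antiselect

df = DataFrame("titanic")
reduced = Antiselect(data=df, exclude=["name", "ticket"]).result
# --- End of illustrative sketch ---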
|
|
2238
|
+
|
|
2239
|
+
def _get_generic_parameters(self,
|
|
2240
|
+
func_indicator=None,
|
|
2241
|
+
param_name=None):
|
|
2242
|
+
"""
|
|
2243
|
+
DESCRIPTION:
|
|
2244
|
+
Function to set generic parameters.
|
|
2245
|
+
|
|
2246
|
+
PARAMETERS:
|
|
2247
|
+
func_indicator:
|
|
2248
|
+
Optional Argument.
|
|
2249
|
+
Specifies the name of function indicator.
|
|
2250
|
+
Types: str
|
|
2251
|
+
|
|
2252
|
+
param_name:
|
|
2253
|
+
Optional Argument.
|
|
2254
|
+
Specifies the name of the param which contains generic parameters.
|
|
2255
|
+
Types: str
|
|
2256
|
+
|
|
2257
|
+
RETURNS:
|
|
2258
|
+
tuple containing volatile and persist parameters.
|
|
2259
|
+
|
|
2260
|
+
RAISES:
|
|
2261
|
+
None
|
|
2262
|
+
|
|
2263
|
+
EXAMPLES:
|
|
2264
|
+
>>> volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator", param_name="CategoricalEncodingParam")
|
|
2265
|
+
"""
|
|
2266
|
+
# Prioritizing persist argument and then volatile
|
|
2267
|
+
persist = self.persist
|
|
2268
|
+
volatile = self.volatile or (configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE and persist is False)
|
|
2269
|
+
if self.custom_data is not None and self.custom_data.get(func_indicator, False):
|
|
2270
|
+
volatile = self.custom_data[param_name].get("volatile", False)
|
|
2271
|
+
persist = self.custom_data[param_name].get("persist", False)
|
|
2272
|
+
|
|
2273
|
+
return (volatile, persist)
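# --- Illustrative sketch (not part of this class) ---
# A hypothetical standalone helper restating the volatile/persist precedence implemented
# in _get_generic_parameters above: per-function JSON values override the AutoML-level
# flags, and configure.temp_object_type can force volatile behaviour when persist was
# not requested. The import path for TeradataConstants is an assumption.
from teradataml import configure
from teradataml.common.constants import TeradataConstants

def resolve_generic_parameters(custom_data, func_indicator, param_name,
                               automl_volatile=False, automl_persist=False):
    persist = automl_persist
    volatile = automl_volatile or (configure.temp_object_type ==
                                   TeradataConstants.TERADATA_VOLATILE_TABLE
                                   and persist is False)
    if custom_data is not None and custom_data.get(func_indicator, False):
        volatile = custom_data[param_name].get("volatile", False)
        persist = custom_data[param_name].get("persist", False)
    return volatile, persist
# --- End of illustrative sketch ---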
|