teradataml 20.0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +2762 -0
- teradataml/__init__.py +78 -0
- teradataml/_version.py +11 -0
- teradataml/analytics/Transformations.py +2996 -0
- teradataml/analytics/__init__.py +82 -0
- teradataml/analytics/analytic_function_executor.py +2416 -0
- teradataml/analytics/analytic_query_generator.py +1050 -0
- teradataml/analytics/byom/H2OPredict.py +514 -0
- teradataml/analytics/byom/PMMLPredict.py +437 -0
- teradataml/analytics/byom/__init__.py +16 -0
- teradataml/analytics/json_parser/__init__.py +133 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +1805 -0
- teradataml/analytics/json_parser/json_store.py +191 -0
- teradataml/analytics/json_parser/metadata.py +1666 -0
- teradataml/analytics/json_parser/utils.py +805 -0
- teradataml/analytics/meta_class.py +236 -0
- teradataml/analytics/sqle/DecisionTreePredict.py +456 -0
- teradataml/analytics/sqle/NaiveBayesPredict.py +420 -0
- teradataml/analytics/sqle/__init__.py +128 -0
- teradataml/analytics/sqle/json/decisiontreepredict_sqle.json +78 -0
- teradataml/analytics/sqle/json/naivebayespredict_sqle.json +62 -0
- teradataml/analytics/table_operator/__init__.py +11 -0
- teradataml/analytics/uaf/__init__.py +82 -0
- teradataml/analytics/utils.py +828 -0
- teradataml/analytics/valib.py +1617 -0
- teradataml/automl/__init__.py +5835 -0
- teradataml/automl/autodataprep/__init__.py +493 -0
- teradataml/automl/custom_json_utils.py +1625 -0
- teradataml/automl/data_preparation.py +1384 -0
- teradataml/automl/data_transformation.py +1254 -0
- teradataml/automl/feature_engineering.py +2273 -0
- teradataml/automl/feature_exploration.py +1873 -0
- teradataml/automl/model_evaluation.py +488 -0
- teradataml/automl/model_training.py +1407 -0
- teradataml/catalog/__init__.py +2 -0
- teradataml/catalog/byom.py +1759 -0
- teradataml/catalog/function_argument_mapper.py +859 -0
- teradataml/catalog/model_cataloging_utils.py +491 -0
- teradataml/clients/__init__.py +0 -0
- teradataml/clients/auth_client.py +137 -0
- teradataml/clients/keycloak_client.py +165 -0
- teradataml/clients/pkce_client.py +481 -0
- teradataml/common/__init__.py +1 -0
- teradataml/common/aed_utils.py +2078 -0
- teradataml/common/bulk_exposed_utils.py +113 -0
- teradataml/common/constants.py +1669 -0
- teradataml/common/deprecations.py +166 -0
- teradataml/common/exceptions.py +147 -0
- teradataml/common/formula.py +743 -0
- teradataml/common/garbagecollector.py +666 -0
- teradataml/common/logger.py +1261 -0
- teradataml/common/messagecodes.py +518 -0
- teradataml/common/messages.py +262 -0
- teradataml/common/pylogger.py +67 -0
- teradataml/common/sqlbundle.py +764 -0
- teradataml/common/td_coltype_code_to_tdtype.py +48 -0
- teradataml/common/utils.py +3166 -0
- teradataml/common/warnings.py +36 -0
- teradataml/common/wrapper_utils.py +625 -0
- teradataml/config/__init__.py +0 -0
- teradataml/config/dummy_file1.cfg +5 -0
- teradataml/config/dummy_file2.cfg +3 -0
- teradataml/config/sqlengine_alias_definitions_v1.0 +14 -0
- teradataml/config/sqlengine_alias_definitions_v1.1 +20 -0
- teradataml/config/sqlengine_alias_definitions_v1.3 +19 -0
- teradataml/context/__init__.py +0 -0
- teradataml/context/aed_context.py +223 -0
- teradataml/context/context.py +1462 -0
- teradataml/data/A_loan.csv +19 -0
- teradataml/data/BINARY_REALS_LEFT.csv +11 -0
- teradataml/data/BINARY_REALS_RIGHT.csv +11 -0
- teradataml/data/B_loan.csv +49 -0
- teradataml/data/BuoyData2.csv +17 -0
- teradataml/data/CONVOLVE2_COMPLEX_LEFT.csv +5 -0
- teradataml/data/CONVOLVE2_COMPLEX_RIGHT.csv +5 -0
- teradataml/data/Convolve2RealsLeft.csv +5 -0
- teradataml/data/Convolve2RealsRight.csv +5 -0
- teradataml/data/Convolve2ValidLeft.csv +11 -0
- teradataml/data/Convolve2ValidRight.csv +11 -0
- teradataml/data/DFFTConv_Real_8_8.csv +65 -0
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/Mall_customer_data.csv +201 -0
- teradataml/data/Orders1_12mf.csv +25 -0
- teradataml/data/Pi_loan.csv +7 -0
- teradataml/data/SMOOTHED_DATA.csv +7 -0
- teradataml/data/TestDFFT8.csv +9 -0
- teradataml/data/TestRiver.csv +109 -0
- teradataml/data/Traindata.csv +28 -0
- teradataml/data/__init__.py +0 -0
- teradataml/data/acf.csv +17 -0
- teradataml/data/adaboost_example.json +34 -0
- teradataml/data/adaboostpredict_example.json +24 -0
- teradataml/data/additional_table.csv +11 -0
- teradataml/data/admissions_test.csv +21 -0
- teradataml/data/admissions_train.csv +41 -0
- teradataml/data/admissions_train_nulls.csv +41 -0
- teradataml/data/advertising.csv +201 -0
- teradataml/data/ageandheight.csv +13 -0
- teradataml/data/ageandpressure.csv +31 -0
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/antiselect_example.json +36 -0
- teradataml/data/antiselect_input.csv +8 -0
- teradataml/data/antiselect_input_mixed_case.csv +8 -0
- teradataml/data/applicant_external.csv +7 -0
- teradataml/data/applicant_reference.csv +7 -0
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/arima_example.json +9 -0
- teradataml/data/assortedtext_input.csv +8 -0
- teradataml/data/attribution_example.json +34 -0
- teradataml/data/attribution_sample_table.csv +27 -0
- teradataml/data/attribution_sample_table1.csv +6 -0
- teradataml/data/attribution_sample_table2.csv +11 -0
- teradataml/data/bank_churn.csv +10001 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bank_web_clicks1.csv +43 -0
- teradataml/data/bank_web_clicks2.csv +91 -0
- teradataml/data/bank_web_url.csv +85 -0
- teradataml/data/barrier.csv +2 -0
- teradataml/data/barrier_new.csv +3 -0
- teradataml/data/betweenness_example.json +14 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/bin_breaks.csv +8 -0
- teradataml/data/bin_fit_ip.csv +4 -0
- teradataml/data/binary_complex_left.csv +11 -0
- teradataml/data/binary_complex_right.csv +11 -0
- teradataml/data/binary_matrix_complex_left.csv +21 -0
- teradataml/data/binary_matrix_complex_right.csv +21 -0
- teradataml/data/binary_matrix_real_left.csv +21 -0
- teradataml/data/binary_matrix_real_right.csv +21 -0
- teradataml/data/blood2ageandweight.csv +26 -0
- teradataml/data/bmi.csv +501 -0
- teradataml/data/boston.csv +507 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/buoydata_mix.csv +11 -0
- teradataml/data/burst_data.csv +5 -0
- teradataml/data/burst_example.json +21 -0
- teradataml/data/byom_example.json +34 -0
- teradataml/data/bytes_table.csv +4 -0
- teradataml/data/cal_housing_ex_raw.csv +70 -0
- teradataml/data/callers.csv +7 -0
- teradataml/data/calls.csv +10 -0
- teradataml/data/cars_hist.csv +33 -0
- teradataml/data/cat_table.csv +25 -0
- teradataml/data/ccm_example.json +32 -0
- teradataml/data/ccm_input.csv +91 -0
- teradataml/data/ccm_input2.csv +13 -0
- teradataml/data/ccmexample.csv +101 -0
- teradataml/data/ccmprepare_example.json +9 -0
- teradataml/data/ccmprepare_input.csv +91 -0
- teradataml/data/cfilter_example.json +12 -0
- teradataml/data/changepointdetection_example.json +18 -0
- teradataml/data/changepointdetectionrt_example.json +8 -0
- teradataml/data/chi_sq.csv +3 -0
- teradataml/data/churn_data.csv +14 -0
- teradataml/data/churn_emission.csv +35 -0
- teradataml/data/churn_initial.csv +3 -0
- teradataml/data/churn_state_transition.csv +5 -0
- teradataml/data/citedges_2.csv +745 -0
- teradataml/data/citvertices_2.csv +1210 -0
- teradataml/data/clicks2.csv +16 -0
- teradataml/data/clickstream.csv +13 -0
- teradataml/data/clickstream1.csv +11 -0
- teradataml/data/closeness_example.json +16 -0
- teradataml/data/complaints.csv +21 -0
- teradataml/data/complaints_mini.csv +3 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_testtoken.csv +224 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/complaints_tokens_test.csv +353 -0
- teradataml/data/complaints_traintoken.csv +472 -0
- teradataml/data/computers_category.csv +1001 -0
- teradataml/data/computers_test1.csv +1252 -0
- teradataml/data/computers_train1.csv +5009 -0
- teradataml/data/computers_train1_clustered.csv +5009 -0
- teradataml/data/confusionmatrix_example.json +9 -0
- teradataml/data/conversion_event_table.csv +3 -0
- teradataml/data/corr_input.csv +17 -0
- teradataml/data/correlation_example.json +11 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/coxhazardratio_example.json +39 -0
- teradataml/data/coxph_example.json +15 -0
- teradataml/data/coxsurvival_example.json +28 -0
- teradataml/data/cpt.csv +41 -0
- teradataml/data/credit_ex_merged.csv +45 -0
- teradataml/data/creditcard_data.csv +1001 -0
- teradataml/data/customer_loyalty.csv +301 -0
- teradataml/data/customer_loyalty_newseq.csv +31 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +173 -0
- teradataml/data/decisionforest_example.json +37 -0
- teradataml/data/decisionforestpredict_example.json +38 -0
- teradataml/data/decisiontree_example.json +21 -0
- teradataml/data/decisiontreepredict_example.json +45 -0
- teradataml/data/dfft2_size4_real.csv +17 -0
- teradataml/data/dfft2_test_matrix16.csv +17 -0
- teradataml/data/dfft2conv_real_4_4.csv +65 -0
- teradataml/data/diabetes.csv +443 -0
- teradataml/data/diabetes_test.csv +89 -0
- teradataml/data/dict_table.csv +5 -0
- teradataml/data/docperterm_table.csv +4 -0
- teradataml/data/docs/__init__.py +1 -0
- teradataml/data/docs/byom/__init__.py +0 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +180 -0
- teradataml/data/docs/byom/docs/DataikuPredict.py +217 -0
- teradataml/data/docs/byom/docs/H2OPredict.py +325 -0
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +283 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/byom/docs/PMMLPredict.py +278 -0
- teradataml/data/docs/byom/docs/__init__.py +0 -0
- teradataml/data/docs/sqle/__init__.py +0 -0
- teradataml/data/docs/sqle/docs_17_10/Antiselect.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/Attribution.py +200 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeFit.py +172 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeTransform.py +131 -0
- teradataml/data/docs/sqle/docs_17_10/CategoricalSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_10/ChiSq.py +90 -0
- teradataml/data/docs/sqle/docs_17_10/ColumnSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_10/ConvertTo.py +96 -0
- teradataml/data/docs/sqle/docs_17_10/DecisionForestPredict.py +139 -0
- teradataml/data/docs/sqle/docs_17_10/DecisionTreePredict.py +152 -0
- teradataml/data/docs/sqle/docs_17_10/FTest.py +161 -0
- teradataml/data/docs/sqle/docs_17_10/FillRowId.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/Fit.py +88 -0
- teradataml/data/docs/sqle/docs_17_10/GLMPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_10/GetRowsWithMissingValues.py +85 -0
- teradataml/data/docs/sqle/docs_17_10/GetRowsWithoutMissingValues.py +82 -0
- teradataml/data/docs/sqle/docs_17_10/Histogram.py +165 -0
- teradataml/data/docs/sqle/docs_17_10/MovingAverage.py +134 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +209 -0
- teradataml/data/docs/sqle/docs_17_10/NPath.py +266 -0
- teradataml/data/docs/sqle/docs_17_10/NaiveBayesPredict.py +116 -0
- teradataml/data/docs/sqle/docs_17_10/NaiveBayesTextClassifierPredict.py +176 -0
- teradataml/data/docs/sqle/docs_17_10/NumApply.py +147 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +135 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +109 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterFit.py +166 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/Pack.py +128 -0
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesFit.py +112 -0
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesTransform.py +102 -0
- teradataml/data/docs/sqle/docs_17_10/QQNorm.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/RoundColumns.py +110 -0
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeFit.py +118 -0
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeTransform.py +99 -0
- teradataml/data/docs/sqle/docs_17_10/SVMSparsePredict.py +153 -0
- teradataml/data/docs/sqle/docs_17_10/ScaleFit.py +197 -0
- teradataml/data/docs/sqle/docs_17_10/ScaleTransform.py +99 -0
- teradataml/data/docs/sqle/docs_17_10/Sessionize.py +114 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeFit.py +116 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeTransform.py +98 -0
- teradataml/data/docs/sqle/docs_17_10/StrApply.py +187 -0
- teradataml/data/docs/sqle/docs_17_10/StringSimilarity.py +146 -0
- teradataml/data/docs/sqle/docs_17_10/Transform.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/UnivariateStatistics.py +142 -0
- teradataml/data/docs/sqle/docs_17_10/Unpack.py +214 -0
- teradataml/data/docs/sqle/docs_17_10/WhichMax.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/WhichMin.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/ZTest.py +155 -0
- teradataml/data/docs/sqle/docs_17_10/__init__.py +0 -0
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +186 -0
- teradataml/data/docs/sqle/docs_17_20/Antiselect.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/Attribution.py +201 -0
- teradataml/data/docs/sqle/docs_17_20/BincodeFit.py +172 -0
- teradataml/data/docs/sqle/docs_17_20/BincodeTransform.py +139 -0
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/CategoricalSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_20/ChiSq.py +90 -0
- teradataml/data/docs/sqle/docs_17_20/ClassificationEvaluator.py +166 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +246 -0
- teradataml/data/docs/sqle/docs_17_20/ConvertTo.py +113 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionForest.py +280 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionForestPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionTreePredict.py +136 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +240 -0
- teradataml/data/docs/sqle/docs_17_20/FillRowId.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/Fit.py +88 -0
- teradataml/data/docs/sqle/docs_17_20/GLM.py +541 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPerSegment.py +415 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPredictPerSegment.py +233 -0
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +125 -0
- teradataml/data/docs/sqle/docs_17_20/GetRowsWithMissingValues.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/GetRowsWithoutMissingValues.py +106 -0
- teradataml/data/docs/sqle/docs_17_20/Histogram.py +224 -0
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +251 -0
- teradataml/data/docs/sqle/docs_17_20/KMeansPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/KNN.py +215 -0
- teradataml/data/docs/sqle/docs_17_20/MovingAverage.py +134 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +209 -0
- teradataml/data/docs/sqle/docs_17_20/NPath.py +266 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesPredict.py +116 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierPredict.py +177 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierTrainer.py +127 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineTransform.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/NumApply.py +147 -0
- teradataml/data/docs/sqle/docs_17_20/OneClassSVM.py +307 -0
- teradataml/data/docs/sqle/docs_17_20/OneClassSVMPredict.py +185 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +231 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/OrdinalEncodingFit.py +220 -0
- teradataml/data/docs/sqle/docs_17_20/OrdinalEncodingTransform.py +127 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +191 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterTransform.py +117 -0
- teradataml/data/docs/sqle/docs_17_20/Pack.py +128 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesFit.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesTransform.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/QQNorm.py +105 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +164 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionFit.py +155 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionMinComponents.py +106 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionTransform.py +120 -0
- teradataml/data/docs/sqle/docs_17_20/RegressionEvaluator.py +211 -0
- teradataml/data/docs/sqle/docs_17_20/RoundColumns.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeFit.py +118 -0
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeTransform.py +111 -0
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/SVM.py +414 -0
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +213 -0
- teradataml/data/docs/sqle/docs_17_20/SVMSparsePredict.py +153 -0
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +315 -0
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +202 -0
- teradataml/data/docs/sqle/docs_17_20/SentimentExtractor.py +206 -0
- teradataml/data/docs/sqle/docs_17_20/Sessionize.py +114 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +225 -0
- teradataml/data/docs/sqle/docs_17_20/Silhouette.py +153 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeFit.py +116 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeTransform.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/StrApply.py +187 -0
- teradataml/data/docs/sqle/docs_17_20/StringSimilarity.py +146 -0
- teradataml/data/docs/sqle/docs_17_20/TDDecisionForestPredict.py +207 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +333 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingFit.py +267 -0
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingTransform.py +141 -0
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +224 -0
- teradataml/data/docs/sqle/docs_17_20/TrainTestSplit.py +160 -0
- teradataml/data/docs/sqle/docs_17_20/Transform.py +123 -0
- teradataml/data/docs/sqle/docs_17_20/UnivariateStatistics.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpack.py +214 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/VectorDistance.py +169 -0
- teradataml/data/docs/sqle/docs_17_20/WhichMax.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/WhichMin.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/WordEmbeddings.py +237 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +362 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +281 -0
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +220 -0
- teradataml/data/docs/sqle/docs_17_20/__init__.py +0 -0
- teradataml/data/docs/tableoperator/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_00/ReadNOS.py +430 -0
- teradataml/data/docs/tableoperator/docs_17_00/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_05/ReadNOS.py +430 -0
- teradataml/data/docs/tableoperator/docs_17_05/WriteNOS.py +348 -0
- teradataml/data/docs/tableoperator/docs_17_05/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_10/ReadNOS.py +429 -0
- teradataml/data/docs/tableoperator/docs_17_10/WriteNOS.py +348 -0
- teradataml/data/docs/tableoperator/docs_17_10/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/tableoperator/docs_17_20/ReadNOS.py +440 -0
- teradataml/data/docs/tableoperator/docs_17_20/WriteNOS.py +387 -0
- teradataml/data/docs/tableoperator/docs_17_20/__init__.py +0 -0
- teradataml/data/docs/uaf/__init__.py +0 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +186 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +370 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +172 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +161 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BinaryMatrixOp.py +248 -0
- teradataml/data/docs/uaf/docs_17_20/BinarySeriesOp.py +252 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +178 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +175 -0
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +230 -0
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +218 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +185 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +204 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +216 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +216 -0
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +192 -0
- teradataml/data/docs/uaf/docs_17_20/DIFF.py +175 -0
- teradataml/data/docs/uaf/docs_17_20/DTW.py +180 -0
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +217 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +142 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +184 -0
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +185 -0
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/FitMetrics.py +172 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesFormula.py +206 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +143 -0
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +198 -0
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +260 -0
- teradataml/data/docs/uaf/docs_17_20/IDFFT.py +165 -0
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +191 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/InputValidator.py +121 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +156 -0
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +215 -0
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +174 -0
- teradataml/data/docs/uaf/docs_17_20/MInfo.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +191 -0
- teradataml/data/docs/uaf/docs_17_20/PACF.py +157 -0
- teradataml/data/docs/uaf/docs_17_20/Portman.py +217 -0
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +203 -0
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +155 -0
- teradataml/data/docs/uaf/docs_17_20/Resample.py +237 -0
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SInfo.py +123 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +173 -0
- teradataml/data/docs/uaf/docs_17_20/SelectionCriteria.py +174 -0
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/SignifResidmean.py +164 -0
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +180 -0
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +208 -0
- teradataml/data/docs/uaf/docs_17_20/TrackingOp.py +151 -0
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/Unnormalize.py +202 -0
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/docs/uaf/docs_17_20/__init__.py +0 -0
- teradataml/data/dtw_example.json +18 -0
- teradataml/data/dtw_t1.csv +11 -0
- teradataml/data/dtw_t2.csv +4 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt2d_example.json +16 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_example.json +15 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/dwt_filter_dim.csv +5 -0
- teradataml/data/emission.csv +9 -0
- teradataml/data/emp_table_by_dept.csv +19 -0
- teradataml/data/employee_info.csv +4 -0
- teradataml/data/employee_table.csv +6 -0
- teradataml/data/excluding_event_table.csv +2 -0
- teradataml/data/finance_data.csv +6 -0
- teradataml/data/finance_data2.csv +61 -0
- teradataml/data/finance_data3.csv +93 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/fish.csv +160 -0
- teradataml/data/fm_blood2ageandweight.csv +26 -0
- teradataml/data/fmeasure_example.json +12 -0
- teradataml/data/followers_leaders.csv +10 -0
- teradataml/data/fpgrowth_example.json +12 -0
- teradataml/data/frequentpaths_example.json +29 -0
- teradataml/data/friends.csv +9 -0
- teradataml/data/fs_input.csv +33 -0
- teradataml/data/fs_input1.csv +33 -0
- teradataml/data/genData.csv +513 -0
- teradataml/data/geodataframe_example.json +40 -0
- teradataml/data/glass_types.csv +215 -0
- teradataml/data/glm_admissions_model.csv +12 -0
- teradataml/data/glm_example.json +56 -0
- teradataml/data/glml1l2_example.json +28 -0
- teradataml/data/glml1l2predict_example.json +54 -0
- teradataml/data/glmpredict_example.json +54 -0
- teradataml/data/gq_t1.csv +21 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/hconvolve_complex_right.csv +5 -0
- teradataml/data/hconvolve_complex_rightmulti.csv +5 -0
- teradataml/data/histogram_example.json +12 -0
- teradataml/data/hmmdecoder_example.json +79 -0
- teradataml/data/hmmevaluator_example.json +25 -0
- teradataml/data/hmmsupervised_example.json +10 -0
- teradataml/data/hmmunsupervised_example.json +8 -0
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/house_values.csv +12 -0
- teradataml/data/house_values2.csv +13 -0
- teradataml/data/housing_cat.csv +7 -0
- teradataml/data/housing_data.csv +9 -0
- teradataml/data/housing_test.csv +47 -0
- teradataml/data/housing_test_binary.csv +47 -0
- teradataml/data/housing_train.csv +493 -0
- teradataml/data/housing_train_attribute.csv +5 -0
- teradataml/data/housing_train_binary.csv +437 -0
- teradataml/data/housing_train_parameter.csv +2 -0
- teradataml/data/housing_train_response.csv +493 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/ibm_stock.csv +370 -0
- teradataml/data/ibm_stock1.csv +370 -0
- teradataml/data/identitymatch_example.json +22 -0
- teradataml/data/idf_table.csv +4 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/impressions.csv +101 -0
- teradataml/data/inflation.csv +21 -0
- teradataml/data/initial.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/insect_sprays.csv +13 -0
- teradataml/data/insurance.csv +1339 -0
- teradataml/data/interpolator_example.json +13 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/iris_altinput.csv +481 -0
- teradataml/data/iris_attribute_output.csv +8 -0
- teradataml/data/iris_attribute_test.csv +121 -0
- teradataml/data/iris_attribute_train.csv +481 -0
- teradataml/data/iris_category_expect_predict.csv +31 -0
- teradataml/data/iris_data.csv +151 -0
- teradataml/data/iris_input.csv +151 -0
- teradataml/data/iris_response_train.csv +121 -0
- teradataml/data/iris_test.csv +31 -0
- teradataml/data/iris_train.csv +121 -0
- teradataml/data/join_table1.csv +4 -0
- teradataml/data/join_table2.csv +4 -0
- teradataml/data/jsons/anly_function_name.json +7 -0
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/byom/dataikupredict.json +148 -0
- teradataml/data/jsons/byom/datarobotpredict.json +147 -0
- teradataml/data/jsons/byom/h2opredict.json +195 -0
- teradataml/data/jsons/byom/onnxembeddings.json +267 -0
- teradataml/data/jsons/byom/onnxpredict.json +187 -0
- teradataml/data/jsons/byom/pmmlpredict.json +147 -0
- teradataml/data/jsons/paired_functions.json +450 -0
- teradataml/data/jsons/sqle/16.20/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/16.20/Attribution.json +249 -0
- teradataml/data/jsons/sqle/16.20/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/16.20/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/16.20/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/16.20/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/16.20/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/16.20/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/16.20/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/16.20/Pack.json +98 -0
- teradataml/data/jsons/sqle/16.20/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/16.20/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/16.20/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/16.20/Unpack.json +166 -0
- teradataml/data/jsons/sqle/16.20/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.00/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.00/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.00/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/17.00/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/17.00/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/17.00/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.00/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.00/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/17.00/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/17.00/Pack.json +98 -0
- teradataml/data/jsons/sqle/17.00/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/17.00/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.00/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.00/Unpack.json +166 -0
- teradataml/data/jsons/sqle/17.00/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.05/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.05/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.05/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/17.05/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/17.05/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/17.05/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.05/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.05/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/17.05/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/17.05/Pack.json +98 -0
- teradataml/data/jsons/sqle/17.05/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/17.05/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.05/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.05/Unpack.json +166 -0
- teradataml/data/jsons/sqle/17.05/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.10/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.10/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.10/DecisionForestPredict.json +185 -0
- teradataml/data/jsons/sqle/17.10/DecisionTreePredict.json +172 -0
- teradataml/data/jsons/sqle/17.10/GLMPredict.json +151 -0
- teradataml/data/jsons/sqle/17.10/MovingAverage.json +368 -0
- teradataml/data/jsons/sqle/17.10/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.10/NaiveBayesPredict.json +149 -0
- teradataml/data/jsons/sqle/17.10/NaiveBayesTextClassifierPredict.json +288 -0
- teradataml/data/jsons/sqle/17.10/Pack.json +133 -0
- teradataml/data/jsons/sqle/17.10/SVMSparsePredict.json +193 -0
- teradataml/data/jsons/sqle/17.10/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.10/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.10/TD_BinCodeFit.json +239 -0
- teradataml/data/jsons/sqle/17.10/TD_BinCodeTransform.json +70 -0
- teradataml/data/jsons/sqle/17.10/TD_CategoricalSummary.json +54 -0
- teradataml/data/jsons/sqle/17.10/TD_Chisq.json +68 -0
- teradataml/data/jsons/sqle/17.10/TD_ColumnSummary.json +54 -0
- teradataml/data/jsons/sqle/17.10/TD_ConvertTo.json +69 -0
- teradataml/data/jsons/sqle/17.10/TD_FTest.json +187 -0
- teradataml/data/jsons/sqle/17.10/TD_FillRowID.json +52 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionFit.json +46 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionTransform.json +72 -0
- teradataml/data/jsons/sqle/17.10/TD_GetRowsWithMissingValues.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_GetRowsWithoutMissingValues.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_Histogram.json +133 -0
- teradataml/data/jsons/sqle/17.10/TD_NumApply.json +147 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingFit.json +183 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingTransform.json +66 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterFit.json +197 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterTransform.json +48 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesFit.json +114 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesTransform.json +72 -0
- teradataml/data/jsons/sqle/17.10/TD_QQNorm.json +112 -0
- teradataml/data/jsons/sqle/17.10/TD_RoundColumns.json +93 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeFit.json +128 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleFit.json +157 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleTransform.json +71 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeFit.json +148 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeTransform.json +48 -0
- teradataml/data/jsons/sqle/17.10/TD_StrApply.json +240 -0
- teradataml/data/jsons/sqle/17.10/TD_UnivariateStatistics.json +119 -0
- teradataml/data/jsons/sqle/17.10/TD_WhichMax.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_WhichMin.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_ZTest.json +171 -0
- teradataml/data/jsons/sqle/17.10/Unpack.json +188 -0
- teradataml/data/jsons/sqle/17.10/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.20/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.20/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.20/DecisionForestPredict.json +185 -0
- teradataml/data/jsons/sqle/17.20/DecisionTreePredict.json +172 -0
- teradataml/data/jsons/sqle/17.20/GLMPredict.json +151 -0
- teradataml/data/jsons/sqle/17.20/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.20/NaiveBayesPredict.json +149 -0
- teradataml/data/jsons/sqle/17.20/NaiveBayesTextClassifierPredict.json +287 -0
- teradataml/data/jsons/sqle/17.20/Pack.json +133 -0
- teradataml/data/jsons/sqle/17.20/SVMSparsePredict.json +192 -0
- teradataml/data/jsons/sqle/17.20/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.20/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +149 -0
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_BinCodeFit.json +239 -0
- teradataml/data/jsons/sqle/17.20/TD_BinCodeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_CategoricalSummary.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_Chisq.json +68 -0
- teradataml/data/jsons/sqle/17.20/TD_ClassificationEvaluator.json +146 -0
- teradataml/data/jsons/sqle/17.20/TD_ColumnSummary.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_ColumnTransformer.json +218 -0
- teradataml/data/jsons/sqle/17.20/TD_ConvertTo.json +92 -0
- teradataml/data/jsons/sqle/17.20/TD_DecisionForest.json +260 -0
- teradataml/data/jsons/sqle/17.20/TD_DecisionForestPredict.json +139 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +269 -0
- teradataml/data/jsons/sqle/17.20/TD_FillRowID.json +52 -0
- teradataml/data/jsons/sqle/17.20/TD_FunctionFit.json +46 -0
- teradataml/data/jsons/sqle/17.20/TD_FunctionTransform.json +72 -0
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +507 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +168 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPerSegment.json +411 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPredictPerSegment.json +146 -0
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +93 -0
- teradataml/data/jsons/sqle/17.20/TD_GetRowsWithMissingValues.json +76 -0
- teradataml/data/jsons/sqle/17.20/TD_GetRowsWithoutMissingValues.json +76 -0
- teradataml/data/jsons/sqle/17.20/TD_Histogram.json +152 -0
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +232 -0
- teradataml/data/jsons/sqle/17.20/TD_KMeansPredict.json +87 -0
- teradataml/data/jsons/sqle/17.20/TD_KNN.json +262 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesTextClassifierTrainer.json +137 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +102 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_NumApply.json +147 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +316 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVMPredict.json +124 -0
- teradataml/data/jsons/sqle/17.20/TD_OneHotEncodingFit.json +271 -0
- teradataml/data/jsons/sqle/17.20/TD_OneHotEncodingTransform.json +65 -0
- teradataml/data/jsons/sqle/17.20/TD_OrdinalEncodingFit.json +229 -0
- teradataml/data/jsons/sqle/17.20/TD_OrdinalEncodingTransform.json +75 -0
- teradataml/data/jsons/sqle/17.20/TD_OutlierFilterFit.json +217 -0
- teradataml/data/jsons/sqle/17.20/TD_OutlierFilterTransform.json +48 -0
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_PolynomialFeaturesFit.json +114 -0
- teradataml/data/jsons/sqle/17.20/TD_PolynomialFeaturesTransform.json +72 -0
- teradataml/data/jsons/sqle/17.20/TD_QQNorm.json +111 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +179 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionFit.json +179 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionMinComponents.json +74 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionTransform.json +74 -0
- teradataml/data/jsons/sqle/17.20/TD_RegressionEvaluator.json +138 -0
- teradataml/data/jsons/sqle/17.20/TD_RoundColumns.json +93 -0
- teradataml/data/jsons/sqle/17.20/TD_RowNormalizeFit.json +128 -0
- teradataml/data/jsons/sqle/17.20/TD_RowNormalizeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +389 -0
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +142 -0
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +310 -0
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +120 -0
- teradataml/data/jsons/sqle/17.20/TD_SentimentExtractor.json +194 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +221 -0
- teradataml/data/jsons/sqle/17.20/TD_Silhouette.json +143 -0
- teradataml/data/jsons/sqle/17.20/TD_SimpleImputeFit.json +147 -0
- teradataml/data/jsons/sqle/17.20/TD_SimpleImputeTransform.json +48 -0
- teradataml/data/jsons/sqle/17.20/TD_StrApply.json +240 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_TargetEncodingFit.json +248 -0
- teradataml/data/jsons/sqle/17.20/TD_TargetEncodingTransform.json +75 -0
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +297 -0
- teradataml/data/jsons/sqle/17.20/TD_TrainTestSplit.json +142 -0
- teradataml/data/jsons/sqle/17.20/TD_UnivariateStatistics.json +117 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_VectorDistance.json +183 -0
- teradataml/data/jsons/sqle/17.20/TD_WhichMax.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_WhichMin.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_WordEmbeddings.json +241 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +330 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +195 -0
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +247 -0
- teradataml/data/jsons/sqle/17.20/Unpack.json +188 -0
- teradataml/data/jsons/sqle/17.20/nPath.json +269 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +370 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +460 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +385 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +400 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +401 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +384 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +384 -0
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/tableoperator/17.00/read_nos.json +198 -0
- teradataml/data/jsons/tableoperator/17.05/read_nos.json +198 -0
- teradataml/data/jsons/tableoperator/17.05/write_nos.json +195 -0
- teradataml/data/jsons/tableoperator/17.10/read_nos.json +184 -0
- teradataml/data/jsons/tableoperator/17.10/write_nos.json +195 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/tableoperator/17.20/read_nos.json +183 -0
- teradataml/data/jsons/tableoperator/17.20/write_nos.json +224 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +132 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +396 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +77 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +153 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +107 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +106 -0
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +89 -0
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +104 -0
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +66 -0
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +87 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +134 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +144 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +108 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +108 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_DIFF.json +92 -0
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +114 -0
- teradataml/data/jsons/uaf/17.20/TD_DURBIN_WATSON.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_EXTRACT_RESULTS.json +39 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_GENSERIES4FORMULA.json +85 -0
- teradataml/data/jsons/uaf/17.20/TD_GENSERIES4SINUSOIDS.json +71 -0
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +139 -0
- teradataml/data/jsons/uaf/17.20/TD_HOLT_WINTERS_FORECASTER.json +313 -0
- teradataml/data/jsons/uaf/17.20/TD_IDFFT.json +58 -0
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +81 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_INPUTVALIDATOR.json +64 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +182 -0
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +103 -0
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +181 -0
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_MATRIXMULTIPLY.json +68 -0
- teradataml/data/jsons/uaf/17.20/TD_MINFO.json +67 -0
- teradataml/data/jsons/uaf/17.20/TD_MULTIVAR_REGR.json +179 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +114 -0
- teradataml/data/jsons/uaf/17.20/TD_PORTMAN.json +119 -0
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +175 -0
- teradataml/data/jsons/uaf/17.20/TD_POWERTRANSFORM.json +98 -0
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +194 -0
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +143 -0
- teradataml/data/jsons/uaf/17.20/TD_SELECTION_CRITERIA.json +90 -0
- teradataml/data/jsons/uaf/17.20/TD_SIGNIF_PERIODICITIES.json +80 -0
- teradataml/data/jsons/uaf/17.20/TD_SIGNIF_RESIDMEAN.json +68 -0
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +184 -0
- teradataml/data/jsons/uaf/17.20/TD_SINFO.json +58 -0
- teradataml/data/jsons/uaf/17.20/TD_SMOOTHMA.json +163 -0
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +112 -0
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +95 -0
- teradataml/data/jsons/uaf/17.20/TD_WHITES_GENERAL.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
- teradataml/data/kmeans_example.json +23 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/kmeans_us_arrests_data.csv +51 -0
- teradataml/data/knn_example.json +19 -0
- teradataml/data/knnrecommender_example.json +7 -0
- teradataml/data/knnrecommenderpredict_example.json +12 -0
- teradataml/data/lar_example.json +17 -0
- teradataml/data/larpredict_example.json +30 -0
- teradataml/data/lc_new_predictors.csv +5 -0
- teradataml/data/lc_new_reference.csv +9 -0
- teradataml/data/lda_example.json +9 -0
- teradataml/data/ldainference_example.json +15 -0
- teradataml/data/ldatopicsummary_example.json +9 -0
- teradataml/data/levendist_input.csv +13 -0
- teradataml/data/levenshteindistance_example.json +10 -0
- teradataml/data/linreg_example.json +10 -0
- teradataml/data/load_example_data.py +350 -0
- teradataml/data/loan_prediction.csv +295 -0
- teradataml/data/lungcancer.csv +138 -0
- teradataml/data/mappingdata.csv +12 -0
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/milk_timeseries.csv +157 -0
- teradataml/data/min_max_titanic.csv +4 -0
- teradataml/data/minhash_example.json +6 -0
- teradataml/data/ml_ratings.csv +7547 -0
- teradataml/data/ml_ratings_10.csv +2445 -0
- teradataml/data/mobile_data.csv +13 -0
- teradataml/data/model1_table.csv +5 -0
- teradataml/data/model2_table.csv +5 -0
- teradataml/data/models/License_file.txt +1 -0
- teradataml/data/models/License_file_empty.txt +0 -0
- teradataml/data/models/dataiku_iris_data_ann_thin +0 -0
- teradataml/data/models/dr_iris_rf +0 -0
- teradataml/data/models/iris_db_dt_model_sklearn.onnx +0 -0
- teradataml/data/models/iris_db_dt_model_sklearn_floattensor.onnx +0 -0
- teradataml/data/models/iris_db_glm_model.pmml +57 -0
- teradataml/data/models/iris_db_xgb_model.pmml +4471 -0
- teradataml/data/models/iris_kmeans_model +0 -0
- teradataml/data/models/iris_mojo_glm_h2o_model +0 -0
- teradataml/data/models/iris_mojo_xgb_h2o_model +0 -0
- teradataml/data/modularity_example.json +12 -0
- teradataml/data/movavg_example.json +8 -0
- teradataml/data/mtx1.csv +7 -0
- teradataml/data/mtx2.csv +13 -0
- teradataml/data/multi_model_classification.csv +401 -0
- teradataml/data/multi_model_regression.csv +401 -0
- teradataml/data/mvdfft8.csv +9 -0
- teradataml/data/naivebayes_example.json +10 -0
- teradataml/data/naivebayespredict_example.json +19 -0
- teradataml/data/naivebayestextclassifier2_example.json +7 -0
- teradataml/data/naivebayestextclassifier_example.json +8 -0
- teradataml/data/naivebayestextclassifierpredict_example.json +32 -0
- teradataml/data/name_Find_configure.csv +10 -0
- teradataml/data/namedentityfinder_example.json +14 -0
- teradataml/data/namedentityfinderevaluator_example.json +10 -0
- teradataml/data/namedentityfindertrainer_example.json +6 -0
- teradataml/data/nb_iris_input_test.csv +31 -0
- teradataml/data/nb_iris_input_train.csv +121 -0
- teradataml/data/nbp_iris_model.csv +13 -0
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_extractor_text.csv +2 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/ner_sports_test2.csv +29 -0
- teradataml/data/ner_sports_train.csv +501 -0
- teradataml/data/nerevaluator_example.json +6 -0
- teradataml/data/nerextractor_example.json +18 -0
- teradataml/data/nermem_sports_test.csv +18 -0
- teradataml/data/nermem_sports_train.csv +51 -0
- teradataml/data/nertrainer_example.json +7 -0
- teradataml/data/ngrams_example.json +7 -0
- teradataml/data/notebooks/__init__.py +0 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Aggregate Functions using SQLAlchemy.ipynb +1455 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Arithmetic Functions Using SQLAlchemy.ipynb +1993 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Bit-Byte Manipulation Functions using SQLAlchemy.ipynb +1492 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Built-in functions using SQLAlchemy.ipynb +536 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Regular Expressions Using SQLAlchemy.ipynb +570 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage String Functions Using SQLAlchemy.ipynb +2559 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Window Aggregate Functions using SQLAlchemy.ipynb +2911 -0
- teradataml/data/notebooks/sqlalchemy/Using Generic SQLAlchemy ClauseElements teradataml DataFrame assign method.ipynb +698 -0
- teradataml/data/notebooks/sqlalchemy/__init__.py +0 -0
- teradataml/data/notebooks/sqlalchemy/teradataml filtering using SQLAlchemy ClauseElements.ipynb +784 -0
- teradataml/data/npath_example.json +23 -0
- teradataml/data/ntree_example.json +14 -0
- teradataml/data/numeric_strings.csv +5 -0
- teradataml/data/numerics.csv +4 -0
- teradataml/data/ocean_buoy.csv +17 -0
- teradataml/data/ocean_buoy2.csv +17 -0
- teradataml/data/ocean_buoys.csv +28 -0
- teradataml/data/ocean_buoys2.csv +10 -0
- teradataml/data/ocean_buoys_nonpti.csv +28 -0
- teradataml/data/ocean_buoys_seq.csv +29 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +92 -0
- teradataml/data/optional_event_table.csv +4 -0
- teradataml/data/orders1.csv +11 -0
- teradataml/data/orders1_12.csv +13 -0
- teradataml/data/orders_ex.csv +4 -0
- teradataml/data/pack_example.json +9 -0
- teradataml/data/package_tracking.csv +19 -0
- teradataml/data/package_tracking_pti.csv +19 -0
- teradataml/data/pagerank_example.json +13 -0
- teradataml/data/paragraphs_input.csv +6 -0
- teradataml/data/pathanalyzer_example.json +8 -0
- teradataml/data/pathgenerator_example.json +8 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/pattern_matching_data.csv +11 -0
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/phrases.csv +7 -0
- teradataml/data/pivot_example.json +9 -0
- teradataml/data/pivot_input.csv +22 -0
- teradataml/data/playerRating.csv +31 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/postagger_example.json +7 -0
- teradataml/data/posttagger_output.csv +44 -0
- teradataml/data/production_data.csv +17 -0
- teradataml/data/production_data2.csv +7 -0
- teradataml/data/randomsample_example.json +32 -0
- teradataml/data/randomwalksample_example.json +9 -0
- teradataml/data/rank_table.csv +6 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/ref_mobile_data.csv +4 -0
- teradataml/data/ref_mobile_data_dense.csv +2 -0
- teradataml/data/ref_url.csv +17 -0
- teradataml/data/restaurant_reviews.csv +7 -0
- teradataml/data/retail_churn_table.csv +27772 -0
- teradataml/data/river_data.csv +145 -0
- teradataml/data/roc_example.json +8 -0
- teradataml/data/roc_input.csv +101 -0
- teradataml/data/rule_inputs.csv +6 -0
- teradataml/data/rule_table.csv +2 -0
- teradataml/data/sales.csv +7 -0
- teradataml/data/sales_transaction.csv +501 -0
- teradataml/data/salesdata.csv +342 -0
- teradataml/data/sample_cities.csv +3 -0
- teradataml/data/sample_shapes.csv +11 -0
- teradataml/data/sample_streets.csv +3 -0
- teradataml/data/sampling_example.json +16 -0
- teradataml/data/sax_example.json +17 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +74 -0
- teradataml/data/scale_housing.csv +11 -0
- teradataml/data/scale_housing_test.csv +6 -0
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scale_stat.csv +11 -0
- teradataml/data/scalebypartition_example.json +13 -0
- teradataml/data/scalemap_example.json +13 -0
- teradataml/data/scalesummary_example.json +12 -0
- teradataml/data/score_category.csv +101 -0
- teradataml/data/score_summary.csv +4 -0
- teradataml/data/script_example.json +10 -0
- teradataml/data/scripts/deploy_script.py +84 -0
- teradataml/data/scripts/lightgbm/dataset.template +175 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +264 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +234 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +177 -0
- teradataml/data/scripts/mapper.R +20 -0
- teradataml/data/scripts/mapper.py +16 -0
- teradataml/data/scripts/mapper_replace.py +16 -0
- teradataml/data/scripts/sklearn/__init__.py +0 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +205 -0
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +148 -0
- teradataml/data/scripts/sklearn/sklearn_function.template +144 -0
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +166 -0
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +161 -0
- teradataml/data/scripts/sklearn/sklearn_score.py +145 -0
- teradataml/data/scripts/sklearn/sklearn_transform.py +327 -0
- teradataml/data/sdk/modelops/modelops_spec.json +101737 -0
- teradataml/data/seeds.csv +10 -0
- teradataml/data/sentenceextractor_example.json +7 -0
- teradataml/data/sentiment_extract_input.csv +11 -0
- teradataml/data/sentiment_train.csv +16 -0
- teradataml/data/sentiment_word.csv +20 -0
- teradataml/data/sentiment_word_input.csv +20 -0
- teradataml/data/sentimentextractor_example.json +24 -0
- teradataml/data/sentimenttrainer_example.json +8 -0
- teradataml/data/sequence_table.csv +10 -0
- teradataml/data/seriessplitter_example.json +8 -0
- teradataml/data/sessionize_example.json +17 -0
- teradataml/data/sessionize_table.csv +116 -0
- teradataml/data/setop_test1.csv +24 -0
- teradataml/data/setop_test2.csv +22 -0
- teradataml/data/soc_nw_edges.csv +11 -0
- teradataml/data/soc_nw_vertices.csv +8 -0
- teradataml/data/souvenir_timeseries.csv +168 -0
- teradataml/data/sparse_iris_attribute.csv +5 -0
- teradataml/data/sparse_iris_test.csv +121 -0
- teradataml/data/sparse_iris_train.csv +601 -0
- teradataml/data/star1.csv +6 -0
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/state_transition.csv +5 -0
- teradataml/data/stock_data.csv +53 -0
- teradataml/data/stock_movement.csv +11 -0
- teradataml/data/stock_vol.csv +76 -0
- teradataml/data/stop_words.csv +8 -0
- teradataml/data/store_sales.csv +37 -0
- teradataml/data/stringsimilarity_example.json +8 -0
- teradataml/data/strsimilarity_input.csv +13 -0
- teradataml/data/students.csv +101 -0
- teradataml/data/svm_iris_input_test.csv +121 -0
- teradataml/data/svm_iris_input_train.csv +481 -0
- teradataml/data/svm_iris_model.csv +7 -0
- teradataml/data/svmdense_example.json +10 -0
- teradataml/data/svmdensepredict_example.json +19 -0
- teradataml/data/svmsparse_example.json +8 -0
- teradataml/data/svmsparsepredict_example.json +14 -0
- teradataml/data/svmsparsesummary_example.json +8 -0
- teradataml/data/target_mobile_data.csv +13 -0
- teradataml/data/target_mobile_data_dense.csv +5 -0
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/templatedata.csv +1201 -0
- teradataml/data/templates/open_source_ml.json +11 -0
- teradataml/data/teradata_icon.ico +0 -0
- teradataml/data/teradataml_example.json +1473 -0
- teradataml/data/test_classification.csv +101 -0
- teradataml/data/test_loan_prediction.csv +53 -0
- teradataml/data/test_pacf_12.csv +37 -0
- teradataml/data/test_prediction.csv +101 -0
- teradataml/data/test_regression.csv +101 -0
- teradataml/data/test_river2.csv +109 -0
- teradataml/data/text_inputs.csv +6 -0
- teradataml/data/textchunker_example.json +8 -0
- teradataml/data/textclassifier_example.json +7 -0
- teradataml/data/textclassifier_input.csv +7 -0
- teradataml/data/textclassifiertrainer_example.json +7 -0
- teradataml/data/textmorph_example.json +11 -0
- teradataml/data/textparser_example.json +15 -0
- teradataml/data/texttagger_example.json +12 -0
- teradataml/data/texttokenizer_example.json +7 -0
- teradataml/data/texttrainer_input.csv +11 -0
- teradataml/data/tf_example.json +7 -0
- teradataml/data/tfidf_example.json +14 -0
- teradataml/data/tfidf_input1.csv +201 -0
- teradataml/data/tfidf_train.csv +6 -0
- teradataml/data/time_table1.csv +535 -0
- teradataml/data/time_table2.csv +14 -0
- teradataml/data/timeseriesdata.csv +1601 -0
- teradataml/data/timeseriesdatasetsd4.csv +105 -0
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic.csv +892 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/token_table.csv +696 -0
- teradataml/data/train_multiclass.csv +101 -0
- teradataml/data/train_regression.csv +101 -0
- teradataml/data/train_regression_multiple_labels.csv +101 -0
- teradataml/data/train_tracking.csv +28 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/transformation_table.csv +6 -0
- teradataml/data/transformation_table_new.csv +2 -0
- teradataml/data/tv_spots.csv +16 -0
- teradataml/data/twod_climate_data.csv +117 -0
- teradataml/data/uaf_example.json +529 -0
- teradataml/data/univariatestatistics_example.json +9 -0
- teradataml/data/unpack_example.json +10 -0
- teradataml/data/unpivot_example.json +25 -0
- teradataml/data/unpivot_input.csv +8 -0
- teradataml/data/url_data.csv +10 -0
- teradataml/data/us_air_pass.csv +37 -0
- teradataml/data/us_population.csv +624 -0
- teradataml/data/us_states_shapes.csv +52 -0
- teradataml/data/varmax_example.json +18 -0
- teradataml/data/vectordistance_example.json +30 -0
- teradataml/data/ville_climatedata.csv +121 -0
- teradataml/data/ville_tempdata.csv +12 -0
- teradataml/data/ville_tempdata1.csv +12 -0
- teradataml/data/ville_temperature.csv +11 -0
- teradataml/data/waveletTable.csv +1605 -0
- teradataml/data/waveletTable2.csv +1605 -0
- teradataml/data/weightedmovavg_example.json +9 -0
- teradataml/data/wft_testing.csv +5 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/wine_data.csv +1600 -0
- teradataml/data/word_embed_input_table1.csv +6 -0
- teradataml/data/word_embed_input_table2.csv +5 -0
- teradataml/data/word_embed_model.csv +23 -0
- teradataml/data/words_input.csv +13 -0
- teradataml/data/xconvolve_complex_left.csv +6 -0
- teradataml/data/xconvolve_complex_leftmulti.csv +6 -0
- teradataml/data/xgboost_example.json +36 -0
- teradataml/data/xgboostpredict_example.json +32 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/__init__.py +0 -0
- teradataml/dataframe/copy_to.py +2446 -0
- teradataml/dataframe/data_transfer.py +2840 -0
- teradataml/dataframe/dataframe.py +20908 -0
- teradataml/dataframe/dataframe_utils.py +2114 -0
- teradataml/dataframe/fastload.py +794 -0
- teradataml/dataframe/functions.py +2110 -0
- teradataml/dataframe/indexer.py +424 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +1171 -0
- teradataml/dataframe/sql.py +10904 -0
- teradataml/dataframe/sql_function_parameters.py +440 -0
- teradataml/dataframe/sql_functions.py +652 -0
- teradataml/dataframe/sql_interfaces.py +220 -0
- teradataml/dataframe/vantage_function_types.py +675 -0
- teradataml/dataframe/window.py +694 -0
- teradataml/dbutils/__init__.py +3 -0
- teradataml/dbutils/dbutils.py +2871 -0
- teradataml/dbutils/filemgr.py +318 -0
- teradataml/gen_ai/__init__.py +2 -0
- teradataml/gen_ai/convAI.py +473 -0
- teradataml/geospatial/__init__.py +4 -0
- teradataml/geospatial/geodataframe.py +1105 -0
- teradataml/geospatial/geodataframecolumn.py +392 -0
- teradataml/geospatial/geometry_types.py +926 -0
- teradataml/hyperparameter_tuner/__init__.py +1 -0
- teradataml/hyperparameter_tuner/optimizer.py +4115 -0
- teradataml/hyperparameter_tuner/utils.py +303 -0
- teradataml/lib/__init__.py +0 -0
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/lib/libaed_0_1_ppc64le.so +0 -0
- teradataml/opensource/__init__.py +1 -0
- teradataml/opensource/_base.py +1321 -0
- teradataml/opensource/_class.py +464 -0
- teradataml/opensource/_constants.py +61 -0
- teradataml/opensource/_lightgbm.py +949 -0
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +267 -0
- teradataml/options/__init__.py +148 -0
- teradataml/options/configure.py +489 -0
- teradataml/options/display.py +187 -0
- teradataml/plot/__init__.py +3 -0
- teradataml/plot/axis.py +1427 -0
- teradataml/plot/constants.py +15 -0
- teradataml/plot/figure.py +431 -0
- teradataml/plot/plot.py +810 -0
- teradataml/plot/query_generator.py +83 -0
- teradataml/plot/subplot.py +216 -0
- teradataml/scriptmgmt/UserEnv.py +4273 -0
- teradataml/scriptmgmt/__init__.py +3 -0
- teradataml/scriptmgmt/lls_utils.py +2157 -0
- teradataml/sdk/README.md +79 -0
- teradataml/sdk/__init__.py +4 -0
- teradataml/sdk/_auth_modes.py +422 -0
- teradataml/sdk/_func_params.py +487 -0
- teradataml/sdk/_json_parser.py +453 -0
- teradataml/sdk/_openapi_spec_constants.py +249 -0
- teradataml/sdk/_utils.py +236 -0
- teradataml/sdk/api_client.py +900 -0
- teradataml/sdk/constants.py +62 -0
- teradataml/sdk/modelops/__init__.py +98 -0
- teradataml/sdk/modelops/_client.py +409 -0
- teradataml/sdk/modelops/_constants.py +304 -0
- teradataml/sdk/modelops/models.py +2308 -0
- teradataml/sdk/spinner.py +107 -0
- teradataml/series/__init__.py +0 -0
- teradataml/series/series.py +537 -0
- teradataml/series/series_utils.py +71 -0
- teradataml/store/__init__.py +12 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +658 -0
- teradataml/store/feature_store/feature_store.py +4814 -0
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +7330 -0
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/Apply.py +979 -0
- teradataml/table_operators/Script.py +1739 -0
- teradataml/table_operators/TableOperator.py +1343 -0
- teradataml/table_operators/__init__.py +2 -0
- teradataml/table_operators/apply_query_generator.py +262 -0
- teradataml/table_operators/query_generator.py +493 -0
- teradataml/table_operators/table_operator_query_generator.py +462 -0
- teradataml/table_operators/table_operator_util.py +726 -0
- teradataml/table_operators/templates/dataframe_apply.template +184 -0
- teradataml/table_operators/templates/dataframe_map.template +176 -0
- teradataml/table_operators/templates/dataframe_register.template +73 -0
- teradataml/table_operators/templates/dataframe_udf.template +67 -0
- teradataml/table_operators/templates/script_executor.template +170 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +53 -0
- teradataml/utils/__init__.py +0 -0
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +943 -0
- teradataml/utils/internal_buffer.py +122 -0
- teradataml/utils/print_versions.py +206 -0
- teradataml/utils/utils.py +451 -0
- teradataml/utils/validators.py +3305 -0
- teradataml-20.0.0.8.dist-info/METADATA +2804 -0
- teradataml-20.0.0.8.dist-info/RECORD +1208 -0
- teradataml-20.0.0.8.dist-info/WHEEL +5 -0
- teradataml-20.0.0.8.dist-info/top_level.txt +1 -0
- teradataml-20.0.0.8.dist-info/zip-safe +1 -0
|
@@ -0,0 +1,1384 @@
|
|
|
1
|
+
# ##################################################################
|
|
2
|
+
#
|
|
3
|
+
# Copyright 2025 Teradata. All rights reserved.
|
|
4
|
+
# TERADATA CONFIDENTIAL AND TRADE SECRET
|
|
5
|
+
#
|
|
6
|
+
# Primary Owner: Sweta Shaw
|
|
7
|
+
# Email Id: Sweta.Shaw@Teradata.com
|
|
8
|
+
#
|
|
9
|
+
# Secondary Owner: Akhil Bisht
|
|
10
|
+
# Email Id: AKHIL.BISHT@Teradata.com
|
|
11
|
+
#
|
|
12
|
+
# Version: 1.1
|
|
13
|
+
# Function Version: 1.0
|
|
14
|
+
# ##################################################################
|
|
15
|
+
|
|
16
|
+
# Python libraries
|
|
17
|
+
import numpy as np
|
|
18
|
+
import pandas as pd
|
|
19
|
+
import time
|
|
20
|
+
import warnings
|
|
21
|
+
|
|
22
|
+
# Teradata libraries
|
|
23
|
+
from teradataml import SMOTE
|
|
24
|
+
from teradataml.dataframe.dataframe import DataFrame
|
|
25
|
+
from teradataml.dataframe.copy_to import copy_to_sql
|
|
26
|
+
from teradataml import OutlierFilterFit, OutlierFilterTransform, FillRowId
|
|
27
|
+
from teradataml import RoundColumns, TeradataMlException
|
|
28
|
+
from teradataml import ScaleFit, ScaleTransform
|
|
29
|
+
from teradataml import UtilFuncs, TeradataConstants
|
|
30
|
+
from teradataml.dbutils.dbutils import execute_sql
|
|
31
|
+
from teradataml.common.garbagecollector import GarbageCollector
|
|
32
|
+
from teradataml.common.messages import Messages, MessageCodes
|
|
33
|
+
from teradataml.context.context import _get_current_databasename
|
|
34
|
+
from teradataml.utils.validators import _Validators
|
|
35
|
+
from teradataml import configure, INTEGER
|
|
36
|
+
from teradataml.common.constants import TeradataConstants
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class _DataPreparation:
|
|
40
|
+
|
|
41
|
+
def __init__(self,
|
|
42
|
+
data,
|
|
43
|
+
target_column,
|
|
44
|
+
id_column,
|
|
45
|
+
verbose=0,
|
|
46
|
+
excluded_columns=None,
|
|
47
|
+
custom_data=None,
|
|
48
|
+
data_transform_dict=None,
|
|
49
|
+
task_type="Regression",
|
|
50
|
+
**kwargs):
|
|
51
|
+
"""
|
|
52
|
+
DESCRIPTION:
|
|
53
|
+
Function initializes the data, target column and columns datatypes
|
|
54
|
+
for data preparation.
|
|
55
|
+
|
|
56
|
+
PARAMETERS:
|
|
57
|
+
data:
|
|
58
|
+
Required Argument.
|
|
59
|
+
Specifies the input teradataml Dataframe for data preparation phase.
|
|
60
|
+
Types: teradataml Dataframe
|
|
61
|
+
|
|
62
|
+
target_column:
|
|
63
|
+
Required Argument.
|
|
64
|
+
Specifies the name of the target column in "data".
|
|
65
|
+
Types: str
|
|
66
|
+
|
|
67
|
+
id_column:
|
|
68
|
+
Required Argument.
|
|
69
|
+
Specifies the name of the unique identifier column in "data".
|
|
70
|
+
Types: str
|
|
71
|
+
|
|
72
|
+
verbose:
|
|
73
|
+
Optional Argument.
|
|
74
|
+
Specifies the detailed execution steps based on verbose level.
|
|
75
|
+
Default Value: 0
|
|
76
|
+
Permitted Values:
|
|
77
|
+
* 0: prints the progress bar and leaderboard
|
|
78
|
+
* 1: prints the execution steps of AutoML.
|
|
79
|
+
* 2: prints the intermediate data between the execution of each step of AutoML.
|
|
80
|
+
Types: int
|
|
81
|
+
|
|
82
|
+
excluded_columns:
|
|
83
|
+
Required Argument.
|
|
84
|
+
Specifies the columns should be excluded from any processing.
|
|
85
|
+
Types: str or list of strings (str)
|
|
86
|
+
|
|
87
|
+
custom_data:
|
|
88
|
+
Optional Argument.
|
|
89
|
+
Specifies json object containing user customized input.
|
|
90
|
+
Types: json object
|
|
91
|
+
|
|
92
|
+
data_transform_dict:
|
|
93
|
+
Optional Argument.
|
|
94
|
+
Specifies the parameters for data transformation.
|
|
95
|
+
Types: dict
|
|
96
|
+
|
|
97
|
+
task_type:
|
|
98
|
+
Required Argument.
|
|
99
|
+
Specifies the task type for AutoML, whether to apply regresion OR classification
|
|
100
|
+
on the provived dataset.
|
|
101
|
+
Default Value: "Regression"
|
|
102
|
+
Permitted Values: "Regression", "Classification"
|
|
103
|
+
Types: str
|
|
104
|
+
|
|
105
|
+
**kwargs:
|
|
106
|
+
Specifies the additional arguments for data preparation. Below
|
|
107
|
+
are the additional arguments:
|
|
108
|
+
volatile:
|
|
109
|
+
Optional Argument.
|
|
110
|
+
Specifies whether to put the interim results of the
|
|
111
|
+
functions in a volatile table or not. When set to
|
|
112
|
+
True, results are stored in a volatile table,
|
|
113
|
+
otherwise not.
|
|
114
|
+
Default Value: False
|
|
115
|
+
Types: bool
|
|
116
|
+
|
|
117
|
+
persist:
|
|
118
|
+
Optional Argument.
|
|
119
|
+
Specifies whether to persist the interim results of the
|
|
120
|
+
functions in a table or not. When set to True,
|
|
121
|
+
results are persisted in a table; otherwise,
|
|
122
|
+
results are garbage collected at the end of the
|
|
123
|
+
session.
|
|
124
|
+
Default Value: False
|
|
125
|
+
Types: bool
|
|
126
|
+
|
|
127
|
+
seed:
|
|
128
|
+
Optional Argument.
|
|
129
|
+
Specifies the random seed for reproducibility.
|
|
130
|
+
Default Value: 42
|
|
131
|
+
Types: int
|
|
132
|
+
|
|
133
|
+
automl_phases:
|
|
134
|
+
Optional Argument.
|
|
135
|
+
Specifies the phase of AutoML to be executed.
|
|
136
|
+
Default Value: None
|
|
137
|
+
Types: str or list of str.
|
|
138
|
+
|
|
139
|
+
cluster:
|
|
140
|
+
Optional Argument.
|
|
141
|
+
Specifies whether to run data preparation for handling clustering.
|
|
142
|
+
Default Value: False
|
|
143
|
+
Types: bool
|
|
144
|
+
|
|
145
|
+
imbalance_handling_method:
|
|
146
|
+
Optional Argument.
|
|
147
|
+
Specifies which imbalance handling method to use.
|
|
148
|
+
Default Value: "SMOTE"
|
|
149
|
+
Permitted Values: "SMOTE", "ADASYN", "SMOTETomek", "NearMiss"
|
|
150
|
+
Types: str
|
|
151
|
+
|
|
152
|
+
enable_lasso:
|
|
153
|
+
Optional Argument.
|
|
154
|
+
Specifies whether to use lasso regression for feature selection.
|
|
155
|
+
By default, only RFE and PCA are used for feature selection.
|
|
156
|
+
Default Value: False
|
|
157
|
+
Types: bool
|
|
158
|
+
|
|
159
|
+
raise_errors:
|
|
160
|
+
Optional Argument.
|
|
161
|
+
Specifies whether to raise errors or warnings for
|
|
162
|
+
non-blocking errors. When set to True, raises errors,
|
|
163
|
+
otherwise raises warnings.
|
|
164
|
+
Default Value: False
|
|
165
|
+
Types: bool
|
|
166
|
+
RETURNS:
|
|
167
|
+
None
|
|
168
|
+
|
|
169
|
+
RAISES:
|
|
170
|
+
None
|
|
171
|
+
|
|
172
|
+
EXAMPLES:
|
|
173
|
+
>>> excluded_cols = ["id", "timestamp"]
|
|
174
|
+
>>> transform_dict = {"target_col_encode_ind": False, "classification_type": True}
|
|
175
|
+
>>> data_prep = _DataPreparation(data=df,
|
|
176
|
+
... target_column="target",
|
|
177
|
+
... id_column="id",
|
|
178
|
+
... verbose=1,
|
|
179
|
+
... excluded_columns=excluded_cols,
|
|
180
|
+
... data_transform_dict=transform_dict,
|
|
181
|
+
... task_type="Classification",
|
|
182
|
+
... persist=True,
|
|
183
|
+
... seed=42)
|
|
184
|
+
"""
|
|
185
|
+
self.data = data
|
|
186
|
+
self.target_column = target_column
|
|
187
|
+
self.id_column = id_column
|
|
188
|
+
self.verbose = verbose
|
|
189
|
+
self.excluded_columns = excluded_columns
|
|
190
|
+
self.data_transform_dict = data_transform_dict
|
|
191
|
+
self.custom_data = custom_data
|
|
192
|
+
self.task_type = task_type
|
|
193
|
+
self.volatile = kwargs.get("volatile", False)
|
|
194
|
+
self.persist = kwargs.get("persist", False)
|
|
195
|
+
self.aml_phases = kwargs.get("automl_phases", None)
|
|
196
|
+
self.cluster = kwargs.get('cluster', False)
|
|
197
|
+
self._data_sampling_method = kwargs.get("imbalance_handling_method", "SMOTE")
|
|
198
|
+
|
|
199
|
+
# Setting default value for auto run mode
|
|
200
|
+
self._scale_method_reg = "STD"
|
|
201
|
+
self._scale_method_cls = "RANGE"
|
|
202
|
+
self._scale_method_clust = "STD"
|
|
203
|
+
|
|
204
|
+
self.data_types = {key: value for key, value in self.data._column_names_and_types}
|
|
205
|
+
self.seed = kwargs.get("seed", 42)
|
|
206
|
+
# np.random.seed() affects the random number generation in numpy and sklearn
|
|
207
|
+
# setting this changes the global state of the random number generator
|
|
208
|
+
# hence, setting the seed only if it is not None
|
|
209
|
+
if kwargs.get("seed") is not None:
|
|
210
|
+
np.random.seed(self.seed)
|
|
211
|
+
|
|
212
|
+
self.data_mapping = kwargs.get("data_mapping", {})
|
|
213
|
+
# Setting lasso feature selection flag
|
|
214
|
+
self.enable_lasso = kwargs.get('enable_lasso', False)
|
|
215
|
+
self.raise_errors = kwargs.get("raise_errors", False)
|
|
216
|
+
|
|
217
|
+
def data_preparation(self,
                     auto=True):
    """
    DESCRIPTION:
        Function to perform following tasks:-
            1. Performs outlier processing and transformation on dataset.
            2. Performs feature selection using RFE, PCA, and Lasso.
            3. Performs feature scaling.

    PARAMETERS:
        auto:
            Optional Argument.
            Specifies whether to run AutoML in custom mode or auto mode.
            When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
            Default Value: True
            Types: bool

    RETURNS:
        tuple containing, list of feature lists, data transformation dictionary,
        and data mapping dictionary.

    RAISES:
        None

    EXAMPLES:
        >>> data_prep = _DataPreparation(data=df,
        ...                              target_column="target",
        ...                              verbose=1,
        ...                              data_transform_dict=transform_dict,
        ...                              task_type="Classification")
        >>> feature_lists, transform_dict, data_mapping = data_prep.data_preparation(auto=True)
    """
    self._display_heading(phase=2,
                          progress_bar=self.progress_bar,
                          automl_phases=self.aml_phases)
    self._display_msg(msg='Data preparation started ...',
                      progress_bar=self.progress_bar)

    # Custom mode: pick up user-specified scaling and sampling choices first.
    if not auto:
        self._set_custom_scaling_method()
        self._set_custom_sampling()

    # Outlier treatment on the dataset.
    self._handle_outliers(auto)
    self.progress_bar.update()

    # Class-imbalance detection is only relevant for classification tasks.
    needs_balancing = False
    if not self.cluster and self.task_type.lower() == "classification":
        ratio, minor_cls = self._check_data_imbalance()
        # A ratio at or below 0.4 marks the dataset as imbalanced.
        needs_balancing = ratio <= 0.4
        # SMOTE/ADASYN handle the imbalance in-database when selected.
        if needs_balancing and self._data_sampling_method.lower() in ["smote", "adasyn"]:
            smote_done = self._data_sampling_smote(minority_class=str(minor_cls),
                                                   imb_ratio=ratio)
            # Fall back to default sampling only if SMOTE/ADASYN failed.
            needs_balancing = not smote_done

    # Temporary pulling of data for feature selection (will change after STO);
    # float-typed generated features are handled before selection and scaling.
    train_df = self._handle_generated_features()
    self.progress_bar.update()

    # Default sampling path for datasets still flagged as imbalanced.
    if needs_balancing:
        train_df = self._data_sampling(train_df)
    self.progress_bar.update()

    # Sort by id to undo any row shuffling introduced by sampling.
    train_df = train_df.sort_values(by=self.id_column)

    if not self.cluster:
        if self.enable_lasso:
            # Lasso selection, then scale the selected features.
            self._feature_selection_Lasso(train_df)
            self._scaling_features(feature_selection_mtd="lasso")
            self.progress_bar.update()

        # RFE selection, then scale the selected features.
        self._feature_selection_RFE(train_df)
        self._scaling_features(feature_selection_mtd="rfe")
        self.progress_bar.update()
    else:
        self._scaling_features(feature_selection_mtd="Non_pca")
        self.progress_bar.update()

    # PCA requires scaled inputs, so scaling precedes the reduction step.
    self._scaling_features(feature_selection_mtd="pca")
    self._feature_selection_PCA()
    self.progress_bar.update()

    # Assemble the feature lists to return, depending on the run flavour.
    if not self.cluster:
        feature_sets = ([self.lasso_feature, self.rfe_feature, self.pca_feature]
                        if self.enable_lasso
                        else [self.rfe_feature, self.pca_feature])
    else:
        feature_sets = [self.pca_feature, self.non_pca_feature]
    return feature_sets, self.data_transform_dict, self.data_mapping
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _data_sampling_smote(self,
                         minority_class,
                         imb_ratio):
    """
    DESCRIPTION:
        Internal function to handle data imbalance in dataset using SMOTE technique.

    PARAMETERS:
        minority_class:
            Required Argument.
            Specifies the minority class for which synthetic samples need to be
            generated.
            Note:
                * The label for minority class under response column must be numeric integer.
            Types: str

        imb_ratio:
            Required Argument.
            Specifies the imbalance ratio in the dataset.
            Types: float

    RETURNS:
        True if SMOTE sampling is successful, False otherwise.

    RAISES:
        None

    EXAMPLES:
        >>> res = self._data_sampling_smote(minority_class="1", imb_ratio=0.3)
    """
    self._display_msg(msg="Handling data imbalance using {} ...".format(self._data_sampling_method),
                      progress_bar=self.progress_bar,
                      show_data=True)

    # Neighbour count is bounded by the available minority rows (5..99).
    minority_count = self.data[self.data[self.target_column] == minority_class].shape[0]
    neighbours = max(5, min(minority_count - 1, 99))

    # Scale the oversampling factor with the severity of the imbalance.
    if 0.3 < imb_ratio <= 0.4:
        os_factor = 1.5
    elif 0.2 < imb_ratio <= 0.3:
        os_factor = 2
    elif 0.1 < imb_ratio <= 0.2:
        os_factor = 3
    else:
        os_factor = 5

    # Argument set for the in-database SMOTE function.
    smote_args = {
        "id_column": self.id_column,
        "input_columns": [col for col in self.data.columns
                          if col not in [self.id_column, self.target_column]],
        "minority_class": minority_class,
        "oversampling_factor": os_factor,
        "sampling_strategy": self._data_sampling_method.lower(),
        "n_neighbors": neighbours,
        "persist": True,  # persist avoids parser memory errors
        "seed": self.seed,
        "display_table_name": False
    }
    try:
        # Generate synthetic samples for the minority class.
        synthetic_df = SMOTE(data=self.data,
                             response_column=self.target_column,
                             **smote_args).result
    except TeradataMlException as e:
        if self.raise_errors:
            raise e
        # Warn and let the caller fall back to the default sampling technique.
        warnings.warn(message="TD_SMOTE function failed, proceeding with default data sampling technique.",
                      stacklevel=2)
        return False

    # Append the synthetic rows to the original data.
    combined = self.data.concat(synthetic_df, ignore_index=False)
    # Every column except the id column; fresh unique ids are filled below.
    non_id_cols = [col for col in combined.columns if col != self.id_column]
    # Note: persist is set to True to avoid parser memory error.
    reindexed = FillRowId(data=combined.select(non_id_cols),
                          row_id_column=self.id_column,
                          persist=True,
                          display_table_name=False).result
    self.data = reindexed

    # Register the interim tables for end-of-session cleanup.
    GarbageCollector._add_to_garbagecollector(synthetic_df._table_name)
    GarbageCollector._add_to_garbagecollector(reindexed._table_name)

    self._display_msg(msg="Completed data imbalance handling.",
                      progress_bar=self.progress_bar,
                      show_data=True)
    return True
|
|
412
|
+
|
|
413
|
+
def _handle_outliers(self,
|
|
414
|
+
auto):
|
|
415
|
+
"""
|
|
416
|
+
DESCRIPTION:
|
|
417
|
+
Function to handle existing outliers in dataset based on running mode.
|
|
418
|
+
|
|
419
|
+
PARAMETERS:
|
|
420
|
+
auto:
|
|
421
|
+
Required Argument.
|
|
422
|
+
Specifies whether to run AutoML in custom mode or auto mode.
|
|
423
|
+
When set to False, runs in custom mode. Otherwise runs in auto mode.
|
|
424
|
+
Types: bool
|
|
425
|
+
|
|
426
|
+
RETURNS:
|
|
427
|
+
None
|
|
428
|
+
|
|
429
|
+
RAISES:
|
|
430
|
+
None
|
|
431
|
+
|
|
432
|
+
EXAMPLES:
|
|
433
|
+
>>> self._handle_outliers(auto=True)
|
|
434
|
+
"""
|
|
435
|
+
if auto:
|
|
436
|
+
self._outlier_processing()
|
|
437
|
+
else:
|
|
438
|
+
self._custom_outlier_processing()
|
|
439
|
+
|
|
440
|
+
def _check_data_imbalance(self,
|
|
441
|
+
data):
|
|
442
|
+
"""
|
|
443
|
+
DESCRIPTION:
|
|
444
|
+
Internal function calculate and checks the imbalance in dataset
|
|
445
|
+
in case of classification.
|
|
446
|
+
|
|
447
|
+
PARAMETERS:
|
|
448
|
+
data:
|
|
449
|
+
Required Argument.
|
|
450
|
+
Specifies the input teradataml DataFrame.
|
|
451
|
+
Types: teradataml DataFrame
|
|
452
|
+
|
|
453
|
+
RETURNS:
|
|
454
|
+
None
|
|
455
|
+
|
|
456
|
+
RAISES:
|
|
457
|
+
None
|
|
458
|
+
|
|
459
|
+
EXAMPLES:
|
|
460
|
+
>>> result = self._check_data_imbalance(data=training_data)
|
|
461
|
+
"""
|
|
462
|
+
pass
|
|
463
|
+
|
|
464
|
+
def _data_sampling(self,
|
|
465
|
+
data):
|
|
466
|
+
"""
|
|
467
|
+
DESCRIPTION:
|
|
468
|
+
Function to handle data imbalance in dataset using sampling techniques
|
|
469
|
+
in case of classification.
|
|
470
|
+
|
|
471
|
+
PARAMETERS:
|
|
472
|
+
data:
|
|
473
|
+
Required Argument.
|
|
474
|
+
Specifies the input teradataml DataFrame.
|
|
475
|
+
Types: pandas DataFrame
|
|
476
|
+
|
|
477
|
+
RETURNS:
|
|
478
|
+
None
|
|
479
|
+
|
|
480
|
+
RAISES:
|
|
481
|
+
None
|
|
482
|
+
|
|
483
|
+
EXAMPLES:
|
|
484
|
+
>>> sampled_data = self._data_sampling(data=imbalanced_data)
|
|
485
|
+
"""
|
|
486
|
+
pass
|
|
487
|
+
|
|
488
|
+
def _set_custom_sampling(self):
|
|
489
|
+
"""
|
|
490
|
+
DESCRIPTION:
|
|
491
|
+
Internal Function to handle customized data sampling for imbalance dataset.
|
|
492
|
+
|
|
493
|
+
PARAMETERS:
|
|
494
|
+
None
|
|
495
|
+
|
|
496
|
+
RETURNS:
|
|
497
|
+
None
|
|
498
|
+
|
|
499
|
+
RAISES:
|
|
500
|
+
None
|
|
501
|
+
|
|
502
|
+
EXAMPLES:
|
|
503
|
+
>>> self._set_custom_sampling()
|
|
504
|
+
"""
|
|
505
|
+
pass
|
|
506
|
+
|
|
507
|
+
def _outlier_handling_techniques(self):
|
|
508
|
+
"""
|
|
509
|
+
DESCRIPTION:
|
|
510
|
+
Function determines the handling techniques[drop rows/impute values] for outlier columns in the dataset.
|
|
511
|
+
|
|
512
|
+
PARAMETERS:
|
|
513
|
+
None
|
|
514
|
+
|
|
515
|
+
RETURNS:
|
|
516
|
+
tuple containing list of columns to drop rows and list of columns to impute.
|
|
517
|
+
|
|
518
|
+
RAISES:
|
|
519
|
+
None
|
|
520
|
+
|
|
521
|
+
EXAMPLES:
|
|
522
|
+
>>> drop_rows_cols, impute_cols = self._outlier_handling_techniques()
|
|
523
|
+
"""
|
|
524
|
+
columns_to_drop_rows = []
|
|
525
|
+
columns_to_impute = []
|
|
526
|
+
# Keeping default method for outlier detection "Tukey"
|
|
527
|
+
outlier_method = "Tukey"
|
|
528
|
+
|
|
529
|
+
# List of columns for outlier processing.
|
|
530
|
+
# Excluding target column and excluded columns from outlier processing
|
|
531
|
+
outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns +
|
|
532
|
+
[self.id_column, self.target_column]]
|
|
533
|
+
|
|
534
|
+
if len(outlier_columns) != 0:
|
|
535
|
+
# Detecting outlier percentage in each columns
|
|
536
|
+
outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
|
|
537
|
+
|
|
538
|
+
# Outlier Handling techniques
|
|
539
|
+
for i in outlier_percentage_df.itertuples():
|
|
540
|
+
# Column Name
|
|
541
|
+
col = i[0]
|
|
542
|
+
# Outlier value
|
|
543
|
+
value = i[1]
|
|
544
|
+
if self.cluster:
|
|
545
|
+
if value > 0.0:
|
|
546
|
+
columns_to_impute.append(col)
|
|
547
|
+
else:
|
|
548
|
+
# Dropping rows
|
|
549
|
+
if value > 0.0 and value <= 8.0 :
|
|
550
|
+
columns_to_drop_rows.append(col)
|
|
551
|
+
elif value> 8.0 and value <= 25.0:
|
|
552
|
+
columns_to_impute.append(col)
|
|
553
|
+
|
|
554
|
+
return columns_to_drop_rows, columns_to_impute
|
|
555
|
+
|
|
556
|
+
def _outlier_handling(self,
                      target_columns,
                      outlier_method,
                      replacement_value):
    """
    DESCRIPTION:
        Function to handle outlier for target column based on outlier method
        and replacement value.

    PARAMETERS:
        target_columns:
            Required Argument.
            Specifies the target columns required for outlier handling.
            Types: str or list of strings (str)

        outlier_method:
            Required Argument.
            Specifies the outlier method required for outlier handling.
            Types: str

        replacement_value:
            Optional Argument.
            Specifies the value required in case of outlier replacement.
            Types: str, float

    RETURNS:
        OutlierFilterFit object.

    RAISES:
        None

    EXAMPLES:
        >>> outlier_result = self._outlier_handling(target_columns=["feature1"], outlier_method="Tukey", replacement_value="mean")
    """
    # Resolve volatile/persist settings for the outlier filter functions.
    volatile, persist = self._get_generic_parameters(func_indicator='OutlierFilterIndicator',
                                                     param_name='OutlierFilterParam')

    # Fit phase: learn the outlier boundaries on the current dataset.
    fit_output = OutlierFilterFit(data=self.data,
                                  target_columns=target_columns,
                                  outlier_method=outlier_method,
                                  replacement_value=replacement_value,
                                  volatile=volatile,
                                  persist=persist)

    # Transform phase: persist=True by default to avoid parser memory errors.
    transform_args = {"data": self.data,
                      "object": fit_output.result,
                      "persist": True}

    # Suppress the table-name print when the table is garbage collected below.
    if not volatile and not persist:
        transform_args["display_table_name"] = False

    # Volatile storage overrides the default persist behaviour.
    if volatile:
        transform_args["volatile"] = True
        transform_args["persist"] = False
    self.data = OutlierFilterTransform(**transform_args).result

    if not volatile and not persist:
        # Interim table is cleaned up at the end of the session.
        GarbageCollector._add_to_garbagecollector(self.data._table_name)

    # Caller stores the fit object in the data mapping dictionary.
    return fit_output
|
|
626
|
+
|
|
627
|
+
def _outlier_processing(self):
    """
    DESCRIPTION:
        Function performs outlier processing on dataset. It identifies outlier
        columns and handles them by either deleting the offending rows or
        imputing the median value, based on each column's outlier percentage.

    PARAMETERS:
        None

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._outlier_processing()
    """
    self._display_msg(msg="Outlier preprocessing ...",
                      progress_bar=self.progress_bar,
                      show_data=True)
    start_time = time.time()

    # Per-column decision: which columns drop rows and which impute values.
    columns_to_drop_rows, columns_to_impute = self._outlier_handling_techniques()
    # Keeping default method for outlier handling "Tukey".
    outlier_handling_method = "Tukey"

    # Columns with a small outlier percentage: delete the outlier rows.
    if columns_to_drop_rows:
        self._display_msg(msg="Deleting rows of these columns:",
                          col_lst=columns_to_drop_rows,
                          progress_bar=self.progress_bar)
        fit_obj = self._outlier_handling(columns_to_drop_rows,
                                         outlier_handling_method,
                                         "DELETE")
        self.data_mapping['fit_outlier_delete_output'] = fit_obj.output_data._table_name
        # NOTE(review): this branch records self.data as the fit result while
        # the impute branch records fit_obj.result — confirm the asymmetry is
        # intentional.
        self.data_mapping['fit_outlier_delete_result'] = self.data._table_name
        self.data_mapping['outlier_filtered_data'] = self.data._table_name
        self._display_msg(msg="Sample of dataset after removing outlier rows:",
                          data=self.data,
                          progress_bar=self.progress_bar)

    # Columns with a moderate outlier percentage: impute the median value.
    if columns_to_impute:
        self._display_msg(msg="median inplace of outliers:",
                          col_lst=columns_to_impute,
                          progress_bar=self.progress_bar)
        fit_obj = self._outlier_handling(columns_to_impute,
                                         outlier_handling_method,
                                         "MEDIAN")
        self.data_mapping['fit_outlier_impute_output'] = fit_obj.output_data._table_name
        self.data_mapping['fit_outlier_impute_result'] = fit_obj.result._table_name
        self.data_mapping['outlier_imputed_data'] = self.data._table_name
        self._display_msg(msg="Sample of dataset after performing MEDIAN inplace:",
                          data=self.data,
                          progress_bar=self.progress_bar)

    if not columns_to_drop_rows and not columns_to_impute:
        # Message typo fixed: "not outlier" -> "no outliers".
        self._display_msg(msg='Analysis indicates no outliers in the dataset. No Action Taken.',
                          progress_bar=self.progress_bar)

    end_time = time.time()
    self._display_msg("Time Taken by Outlier processing: {:.2f} sec ".format(end_time - start_time),
                      progress_bar=self.progress_bar,
                      show_data=True)
|
|
692
|
+
|
|
693
|
+
def _custom_outlier_processing(self):
    """
    DESCRIPTION:
        Function to perform outlier processing on dataset based on user input.
        Falls back to the default outlier processing when no customized input
        is provided.

    PARAMETERS:
        None

    RETURNS:
        None

    RAISES:
        TeradataMlException, ValueError

    EXAMPLES:
        >>> self._custom_outlier_processing()
    """
    self._display_msg(msg="Starting customized outlier processing ...",
                      progress_bar=self.progress_bar,
                      show_data=True)
    outlier_filter_input = self.custom_data.get("OutlierFilterIndicator", False)
    # Checking user input for outlier filtering.
    if outlier_filter_input:
        # List of columns for outlier processing.
        target_columns = [col for col in self.data.columns if col not in self.excluded_columns]
        # Checking user input for outlier detection method.
        outlier_method = self.custom_data.get("OutlierFilterMethod", None)
        if outlier_method == 'PERCENTILE':
            lower_percentile = self.custom_data.get("OutlierLowerPercentile", None)
            upper_percentile = self.custom_data.get("OutlierUpperPercentile", None)
            if lower_percentile and upper_percentile:
                # Detecting outlier percentage for each column with explicit bounds.
                outlier_df = self._outlier_detection(outlier_method=outlier_method, column_list=target_columns, \
                                                     lower_percentile=lower_percentile, upper_percentile=upper_percentile)
            else:
                # Detecting outlier percentage for each column in case of other than percentile method.
                outlier_df = self._outlier_detection(outlier_method=outlier_method, column_list=target_columns)

        # Proceed only if outlier-containing columns exist.
        if outlier_df.shape[0]:
            # Bug fix: "OutlierFilterParam" may be absent, in which case the
            # previous unconditional .copy() raised AttributeError on None.
            outlier_transform_list = self.custom_data.get("OutlierFilterParam", None)
            if outlier_transform_list:
                # Copy so that pop() does not mutate the user's input dict.
                outlier_transform_list = outlier_transform_list.copy()
                # NOTE(review): popped volatile/persist are not used locally —
                # presumably consumed via _get_generic_parameters; confirm.
                volatile = outlier_transform_list.pop("volatile", False)
                persist = outlier_transform_list.pop("persist", False)
                # Validate that every referenced column exists in the data.
                _Validators._validate_dataframe_has_argument_columns(list(outlier_transform_list.keys()), "OutlierFilterParam",
                                                                     self.data, "outlier_data")

                for target_col, transform_val in outlier_transform_list.items():
                    # Fetching replacement value.
                    replacement_value = transform_val["replacement_value"]
                    # Performing outlier handling.
                    fit_obj = self._outlier_handling(target_col, outlier_method, replacement_value)
                    self.data_mapping[f'fit_{target_col}_outlier_output'] = fit_obj.output_data._table_name
                    self.data_mapping[f'fit_{target_col}_outlier_result'] = fit_obj.result._table_name
                    self.data_mapping[f'{target_col}_outlier_treated_data'] = self.data._table_name
                self._display_msg(msg="Sample of dataset after performing custom outlier filtering",
                                  data=self.data, progress_bar=self.progress_bar)
            else:
                self._display_msg(inline_msg="No information provided for feature transformation in outlier handling.",
                                  progress_bar=self.progress_bar)
        else:
            # Message typo fixed: "oultiers" -> "outliers".
            self._display_msg(inline_msg="No outliers found in dataset after applying the selected method.",
                              progress_bar=self.progress_bar)
    else:
        self._display_msg(inline_msg="No information provided for customized outlier processing. AutoML will proceed with default settings.",
                          progress_bar=self.progress_bar)
        # Performing default handling for outliers.
        if not self.cluster:
            self._outlier_processing()
|
|
764
|
+
|
|
765
|
+
# function for getting value of "K" in k folds cross validation
|
|
766
|
+
def _num_of_folds(self, rows=None):
|
|
767
|
+
"""
|
|
768
|
+
DESCRIPTION:
|
|
769
|
+
Function to determine the number of folds for cross-validation
|
|
770
|
+
based on the number of rows in the dataset.
|
|
771
|
+
PARAMETERS:
|
|
772
|
+
rows:
|
|
773
|
+
Required Argument.
|
|
774
|
+
Specifies the number of rows in the dataset.
|
|
775
|
+
Types: int
|
|
776
|
+
RETURNS:
|
|
777
|
+
int, number of folds to be used for cross-validation.
|
|
778
|
+
|
|
779
|
+
RAISES:
|
|
780
|
+
None
|
|
781
|
+
|
|
782
|
+
EXAMPLES:
|
|
783
|
+
>>> folds = self._num_of_folds(rows=5000)
|
|
784
|
+
"""
|
|
785
|
+
num_of_folds = lambda rows: 2 if rows > 20000 else (4 if 1000 < rows <= 20000 else 10)
|
|
786
|
+
return num_of_folds(rows)
|
|
787
|
+
|
|
788
|
+
def _feature_selection_PCA(self):
    """
    DESCRIPTION:
        Function performs Principal Component Analysis (PCA) for feature selection.
        It reduces the dimensionality of the dataset by identifying and retaining the most informative features.
        The transformed components (plus id/target columns) are written back
        to the database under the 'pca' prefix via copy_dataframe_to_sql().

    PARAMETERS:
        None

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._feature_selection_PCA()
    """
    self._display_msg(msg="Dimension Reduction using pca ...",
                      progress_bar=self.progress_bar,
                      show_data=True)
    # Required imports for PCA (local import keeps sklearn optional
    # until this path is actually taken).
    from sklearn.decomposition import PCA

    start_time = time.time()

    # Temporarily pulling the data client-side, since sklearn's PCA
    # operates on an in-memory pandas DataFrame.
    pca_train = DataFrame.from_table(self.data_mapping['pca_train']).to_pandas()
    # Drop non-feature columns; clustering has no target column to drop.
    if not self.cluster:
        train_data = pca_train.drop(columns=[self.id_column, self.target_column], axis=1)
    else:
        train_data = pca_train.drop(columns=[self.id_column], axis=1)

    # First pass: fit a full PCA only to inspect the explained variance.
    pca = PCA(random_state=self.seed)
    pca.fit(train_data)

    # Smallest number of components whose cumulative explained variance
    # reaches 95%. NOTE(review): if no prefix reaches 0.95, np.argmax of
    # an all-False array is 0, so n falls back to 1 component — confirm
    # this is the intended behavior.
    variance = pca.explained_variance_ratio_
    n = np.argmax(np.cumsum(variance) >= 0.95) + 1

    # Second pass: re-fit with the selected number of components.
    pca = PCA(n_components=n, random_state=self.seed)

    # Apply PCA on dataset
    X_train_pca = pca.fit_transform(train_data)

    # Storing the fitted PCA instance and its input columns so the same
    # projection can be replayed on prediction data later.
    self.data_transform_dict["pca_fit_instance"] = pca
    self.data_transform_dict["pca_fit_columns"] = train_data.columns.tolist()

    # Converting the numpy array of components into a DataFrame.
    train_df = pd.DataFrame(X_train_pca)

    # Generic names ('col_0', 'col_1', ...) for the combined components.
    column_name = {col: 'col_'+str(i) for i,col in enumerate(train_df.columns)}

    # Storing the new column names in the data transformation dictionary.
    self.data_transform_dict['pca_new_column'] = column_name

    # Renaming the component columns.
    train_df = train_df.rename(columns=column_name)

    # Re-attach the id column positionally; PCA preserves row order, so
    # reset_index on both sides keeps rows aligned.
    train_df = pd.concat([pca_train.reset_index(drop=True)[self.id_column],
                          train_df.reset_index(drop=True)], axis=1)

    # Merge the target back (supervised case only) and record the final
    # feature list (components minus id/target).
    if not self.cluster:
        train_df[self.target_column] = pca_train[self.target_column].reset_index(drop=True)
        self.pca_feature = train_df.drop(columns=[self.id_column, self.target_column],
                                         axis=1).columns.tolist()
    else:
        self.pca_feature = train_df.drop(columns=[self.id_column],
                                         axis=1).columns.tolist()

    self._display_msg(msg="PCA columns:",
                      col_lst=self.pca_feature,
                      progress_bar=self.progress_bar)
    end_time = time.time()
    self._display_msg(msg="Total time taken by PCA: {:.2f} sec ".format( end_time - start_time),
                      progress_bar=self.progress_bar,
                      show_data=True)

    # Pushing the transformed data back into the database.
    self.copy_dataframe_to_sql(train_df, 'pca', self.persist)
|
|
875
|
+
|
|
876
|
+
def _feature_selection_RFE(self,
                           data=None):
    """
    DESCRIPTION:
        Function performs Recursive Feature Elimination (RFE) for feature selection.
        It identifies a subset of the most relevant features in the dataset using
        a random-forest estimator with cross-validation, renames the surviving
        feature columns with an 'r_' prefix, and writes the result back to the
        database under the 'rfe' prefix.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input train pandas DataFrame.
            Types: pandas Dataframe

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._feature_selection_RFE(data=training_data)
    """
    self._display_msg(msg="Feature selection using rfe ...",
                      progress_bar=self.progress_bar,
                      show_data=True)

    # Required imports for RFE
    from sklearn.feature_selection import RFECV
    from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
    from sklearn.model_selection import StratifiedKFold

    start_time = time.time()
    # Task type decides estimator, scoring and CV strategy below.
    is_classification = self.is_classification_type()
    # Getting the value of k in k-fold cross-validation
    folds = self._num_of_folds(data.shape[0])

    # Suppressing warnings generated by pandas and sklearn
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')

        # Random forest estimator matching the task type.
        RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
        rf = RFModel(n_estimators=100, random_state=self.seed)

        # Scoring metric: r2 for regression, roc_auc for binary
        # classification, f1_macro for multi-class.
        score = 'r2' if not self.is_classification_type() \
            else 'roc_auc' if self.data.drop_duplicate(self.target_column).size == 2 else 'f1_macro'

        # StratifiedKFold (without shuffling) for classification;
        # a plain integer fold count otherwise.
        cv = folds if not self.is_classification_type() \
            else StratifiedKFold(n_splits=folds, shuffle=False)

        # Define the RFE with cross-validation
        rfecv = RFECV(rf, cv=cv, scoring=score)

        # Prepare data: features only, and the target series.
        train_data = data.drop(columns=[self.id_column, self.target_column], axis=1)
        train_target = data[self.target_column]

        # Fit the RFE using cv
        rfecv.fit(train_data, train_target)

        # Columns kept by RFECV (support_ mask over feature columns).
        features = train_data.columns[rfecv.support_].tolist()

        self._display_msg(msg="feature selected by RFE:",
                          col_lst=features,
                          progress_bar=self.progress_bar)
        # Re-attach target (at the end) and id (at the front).
        features.append(self.target_column)
        features.insert(0,self.id_column)

        selected_rfe_df = data[features]

        # Storing the rfe selected features in data transformation dictionary.
        self.data_transform_dict['rfe_features'] = features

        # All selected feature columns get an 'r_' prefix (id/target excluded).
        columns_to_rename = [col for col in selected_rfe_df.columns if col not in
                             [self.id_column, self.target_column]]
        new_column = {col: f'r_{col}' for col in columns_to_rename}
        # Keep excluded columns excluded under their new 'r_' names as well.
        self.excluded_columns.extend([new_column[key] for key in self.excluded_columns if key in new_column])

        # NOTE(review): selected_rfe_df is a slice of `data`; in-place
        # rename on it raises SettingWithCopyWarning in pandas (silenced
        # by the catch_warnings block above) — consider .copy().
        selected_rfe_df.rename(columns=new_column, inplace=True)

        # Storing the pre-rename column list in data transformation dictionary.
        self.data_transform_dict['rfe_rename_column'] = columns_to_rename

        end_time = time.time()
        self._display_msg(msg="Total time taken by feature selection: {:.2f} sec ".format( end_time - start_time),
                          progress_bar=self.progress_bar,
                          show_data=True)
        # Final (renamed) feature list without id/target.
        self.rfe_feature = selected_rfe_df.drop(columns=[self.id_column,self.target_column],
                                                axis=1).columns.tolist()

        # Pushing data into database
        self.copy_dataframe_to_sql(selected_rfe_df, 'rfe', self.persist)
|
|
972
|
+
|
|
973
|
+
def _feature_selection_Lasso(self,
                             data=None):
    """
    DESCRIPTION:
        Function performs Lasso Regression for feature selection.
        It helps in identifying and retaining the most important features while
        setting less important ones to zero, then writes the reduced dataset
        back to the database under the 'lasso' prefix.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input train pandas DataFrame.
            Types: pandas Dataframe

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._feature_selection_Lasso(data=training_data)
    """
    start_time = time.time()
    self._display_msg(msg="Feature selection using lasso ...",
                      progress_bar=self.progress_bar,
                      show_data=True)

    # Required imports for Lasso
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import Lasso
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold

    # Getting the value k in k-fold cross-validation.
    # NOTE(review): for classification this value is unused — the CV
    # below hardcodes n_splits=5; confirm whether that is intentional.
    num_folds = self._num_of_folds(data.shape[0])

    # Prepare data: features only, and the target series.
    train_features = data.drop(columns=[self.id_column,self.target_column], axis=1)
    train_target = data[self.target_column]

    # Suppressing warnings generated by pandas and sklearn
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')

        # Determine the estimator and parameters based on the type of problem.
        if self.is_classification_type():
            # roc_auc only applies to binary targets; fall back to
            # f1_macro for multi-class.
            if self.data.drop_duplicate(self.target_column).size == 2:
                scoring_metric = 'roc_auc'
            else:
                scoring_metric = 'f1_macro'
            # NOTE(review): penalty='l2' does not drive coefficients to
            # exactly zero the way L1 does, so the ">0" filter below may
            # keep nearly all features for classification — verify.
            estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=self.seed)
            parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
        else:
            estimator = Lasso(random_state=self.seed)
            parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
            scoring_metric = "r2"

        # Stratified folds for classification; plain k for regression.
        if self.is_classification_type():
            cv = StratifiedKFold(n_splits=5, shuffle=False)
        else:
            cv = num_folds

        # Applying hyperparameter tuning and optimizing score.
        hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
                                             scoring=scoring_metric, verbose=0)

        # Fitting the best result from hyperparameter search (refit=True
        # retrains the best estimator on the full training data).
        hyperparameter_search.fit(train_features, train_target)

        # Absolute coefficients of the best estimator serve as importances.
        feature_importance = np.abs(hyperparameter_search.best_estimator_.coef_)

        # Keep features whose importance > 0. For classification coef_ is
        # 2-D (one row per class), so a feature is kept when any class
        # gives it non-zero weight.
        if self.is_classification_type():
            selected_feature_indices = np.where(np.any(feature_importance > 0, axis=0))[0]
            selected_features = np.array(train_features.columns)[selected_feature_indices]
            # NOTE(review): list(set(...)) makes the feature order
            # nondeterministic across runs — confirm callers don't rely
            # on column order.
            important_features = list(set(selected_features))
        else:
            important_features = np.array(train_features.columns)[feature_importance>0].tolist()

        self._display_msg(msg="feature selected by lasso:",
                          col_lst=important_features,
                          progress_bar=self.progress_bar)

        # Re-attach id (front) and target (end) columns.
        important_features = [self.id_column] + important_features + [self.target_column]
        selected_lasso_df = data[important_features]

        # Storing the lasso selected features in data transformation dictionary.
        self.data_transform_dict['lasso_features'] = important_features

        # Calculate the elapsed time
        end_time = time.time()
        self._display_msg(msg="Total time taken by feature selection: {:.2f} sec ".format( end_time - start_time),
                          progress_bar=self.progress_bar,
                          show_data=True)
        # Final feature list without id/target.
        self.lasso_feature = selected_lasso_df.drop(columns=[self.id_column,self.target_column],
                                                    axis=1).columns.tolist()

        self.copy_dataframe_to_sql(selected_lasso_df, 'lasso', self.persist)
|
|
1072
|
+
|
|
1073
|
+
def copy_dataframe_to_sql(self,
                          data,
                          prefix,
                          persist):
    """
    DESCRIPTION:
        Function to copy dataframe to SQL with generated table name.
        The generated name is recorded in "self.data_mapping" under the
        key '<prefix>_train' so later pipeline stages can locate it.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame.
            Types: pandas Dataframe

        prefix:
            Required Argument.
            Specifies the prefix for the table name.
            Types: str

        persist:
            Required Argument.
            Specifies whether to persist the results of the
            function in a table or not. When set to True,
            results are persisted in a table; otherwise,
            results are garbage collected at the end of the
            session.
            Types: bool

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self.copy_dataframe_to_sql(data=selected_df, prefix="lasso", persist=True)
    """
    # Generate a '<prefix>_train_...' table name; non-persisted tables
    # are registered for garbage collection on session exit.
    table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
                                                     table_type=TeradataConstants.TERADATA_TABLE,
                                                     gc_on_quit=not persist)
    # When configure.temp_object_type="VT", _generate_temp_table_name()
    # returns a fully qualified name — strip it down to the bare table name.
    table_name = UtilFuncs._extract_table_name(table_name)

    # Record the table name so later stages can find this dataset.
    self.data_mapping['{}_train'.format(prefix)] = table_name

    # Under the VT option the table was being persisted, so create it as
    # a volatile (temporary) table instead.
    use_volatile = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE

    # Push the data into the database. Classification additionally forces
    # the target column to INTEGER; regression keeps inferred types.
    write_args = {"df": data,
                  "table_name": table_name,
                  "temporary": use_volatile,
                  "if_exists": "replace"}
    if self.is_classification_type():
        write_args["types"] = {f'{self.target_column}': INTEGER}
    copy_to_sql(**write_args)
|
|
1128
|
+
|
|
1129
|
+
def _scaling_features_helper(self,
|
|
1130
|
+
data=None,
|
|
1131
|
+
feature_selection_mtd=None):
|
|
1132
|
+
"""
|
|
1133
|
+
DESCRIPTION:
|
|
1134
|
+
This function selects the features on which feature scaling should be applied.
|
|
1135
|
+
|
|
1136
|
+
PARAMETERS:
|
|
1137
|
+
data:
|
|
1138
|
+
Required Argument.
|
|
1139
|
+
Specifies the data on which feature scaling will be applied.
|
|
1140
|
+
Types: teradataml Dataframe
|
|
1141
|
+
|
|
1142
|
+
feature_selection_mtd:
|
|
1143
|
+
Required Argument.
|
|
1144
|
+
Specifies the feature selection algorithm used.
|
|
1145
|
+
Types: str
|
|
1146
|
+
|
|
1147
|
+
RETURNS:
|
|
1148
|
+
scl_col:
|
|
1149
|
+
list containing, the scaled columns.
|
|
1150
|
+
|
|
1151
|
+
RAISES:
|
|
1152
|
+
None
|
|
1153
|
+
|
|
1154
|
+
EXAMPLES:
|
|
1155
|
+
>>> scaled_cols = self._scaling_features_helper(data=training_data, feature_selection_mtd="lasso")
|
|
1156
|
+
"""
|
|
1157
|
+
columns_to_scale = []
|
|
1158
|
+
|
|
1159
|
+
# Iterating over the columns
|
|
1160
|
+
for col in data.columns:
|
|
1161
|
+
# Selecting columns that will be scaled
|
|
1162
|
+
# Exculding target_col and columns with single value
|
|
1163
|
+
if col not in [self.id_column, self.target_column] and \
|
|
1164
|
+
data.drop_duplicate(col).size > 1:
|
|
1165
|
+
columns_to_scale.append(col)
|
|
1166
|
+
|
|
1167
|
+
if feature_selection_mtd == "lasso":
|
|
1168
|
+
self.lasso_feature = columns_to_scale
|
|
1169
|
+
elif feature_selection_mtd == "rfe":
|
|
1170
|
+
self.rfe_feature = columns_to_scale
|
|
1171
|
+
elif feature_selection_mtd == "pca":
|
|
1172
|
+
self.pca_feature = columns_to_scale
|
|
1173
|
+
elif feature_selection_mtd == "raw_scaled":
|
|
1174
|
+
self.raw_scaled_feature = columns_to_scale
|
|
1175
|
+
else:
|
|
1176
|
+
self.non_pca_feature = columns_to_scale
|
|
1177
|
+
|
|
1178
|
+
columns_to_scale = [col for col in columns_to_scale if col not in self.excluded_columns]
|
|
1179
|
+
return columns_to_scale
|
|
1180
|
+
|
|
1181
|
+
def _scaling_features(self,
                      feature_selection_mtd=None):
    """
    DESCRIPTION:
        Function performs feature scaling on columns present inside the dataset
        using scaling methods [RANGE/ABS/STD/USTD/MEAN/MIDRANGE/RESCALE].
        The scaled dataset is written back to the database under the
        '<feature_selection_mtd>_train' key of "self.data_mapping".

    PARAMETERS:
        feature_selection_mtd:
            Required Argument.
            Specifies the feature selection algorithm used.
            Types: str

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._scaling_features(feature_selection_mtd="lasso")
    """

    feature_selection_mtd = feature_selection_mtd.lower()
    self._display_msg(msg="Scaling Features of {} data ...".format(feature_selection_mtd),
                      progress_bar=self.progress_bar,
                      show_data=True)

    start_time = time.time()
    data_to_scale = None

    # Pick the scaling method configured for the task type
    # (classification / regression / clustering).
    if not self.cluster:
        if self.is_classification_type():
            scale_method = self._scale_method_cls
        else:
            scale_method = self._scale_method_reg
    else:
        scale_method = self._scale_method_clust

    # Loading data for feature scaling based on the feature selection
    # method; anything unrecognized falls back to the raw dataset.
    if feature_selection_mtd == 'rfe':
        data_to_scale = DataFrame(self.data_mapping['rfe_train'])
    elif feature_selection_mtd == 'lasso':
        data_to_scale = DataFrame(self.data_mapping['lasso_train'])
    elif feature_selection_mtd == 'raw_scaled':
        data_to_scale = DataFrame(self.data_mapping['raw_scaled_train'])
    else:
        data_to_scale = self.data

    # Setting volatile and persist parameters for the ScaleFit and
    # ScaleTransform functions from the user's custom configuration.
    volatile, persist = self._get_generic_parameters(func_indicator='FeatureScalingIndicator',
                                                     param_name='FeatureScalingParam')

    # List of columns that will be scaled (also records them on the
    # matching "<mtd>_feature" instance attribute).
    scale_col= self._scaling_features_helper(data_to_scale, feature_selection_mtd)

    if len(scale_col) != 0:
        self._display_msg(msg="columns that will be scaled: ",
                          col_lst=scale_col,
                          progress_bar=self.progress_bar)

        # Scale Fit
        fit_obj = ScaleFit(data=data_to_scale,
                           target_columns=scale_col,
                           scale_method=scale_method,
                           volatile=volatile,
                           persist=persist)

        # Record the fit tables so they can be located later.
        self.data_mapping[f'fit_scale_{feature_selection_mtd}_output'] = fit_obj.output_data._table_name
        self.data_mapping[f'fit_scale_{feature_selection_mtd}_result'] = fit_obj.output._table_name

        # Storing the scale fit object and columns in the data
        # transformation dictionary (for replay on prediction data).
        self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj.output
        self.data_transform_dict['{}_scale_col'.format(feature_selection_mtd)] = scale_col

        # Columns copied through unchanged to the scale-transform output.
        accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))

        # Scaling dataset
        transform_obj = ScaleTransform(data=data_to_scale,
                                       object=fit_obj,
                                       accumulate=accumulate_cols)
        scaled_df = transform_obj.result

        self._display_msg(msg="Dataset sample after scaling:",
                          data=scaled_df,
                          progress_bar=self.progress_bar)
    else:
        # No columns to scale; the original data is passed through.
        scaled_df = data_to_scale
        self._display_msg(msg="No columns to scale.",
                          progress_bar=self.progress_bar)

    self.copy_dataframe_to_sql(scaled_df, feature_selection_mtd, persist)

    # For clustering, point the train-table mapping directly at the
    # scaled result's underlying table.
    if self.cluster and feature_selection_mtd == "non_pca":
        self.data_mapping["non_pca_train"] = scaled_df._table_name
    elif self.cluster and feature_selection_mtd == "raw_scaled":
        self.data_mapping["raw_scaled_train"] = scaled_df._table_name

    end_time = time.time()
    self._display_msg(msg="Total time taken by feature scaling: {:.2f} sec".format( end_time - start_time),
                      progress_bar=self.progress_bar,
                      show_data=True)
|
|
1285
|
+
|
|
1286
|
+
def _set_custom_scaling_method(self):
|
|
1287
|
+
"""
|
|
1288
|
+
DESCRIPTION:
|
|
1289
|
+
Function to perform feature scaling based on user input.
|
|
1290
|
+
|
|
1291
|
+
PARAMETERS:
|
|
1292
|
+
None
|
|
1293
|
+
|
|
1294
|
+
RETURNS:
|
|
1295
|
+
None
|
|
1296
|
+
|
|
1297
|
+
RAISES:
|
|
1298
|
+
None
|
|
1299
|
+
|
|
1300
|
+
EXAMPLES:
|
|
1301
|
+
>>> self._set_custom_scaling_method()
|
|
1302
|
+
"""
|
|
1303
|
+
# Fetching user input for performing customized scaling
|
|
1304
|
+
feature_scaling_input = self.custom_data.get("FeatureScalingIndicator", False)
|
|
1305
|
+
# Checking user input for feature scaling
|
|
1306
|
+
if feature_scaling_input:
|
|
1307
|
+
# Extracting scaling method
|
|
1308
|
+
custom_scaling_params = self.custom_data.get("FeatureScalingParam", None)
|
|
1309
|
+
if custom_scaling_params:
|
|
1310
|
+
custom_scaling_method = custom_scaling_params.get("FeatureScalingMethod", None)
|
|
1311
|
+
if custom_scaling_method is None:
|
|
1312
|
+
self._display_msg(inline_msg="No information provided for customized scaling method. AutoML will continue with default option.",
|
|
1313
|
+
progress_bar=self.progress_bar)
|
|
1314
|
+
else:
|
|
1315
|
+
if self.cluster:
|
|
1316
|
+
self._scale_method_cluster = custom_scaling_method
|
|
1317
|
+
elif self.is_classification_type():
|
|
1318
|
+
self._scale_method_cls = custom_scaling_method
|
|
1319
|
+
else:
|
|
1320
|
+
self._scale_method_reg = custom_scaling_method
|
|
1321
|
+
else:
|
|
1322
|
+
self._display_msg(inline_msg="No information provided for performing customized feature scaling. Proceeding with default option.",
|
|
1323
|
+
progress_bar=self.progress_bar)
|
|
1324
|
+
|
|
1325
|
+
|
|
1326
|
+
def _handle_generated_features(self):
    """
    DESCRIPTION:
        Function to handle newly generated float features. It will round them upto 4 digit after decimal point.

    PARAMETERS:
        None

    RETURNS:
        Pandas DataFrame containing, rounded up float columns.

    RAISES:
        None

    EXAMPLES:
        >>> processed_data = self._handle_generated_features()
    """
    # Assigning data to target dataframe
    target_df = self.data
    # Detecting list of float columns on target dataset
    float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float", "decimal.Decimal"]]

    # Nothing to round: return the data as a pandas DataFrame unchanged.
    if len(float_columns) == 0:
        cols = target_df.columns
        # Doing reset index to get index column
        df = target_df.to_pandas().reset_index()

        # Returning the dataframe with cols
        # to avoid extra columns generated by reset_index()
        return df[cols]
    # Storing the column details for round up in the data transformation
    # dictionary, so the same rounding can be replayed on new data.
    self.data_transform_dict["round_columns"] = float_columns
    # Extracting accumulate columns (everything not being rounded is
    # copied through unchanged).
    accumulate_columns = self._extract_list(target_df.columns,float_columns)
    # Performing rounding up on target column upto 4 precision digit
    fit_params = {
        "data" : target_df,
        "target_columns" : float_columns,
        "precision_digit" : 4,
        "accumulate" : accumulate_columns,
        "persist" : True}

    # Persist is True by default here, so suppress the table-name print
    # unless the user explicitly asked for persist/volatile.
    if not self.volatile and not self.persist:
        fit_params["display_table_name"] = False

    # Volatile takes precedence over persist: volatile tables cannot be
    # persisted.
    if self.volatile:
        fit_params["volatile"] = True
        fit_params["persist"] = False

    transform_output = RoundColumns(**fit_params).result
    self.data_mapping['round_columns_data'] = transform_output._table_name
    if not self.volatile and not self.persist:
        # The table was created with persist=True above only to keep it
        # alive for this step; hand it to the garbage collector so it is
        # dropped at session end.
        GarbageCollector._add_to_garbagecollector(transform_output._table_name)
    # Pull the rounded result client-side; reset_index + column
    # re-selection drops the extra index column added by reset_index().
    cols = transform_output.columns
    df = transform_output.to_pandas().reset_index()
    df = df[cols]
    return df