teradataml-20.0.0.8-py3-none-any.whl
This diff represents the contents of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +2762 -0
- teradataml/__init__.py +78 -0
- teradataml/_version.py +11 -0
- teradataml/analytics/Transformations.py +2996 -0
- teradataml/analytics/__init__.py +82 -0
- teradataml/analytics/analytic_function_executor.py +2416 -0
- teradataml/analytics/analytic_query_generator.py +1050 -0
- teradataml/analytics/byom/H2OPredict.py +514 -0
- teradataml/analytics/byom/PMMLPredict.py +437 -0
- teradataml/analytics/byom/__init__.py +16 -0
- teradataml/analytics/json_parser/__init__.py +133 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +1805 -0
- teradataml/analytics/json_parser/json_store.py +191 -0
- teradataml/analytics/json_parser/metadata.py +1666 -0
- teradataml/analytics/json_parser/utils.py +805 -0
- teradataml/analytics/meta_class.py +236 -0
- teradataml/analytics/sqle/DecisionTreePredict.py +456 -0
- teradataml/analytics/sqle/NaiveBayesPredict.py +420 -0
- teradataml/analytics/sqle/__init__.py +128 -0
- teradataml/analytics/sqle/json/decisiontreepredict_sqle.json +78 -0
- teradataml/analytics/sqle/json/naivebayespredict_sqle.json +62 -0
- teradataml/analytics/table_operator/__init__.py +11 -0
- teradataml/analytics/uaf/__init__.py +82 -0
- teradataml/analytics/utils.py +828 -0
- teradataml/analytics/valib.py +1617 -0
- teradataml/automl/__init__.py +5835 -0
- teradataml/automl/autodataprep/__init__.py +493 -0
- teradataml/automl/custom_json_utils.py +1625 -0
- teradataml/automl/data_preparation.py +1384 -0
- teradataml/automl/data_transformation.py +1254 -0
- teradataml/automl/feature_engineering.py +2273 -0
- teradataml/automl/feature_exploration.py +1873 -0
- teradataml/automl/model_evaluation.py +488 -0
- teradataml/automl/model_training.py +1407 -0
- teradataml/catalog/__init__.py +2 -0
- teradataml/catalog/byom.py +1759 -0
- teradataml/catalog/function_argument_mapper.py +859 -0
- teradataml/catalog/model_cataloging_utils.py +491 -0
- teradataml/clients/__init__.py +0 -0
- teradataml/clients/auth_client.py +137 -0
- teradataml/clients/keycloak_client.py +165 -0
- teradataml/clients/pkce_client.py +481 -0
- teradataml/common/__init__.py +1 -0
- teradataml/common/aed_utils.py +2078 -0
- teradataml/common/bulk_exposed_utils.py +113 -0
- teradataml/common/constants.py +1669 -0
- teradataml/common/deprecations.py +166 -0
- teradataml/common/exceptions.py +147 -0
- teradataml/common/formula.py +743 -0
- teradataml/common/garbagecollector.py +666 -0
- teradataml/common/logger.py +1261 -0
- teradataml/common/messagecodes.py +518 -0
- teradataml/common/messages.py +262 -0
- teradataml/common/pylogger.py +67 -0
- teradataml/common/sqlbundle.py +764 -0
- teradataml/common/td_coltype_code_to_tdtype.py +48 -0
- teradataml/common/utils.py +3166 -0
- teradataml/common/warnings.py +36 -0
- teradataml/common/wrapper_utils.py +625 -0
- teradataml/config/__init__.py +0 -0
- teradataml/config/dummy_file1.cfg +5 -0
- teradataml/config/dummy_file2.cfg +3 -0
- teradataml/config/sqlengine_alias_definitions_v1.0 +14 -0
- teradataml/config/sqlengine_alias_definitions_v1.1 +20 -0
- teradataml/config/sqlengine_alias_definitions_v1.3 +19 -0
- teradataml/context/__init__.py +0 -0
- teradataml/context/aed_context.py +223 -0
- teradataml/context/context.py +1462 -0
- teradataml/data/A_loan.csv +19 -0
- teradataml/data/BINARY_REALS_LEFT.csv +11 -0
- teradataml/data/BINARY_REALS_RIGHT.csv +11 -0
- teradataml/data/B_loan.csv +49 -0
- teradataml/data/BuoyData2.csv +17 -0
- teradataml/data/CONVOLVE2_COMPLEX_LEFT.csv +5 -0
- teradataml/data/CONVOLVE2_COMPLEX_RIGHT.csv +5 -0
- teradataml/data/Convolve2RealsLeft.csv +5 -0
- teradataml/data/Convolve2RealsRight.csv +5 -0
- teradataml/data/Convolve2ValidLeft.csv +11 -0
- teradataml/data/Convolve2ValidRight.csv +11 -0
- teradataml/data/DFFTConv_Real_8_8.csv +65 -0
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/Mall_customer_data.csv +201 -0
- teradataml/data/Orders1_12mf.csv +25 -0
- teradataml/data/Pi_loan.csv +7 -0
- teradataml/data/SMOOTHED_DATA.csv +7 -0
- teradataml/data/TestDFFT8.csv +9 -0
- teradataml/data/TestRiver.csv +109 -0
- teradataml/data/Traindata.csv +28 -0
- teradataml/data/__init__.py +0 -0
- teradataml/data/acf.csv +17 -0
- teradataml/data/adaboost_example.json +34 -0
- teradataml/data/adaboostpredict_example.json +24 -0
- teradataml/data/additional_table.csv +11 -0
- teradataml/data/admissions_test.csv +21 -0
- teradataml/data/admissions_train.csv +41 -0
- teradataml/data/admissions_train_nulls.csv +41 -0
- teradataml/data/advertising.csv +201 -0
- teradataml/data/ageandheight.csv +13 -0
- teradataml/data/ageandpressure.csv +31 -0
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/antiselect_example.json +36 -0
- teradataml/data/antiselect_input.csv +8 -0
- teradataml/data/antiselect_input_mixed_case.csv +8 -0
- teradataml/data/applicant_external.csv +7 -0
- teradataml/data/applicant_reference.csv +7 -0
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/arima_example.json +9 -0
- teradataml/data/assortedtext_input.csv +8 -0
- teradataml/data/attribution_example.json +34 -0
- teradataml/data/attribution_sample_table.csv +27 -0
- teradataml/data/attribution_sample_table1.csv +6 -0
- teradataml/data/attribution_sample_table2.csv +11 -0
- teradataml/data/bank_churn.csv +10001 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bank_web_clicks1.csv +43 -0
- teradataml/data/bank_web_clicks2.csv +91 -0
- teradataml/data/bank_web_url.csv +85 -0
- teradataml/data/barrier.csv +2 -0
- teradataml/data/barrier_new.csv +3 -0
- teradataml/data/betweenness_example.json +14 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/bin_breaks.csv +8 -0
- teradataml/data/bin_fit_ip.csv +4 -0
- teradataml/data/binary_complex_left.csv +11 -0
- teradataml/data/binary_complex_right.csv +11 -0
- teradataml/data/binary_matrix_complex_left.csv +21 -0
- teradataml/data/binary_matrix_complex_right.csv +21 -0
- teradataml/data/binary_matrix_real_left.csv +21 -0
- teradataml/data/binary_matrix_real_right.csv +21 -0
- teradataml/data/blood2ageandweight.csv +26 -0
- teradataml/data/bmi.csv +501 -0
- teradataml/data/boston.csv +507 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/buoydata_mix.csv +11 -0
- teradataml/data/burst_data.csv +5 -0
- teradataml/data/burst_example.json +21 -0
- teradataml/data/byom_example.json +34 -0
- teradataml/data/bytes_table.csv +4 -0
- teradataml/data/cal_housing_ex_raw.csv +70 -0
- teradataml/data/callers.csv +7 -0
- teradataml/data/calls.csv +10 -0
- teradataml/data/cars_hist.csv +33 -0
- teradataml/data/cat_table.csv +25 -0
- teradataml/data/ccm_example.json +32 -0
- teradataml/data/ccm_input.csv +91 -0
- teradataml/data/ccm_input2.csv +13 -0
- teradataml/data/ccmexample.csv +101 -0
- teradataml/data/ccmprepare_example.json +9 -0
- teradataml/data/ccmprepare_input.csv +91 -0
- teradataml/data/cfilter_example.json +12 -0
- teradataml/data/changepointdetection_example.json +18 -0
- teradataml/data/changepointdetectionrt_example.json +8 -0
- teradataml/data/chi_sq.csv +3 -0
- teradataml/data/churn_data.csv +14 -0
- teradataml/data/churn_emission.csv +35 -0
- teradataml/data/churn_initial.csv +3 -0
- teradataml/data/churn_state_transition.csv +5 -0
- teradataml/data/citedges_2.csv +745 -0
- teradataml/data/citvertices_2.csv +1210 -0
- teradataml/data/clicks2.csv +16 -0
- teradataml/data/clickstream.csv +13 -0
- teradataml/data/clickstream1.csv +11 -0
- teradataml/data/closeness_example.json +16 -0
- teradataml/data/complaints.csv +21 -0
- teradataml/data/complaints_mini.csv +3 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_testtoken.csv +224 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/complaints_tokens_test.csv +353 -0
- teradataml/data/complaints_traintoken.csv +472 -0
- teradataml/data/computers_category.csv +1001 -0
- teradataml/data/computers_test1.csv +1252 -0
- teradataml/data/computers_train1.csv +5009 -0
- teradataml/data/computers_train1_clustered.csv +5009 -0
- teradataml/data/confusionmatrix_example.json +9 -0
- teradataml/data/conversion_event_table.csv +3 -0
- teradataml/data/corr_input.csv +17 -0
- teradataml/data/correlation_example.json +11 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/coxhazardratio_example.json +39 -0
- teradataml/data/coxph_example.json +15 -0
- teradataml/data/coxsurvival_example.json +28 -0
- teradataml/data/cpt.csv +41 -0
- teradataml/data/credit_ex_merged.csv +45 -0
- teradataml/data/creditcard_data.csv +1001 -0
- teradataml/data/customer_loyalty.csv +301 -0
- teradataml/data/customer_loyalty_newseq.csv +31 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +173 -0
- teradataml/data/decisionforest_example.json +37 -0
- teradataml/data/decisionforestpredict_example.json +38 -0
- teradataml/data/decisiontree_example.json +21 -0
- teradataml/data/decisiontreepredict_example.json +45 -0
- teradataml/data/dfft2_size4_real.csv +17 -0
- teradataml/data/dfft2_test_matrix16.csv +17 -0
- teradataml/data/dfft2conv_real_4_4.csv +65 -0
- teradataml/data/diabetes.csv +443 -0
- teradataml/data/diabetes_test.csv +89 -0
- teradataml/data/dict_table.csv +5 -0
- teradataml/data/docperterm_table.csv +4 -0
- teradataml/data/docs/__init__.py +1 -0
- teradataml/data/docs/byom/__init__.py +0 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +180 -0
- teradataml/data/docs/byom/docs/DataikuPredict.py +217 -0
- teradataml/data/docs/byom/docs/H2OPredict.py +325 -0
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +283 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/byom/docs/PMMLPredict.py +278 -0
- teradataml/data/docs/byom/docs/__init__.py +0 -0
- teradataml/data/docs/sqle/__init__.py +0 -0
- teradataml/data/docs/sqle/docs_17_10/Antiselect.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/Attribution.py +200 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeFit.py +172 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeTransform.py +131 -0
- teradataml/data/docs/sqle/docs_17_10/CategoricalSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_10/ChiSq.py +90 -0
- teradataml/data/docs/sqle/docs_17_10/ColumnSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_10/ConvertTo.py +96 -0
- teradataml/data/docs/sqle/docs_17_10/DecisionForestPredict.py +139 -0
- teradataml/data/docs/sqle/docs_17_10/DecisionTreePredict.py +152 -0
- teradataml/data/docs/sqle/docs_17_10/FTest.py +161 -0
- teradataml/data/docs/sqle/docs_17_10/FillRowId.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/Fit.py +88 -0
- teradataml/data/docs/sqle/docs_17_10/GLMPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_10/GetRowsWithMissingValues.py +85 -0
- teradataml/data/docs/sqle/docs_17_10/GetRowsWithoutMissingValues.py +82 -0
- teradataml/data/docs/sqle/docs_17_10/Histogram.py +165 -0
- teradataml/data/docs/sqle/docs_17_10/MovingAverage.py +134 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +209 -0
- teradataml/data/docs/sqle/docs_17_10/NPath.py +266 -0
- teradataml/data/docs/sqle/docs_17_10/NaiveBayesPredict.py +116 -0
- teradataml/data/docs/sqle/docs_17_10/NaiveBayesTextClassifierPredict.py +176 -0
- teradataml/data/docs/sqle/docs_17_10/NumApply.py +147 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +135 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +109 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterFit.py +166 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/Pack.py +128 -0
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesFit.py +112 -0
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesTransform.py +102 -0
- teradataml/data/docs/sqle/docs_17_10/QQNorm.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/RoundColumns.py +110 -0
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeFit.py +118 -0
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeTransform.py +99 -0
- teradataml/data/docs/sqle/docs_17_10/SVMSparsePredict.py +153 -0
- teradataml/data/docs/sqle/docs_17_10/ScaleFit.py +197 -0
- teradataml/data/docs/sqle/docs_17_10/ScaleTransform.py +99 -0
- teradataml/data/docs/sqle/docs_17_10/Sessionize.py +114 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeFit.py +116 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeTransform.py +98 -0
- teradataml/data/docs/sqle/docs_17_10/StrApply.py +187 -0
- teradataml/data/docs/sqle/docs_17_10/StringSimilarity.py +146 -0
- teradataml/data/docs/sqle/docs_17_10/Transform.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/UnivariateStatistics.py +142 -0
- teradataml/data/docs/sqle/docs_17_10/Unpack.py +214 -0
- teradataml/data/docs/sqle/docs_17_10/WhichMax.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/WhichMin.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/ZTest.py +155 -0
- teradataml/data/docs/sqle/docs_17_10/__init__.py +0 -0
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +186 -0
- teradataml/data/docs/sqle/docs_17_20/Antiselect.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/Attribution.py +201 -0
- teradataml/data/docs/sqle/docs_17_20/BincodeFit.py +172 -0
- teradataml/data/docs/sqle/docs_17_20/BincodeTransform.py +139 -0
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/CategoricalSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_20/ChiSq.py +90 -0
- teradataml/data/docs/sqle/docs_17_20/ClassificationEvaluator.py +166 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +246 -0
- teradataml/data/docs/sqle/docs_17_20/ConvertTo.py +113 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionForest.py +280 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionForestPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionTreePredict.py +136 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +240 -0
- teradataml/data/docs/sqle/docs_17_20/FillRowId.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/Fit.py +88 -0
- teradataml/data/docs/sqle/docs_17_20/GLM.py +541 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPerSegment.py +415 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPredictPerSegment.py +233 -0
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +125 -0
- teradataml/data/docs/sqle/docs_17_20/GetRowsWithMissingValues.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/GetRowsWithoutMissingValues.py +106 -0
- teradataml/data/docs/sqle/docs_17_20/Histogram.py +224 -0
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +251 -0
- teradataml/data/docs/sqle/docs_17_20/KMeansPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/KNN.py +215 -0
- teradataml/data/docs/sqle/docs_17_20/MovingAverage.py +134 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +209 -0
- teradataml/data/docs/sqle/docs_17_20/NPath.py +266 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesPredict.py +116 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierPredict.py +177 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierTrainer.py +127 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineTransform.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/NumApply.py +147 -0
- teradataml/data/docs/sqle/docs_17_20/OneClassSVM.py +307 -0
- teradataml/data/docs/sqle/docs_17_20/OneClassSVMPredict.py +185 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +231 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/OrdinalEncodingFit.py +220 -0
- teradataml/data/docs/sqle/docs_17_20/OrdinalEncodingTransform.py +127 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +191 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterTransform.py +117 -0
- teradataml/data/docs/sqle/docs_17_20/Pack.py +128 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesFit.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesTransform.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/QQNorm.py +105 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +164 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionFit.py +155 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionMinComponents.py +106 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionTransform.py +120 -0
- teradataml/data/docs/sqle/docs_17_20/RegressionEvaluator.py +211 -0
- teradataml/data/docs/sqle/docs_17_20/RoundColumns.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeFit.py +118 -0
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeTransform.py +111 -0
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/SVM.py +414 -0
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +213 -0
- teradataml/data/docs/sqle/docs_17_20/SVMSparsePredict.py +153 -0
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +315 -0
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +202 -0
- teradataml/data/docs/sqle/docs_17_20/SentimentExtractor.py +206 -0
- teradataml/data/docs/sqle/docs_17_20/Sessionize.py +114 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +225 -0
- teradataml/data/docs/sqle/docs_17_20/Silhouette.py +153 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeFit.py +116 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeTransform.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/StrApply.py +187 -0
- teradataml/data/docs/sqle/docs_17_20/StringSimilarity.py +146 -0
- teradataml/data/docs/sqle/docs_17_20/TDDecisionForestPredict.py +207 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +333 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingFit.py +267 -0
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingTransform.py +141 -0
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +224 -0
- teradataml/data/docs/sqle/docs_17_20/TrainTestSplit.py +160 -0
- teradataml/data/docs/sqle/docs_17_20/Transform.py +123 -0
- teradataml/data/docs/sqle/docs_17_20/UnivariateStatistics.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpack.py +214 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/VectorDistance.py +169 -0
- teradataml/data/docs/sqle/docs_17_20/WhichMax.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/WhichMin.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/WordEmbeddings.py +237 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +362 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +281 -0
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +220 -0
- teradataml/data/docs/sqle/docs_17_20/__init__.py +0 -0
- teradataml/data/docs/tableoperator/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_00/ReadNOS.py +430 -0
- teradataml/data/docs/tableoperator/docs_17_00/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_05/ReadNOS.py +430 -0
- teradataml/data/docs/tableoperator/docs_17_05/WriteNOS.py +348 -0
- teradataml/data/docs/tableoperator/docs_17_05/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_10/ReadNOS.py +429 -0
- teradataml/data/docs/tableoperator/docs_17_10/WriteNOS.py +348 -0
- teradataml/data/docs/tableoperator/docs_17_10/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/tableoperator/docs_17_20/ReadNOS.py +440 -0
- teradataml/data/docs/tableoperator/docs_17_20/WriteNOS.py +387 -0
- teradataml/data/docs/tableoperator/docs_17_20/__init__.py +0 -0
- teradataml/data/docs/uaf/__init__.py +0 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +186 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +370 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +172 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +161 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BinaryMatrixOp.py +248 -0
- teradataml/data/docs/uaf/docs_17_20/BinarySeriesOp.py +252 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +178 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +175 -0
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +230 -0
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +218 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +185 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +204 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +216 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +216 -0
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +192 -0
- teradataml/data/docs/uaf/docs_17_20/DIFF.py +175 -0
- teradataml/data/docs/uaf/docs_17_20/DTW.py +180 -0
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +217 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +142 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +184 -0
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +185 -0
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/FitMetrics.py +172 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesFormula.py +206 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +143 -0
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +198 -0
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +260 -0
- teradataml/data/docs/uaf/docs_17_20/IDFFT.py +165 -0
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +191 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/InputValidator.py +121 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +156 -0
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +215 -0
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +174 -0
- teradataml/data/docs/uaf/docs_17_20/MInfo.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +191 -0
- teradataml/data/docs/uaf/docs_17_20/PACF.py +157 -0
- teradataml/data/docs/uaf/docs_17_20/Portman.py +217 -0
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +203 -0
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +155 -0
- teradataml/data/docs/uaf/docs_17_20/Resample.py +237 -0
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SInfo.py +123 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +173 -0
- teradataml/data/docs/uaf/docs_17_20/SelectionCriteria.py +174 -0
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/SignifResidmean.py +164 -0
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +180 -0
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +208 -0
- teradataml/data/docs/uaf/docs_17_20/TrackingOp.py +151 -0
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/Unnormalize.py +202 -0
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/docs/uaf/docs_17_20/__init__.py +0 -0
- teradataml/data/dtw_example.json +18 -0
- teradataml/data/dtw_t1.csv +11 -0
- teradataml/data/dtw_t2.csv +4 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt2d_example.json +16 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_example.json +15 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/dwt_filter_dim.csv +5 -0
- teradataml/data/emission.csv +9 -0
- teradataml/data/emp_table_by_dept.csv +19 -0
- teradataml/data/employee_info.csv +4 -0
- teradataml/data/employee_table.csv +6 -0
- teradataml/data/excluding_event_table.csv +2 -0
- teradataml/data/finance_data.csv +6 -0
- teradataml/data/finance_data2.csv +61 -0
- teradataml/data/finance_data3.csv +93 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/fish.csv +160 -0
- teradataml/data/fm_blood2ageandweight.csv +26 -0
- teradataml/data/fmeasure_example.json +12 -0
- teradataml/data/followers_leaders.csv +10 -0
- teradataml/data/fpgrowth_example.json +12 -0
- teradataml/data/frequentpaths_example.json +29 -0
- teradataml/data/friends.csv +9 -0
- teradataml/data/fs_input.csv +33 -0
- teradataml/data/fs_input1.csv +33 -0
- teradataml/data/genData.csv +513 -0
- teradataml/data/geodataframe_example.json +40 -0
- teradataml/data/glass_types.csv +215 -0
- teradataml/data/glm_admissions_model.csv +12 -0
- teradataml/data/glm_example.json +56 -0
- teradataml/data/glml1l2_example.json +28 -0
- teradataml/data/glml1l2predict_example.json +54 -0
- teradataml/data/glmpredict_example.json +54 -0
- teradataml/data/gq_t1.csv +21 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/hconvolve_complex_right.csv +5 -0
- teradataml/data/hconvolve_complex_rightmulti.csv +5 -0
- teradataml/data/histogram_example.json +12 -0
- teradataml/data/hmmdecoder_example.json +79 -0
- teradataml/data/hmmevaluator_example.json +25 -0
- teradataml/data/hmmsupervised_example.json +10 -0
- teradataml/data/hmmunsupervised_example.json +8 -0
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/house_values.csv +12 -0
- teradataml/data/house_values2.csv +13 -0
- teradataml/data/housing_cat.csv +7 -0
- teradataml/data/housing_data.csv +9 -0
- teradataml/data/housing_test.csv +47 -0
- teradataml/data/housing_test_binary.csv +47 -0
- teradataml/data/housing_train.csv +493 -0
- teradataml/data/housing_train_attribute.csv +5 -0
- teradataml/data/housing_train_binary.csv +437 -0
- teradataml/data/housing_train_parameter.csv +2 -0
- teradataml/data/housing_train_response.csv +493 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/ibm_stock.csv +370 -0
- teradataml/data/ibm_stock1.csv +370 -0
- teradataml/data/identitymatch_example.json +22 -0
- teradataml/data/idf_table.csv +4 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/impressions.csv +101 -0
- teradataml/data/inflation.csv +21 -0
- teradataml/data/initial.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/insect_sprays.csv +13 -0
- teradataml/data/insurance.csv +1339 -0
- teradataml/data/interpolator_example.json +13 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/iris_altinput.csv +481 -0
- teradataml/data/iris_attribute_output.csv +8 -0
- teradataml/data/iris_attribute_test.csv +121 -0
- teradataml/data/iris_attribute_train.csv +481 -0
- teradataml/data/iris_category_expect_predict.csv +31 -0
- teradataml/data/iris_data.csv +151 -0
- teradataml/data/iris_input.csv +151 -0
- teradataml/data/iris_response_train.csv +121 -0
- teradataml/data/iris_test.csv +31 -0
- teradataml/data/iris_train.csv +121 -0
- teradataml/data/join_table1.csv +4 -0
- teradataml/data/join_table2.csv +4 -0
- teradataml/data/jsons/anly_function_name.json +7 -0
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/byom/dataikupredict.json +148 -0
- teradataml/data/jsons/byom/datarobotpredict.json +147 -0
- teradataml/data/jsons/byom/h2opredict.json +195 -0
- teradataml/data/jsons/byom/onnxembeddings.json +267 -0
- teradataml/data/jsons/byom/onnxpredict.json +187 -0
- teradataml/data/jsons/byom/pmmlpredict.json +147 -0
- teradataml/data/jsons/paired_functions.json +450 -0
- teradataml/data/jsons/sqle/16.20/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/16.20/Attribution.json +249 -0
- teradataml/data/jsons/sqle/16.20/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/16.20/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/16.20/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/16.20/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/16.20/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/16.20/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/16.20/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/16.20/Pack.json +98 -0
- teradataml/data/jsons/sqle/16.20/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/16.20/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/16.20/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/16.20/Unpack.json +166 -0
- teradataml/data/jsons/sqle/16.20/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.00/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.00/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.00/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/17.00/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/17.00/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/17.00/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.00/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.00/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/17.00/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/17.00/Pack.json +98 -0
- teradataml/data/jsons/sqle/17.00/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/17.00/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.00/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.00/Unpack.json +166 -0
- teradataml/data/jsons/sqle/17.00/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.05/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.05/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.05/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/17.05/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/17.05/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/17.05/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.05/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.05/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/17.05/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/17.05/Pack.json +98 -0
- teradataml/data/jsons/sqle/17.05/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/17.05/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.05/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.05/Unpack.json +166 -0
- teradataml/data/jsons/sqle/17.05/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.10/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.10/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.10/DecisionForestPredict.json +185 -0
- teradataml/data/jsons/sqle/17.10/DecisionTreePredict.json +172 -0
- teradataml/data/jsons/sqle/17.10/GLMPredict.json +151 -0
- teradataml/data/jsons/sqle/17.10/MovingAverage.json +368 -0
- teradataml/data/jsons/sqle/17.10/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.10/NaiveBayesPredict.json +149 -0
- teradataml/data/jsons/sqle/17.10/NaiveBayesTextClassifierPredict.json +288 -0
- teradataml/data/jsons/sqle/17.10/Pack.json +133 -0
- teradataml/data/jsons/sqle/17.10/SVMSparsePredict.json +193 -0
- teradataml/data/jsons/sqle/17.10/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.10/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.10/TD_BinCodeFit.json +239 -0
- teradataml/data/jsons/sqle/17.10/TD_BinCodeTransform.json +70 -0
- teradataml/data/jsons/sqle/17.10/TD_CategoricalSummary.json +54 -0
- teradataml/data/jsons/sqle/17.10/TD_Chisq.json +68 -0
- teradataml/data/jsons/sqle/17.10/TD_ColumnSummary.json +54 -0
- teradataml/data/jsons/sqle/17.10/TD_ConvertTo.json +69 -0
- teradataml/data/jsons/sqle/17.10/TD_FTest.json +187 -0
- teradataml/data/jsons/sqle/17.10/TD_FillRowID.json +52 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionFit.json +46 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionTransform.json +72 -0
- teradataml/data/jsons/sqle/17.10/TD_GetRowsWithMissingValues.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_GetRowsWithoutMissingValues.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_Histogram.json +133 -0
- teradataml/data/jsons/sqle/17.10/TD_NumApply.json +147 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingFit.json +183 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingTransform.json +66 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterFit.json +197 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterTransform.json +48 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesFit.json +114 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesTransform.json +72 -0
- teradataml/data/jsons/sqle/17.10/TD_QQNorm.json +112 -0
- teradataml/data/jsons/sqle/17.10/TD_RoundColumns.json +93 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeFit.json +128 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleFit.json +157 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleTransform.json +71 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeFit.json +148 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeTransform.json +48 -0
- teradataml/data/jsons/sqle/17.10/TD_StrApply.json +240 -0
- teradataml/data/jsons/sqle/17.10/TD_UnivariateStatistics.json +119 -0
- teradataml/data/jsons/sqle/17.10/TD_WhichMax.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_WhichMin.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_ZTest.json +171 -0
- teradataml/data/jsons/sqle/17.10/Unpack.json +188 -0
- teradataml/data/jsons/sqle/17.10/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.20/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.20/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.20/DecisionForestPredict.json +185 -0
- teradataml/data/jsons/sqle/17.20/DecisionTreePredict.json +172 -0
- teradataml/data/jsons/sqle/17.20/GLMPredict.json +151 -0
- teradataml/data/jsons/sqle/17.20/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.20/NaiveBayesPredict.json +149 -0
- teradataml/data/jsons/sqle/17.20/NaiveBayesTextClassifierPredict.json +287 -0
- teradataml/data/jsons/sqle/17.20/Pack.json +133 -0
- teradataml/data/jsons/sqle/17.20/SVMSparsePredict.json +192 -0
- teradataml/data/jsons/sqle/17.20/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.20/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +149 -0
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_BinCodeFit.json +239 -0
- teradataml/data/jsons/sqle/17.20/TD_BinCodeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_CategoricalSummary.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_Chisq.json +68 -0
- teradataml/data/jsons/sqle/17.20/TD_ClassificationEvaluator.json +146 -0
- teradataml/data/jsons/sqle/17.20/TD_ColumnSummary.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_ColumnTransformer.json +218 -0
- teradataml/data/jsons/sqle/17.20/TD_ConvertTo.json +92 -0
- teradataml/data/jsons/sqle/17.20/TD_DecisionForest.json +260 -0
- teradataml/data/jsons/sqle/17.20/TD_DecisionForestPredict.json +139 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +269 -0
- teradataml/data/jsons/sqle/17.20/TD_FillRowID.json +52 -0
- teradataml/data/jsons/sqle/17.20/TD_FunctionFit.json +46 -0
- teradataml/data/jsons/sqle/17.20/TD_FunctionTransform.json +72 -0
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +507 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +168 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPerSegment.json +411 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPredictPerSegment.json +146 -0
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +93 -0
- teradataml/data/jsons/sqle/17.20/TD_GetRowsWithMissingValues.json +76 -0
- teradataml/data/jsons/sqle/17.20/TD_GetRowsWithoutMissingValues.json +76 -0
- teradataml/data/jsons/sqle/17.20/TD_Histogram.json +152 -0
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +232 -0
- teradataml/data/jsons/sqle/17.20/TD_KMeansPredict.json +87 -0
- teradataml/data/jsons/sqle/17.20/TD_KNN.json +262 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesTextClassifierTrainer.json +137 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +102 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_NumApply.json +147 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +316 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVMPredict.json +124 -0
- teradataml/data/jsons/sqle/17.20/TD_OneHotEncodingFit.json +271 -0
- teradataml/data/jsons/sqle/17.20/TD_OneHotEncodingTransform.json +65 -0
- teradataml/data/jsons/sqle/17.20/TD_OrdinalEncodingFit.json +229 -0
- teradataml/data/jsons/sqle/17.20/TD_OrdinalEncodingTransform.json +75 -0
- teradataml/data/jsons/sqle/17.20/TD_OutlierFilterFit.json +217 -0
- teradataml/data/jsons/sqle/17.20/TD_OutlierFilterTransform.json +48 -0
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_PolynomialFeaturesFit.json +114 -0
- teradataml/data/jsons/sqle/17.20/TD_PolynomialFeaturesTransform.json +72 -0
- teradataml/data/jsons/sqle/17.20/TD_QQNorm.json +111 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +179 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionFit.json +179 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionMinComponents.json +74 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionTransform.json +74 -0
- teradataml/data/jsons/sqle/17.20/TD_RegressionEvaluator.json +138 -0
- teradataml/data/jsons/sqle/17.20/TD_RoundColumns.json +93 -0
- teradataml/data/jsons/sqle/17.20/TD_RowNormalizeFit.json +128 -0
- teradataml/data/jsons/sqle/17.20/TD_RowNormalizeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +389 -0
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +142 -0
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +310 -0
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +120 -0
- teradataml/data/jsons/sqle/17.20/TD_SentimentExtractor.json +194 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +221 -0
- teradataml/data/jsons/sqle/17.20/TD_Silhouette.json +143 -0
- teradataml/data/jsons/sqle/17.20/TD_SimpleImputeFit.json +147 -0
- teradataml/data/jsons/sqle/17.20/TD_SimpleImputeTransform.json +48 -0
- teradataml/data/jsons/sqle/17.20/TD_StrApply.json +240 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_TargetEncodingFit.json +248 -0
- teradataml/data/jsons/sqle/17.20/TD_TargetEncodingTransform.json +75 -0
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +297 -0
- teradataml/data/jsons/sqle/17.20/TD_TrainTestSplit.json +142 -0
- teradataml/data/jsons/sqle/17.20/TD_UnivariateStatistics.json +117 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_VectorDistance.json +183 -0
- teradataml/data/jsons/sqle/17.20/TD_WhichMax.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_WhichMin.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_WordEmbeddings.json +241 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +330 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +195 -0
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +247 -0
- teradataml/data/jsons/sqle/17.20/Unpack.json +188 -0
- teradataml/data/jsons/sqle/17.20/nPath.json +269 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +370 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +460 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +385 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +400 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +401 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +384 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +384 -0
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/tableoperator/17.00/read_nos.json +198 -0
- teradataml/data/jsons/tableoperator/17.05/read_nos.json +198 -0
- teradataml/data/jsons/tableoperator/17.05/write_nos.json +195 -0
- teradataml/data/jsons/tableoperator/17.10/read_nos.json +184 -0
- teradataml/data/jsons/tableoperator/17.10/write_nos.json +195 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/tableoperator/17.20/read_nos.json +183 -0
- teradataml/data/jsons/tableoperator/17.20/write_nos.json +224 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +132 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +396 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +77 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +153 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +107 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +106 -0
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +89 -0
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +104 -0
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +66 -0
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +87 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +134 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +144 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +108 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +108 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_DIFF.json +92 -0
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +114 -0
- teradataml/data/jsons/uaf/17.20/TD_DURBIN_WATSON.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_EXTRACT_RESULTS.json +39 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_GENSERIES4FORMULA.json +85 -0
- teradataml/data/jsons/uaf/17.20/TD_GENSERIES4SINUSOIDS.json +71 -0
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +139 -0
- teradataml/data/jsons/uaf/17.20/TD_HOLT_WINTERS_FORECASTER.json +313 -0
- teradataml/data/jsons/uaf/17.20/TD_IDFFT.json +58 -0
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +81 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_INPUTVALIDATOR.json +64 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +182 -0
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +103 -0
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +181 -0
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_MATRIXMULTIPLY.json +68 -0
- teradataml/data/jsons/uaf/17.20/TD_MINFO.json +67 -0
- teradataml/data/jsons/uaf/17.20/TD_MULTIVAR_REGR.json +179 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +114 -0
- teradataml/data/jsons/uaf/17.20/TD_PORTMAN.json +119 -0
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +175 -0
- teradataml/data/jsons/uaf/17.20/TD_POWERTRANSFORM.json +98 -0
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +194 -0
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +143 -0
- teradataml/data/jsons/uaf/17.20/TD_SELECTION_CRITERIA.json +90 -0
- teradataml/data/jsons/uaf/17.20/TD_SIGNIF_PERIODICITIES.json +80 -0
- teradataml/data/jsons/uaf/17.20/TD_SIGNIF_RESIDMEAN.json +68 -0
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +184 -0
- teradataml/data/jsons/uaf/17.20/TD_SINFO.json +58 -0
- teradataml/data/jsons/uaf/17.20/TD_SMOOTHMA.json +163 -0
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +112 -0
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +95 -0
- teradataml/data/jsons/uaf/17.20/TD_WHITES_GENERAL.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
- teradataml/data/kmeans_example.json +23 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/kmeans_us_arrests_data.csv +51 -0
- teradataml/data/knn_example.json +19 -0
- teradataml/data/knnrecommender_example.json +7 -0
- teradataml/data/knnrecommenderpredict_example.json +12 -0
- teradataml/data/lar_example.json +17 -0
- teradataml/data/larpredict_example.json +30 -0
- teradataml/data/lc_new_predictors.csv +5 -0
- teradataml/data/lc_new_reference.csv +9 -0
- teradataml/data/lda_example.json +9 -0
- teradataml/data/ldainference_example.json +15 -0
- teradataml/data/ldatopicsummary_example.json +9 -0
- teradataml/data/levendist_input.csv +13 -0
- teradataml/data/levenshteindistance_example.json +10 -0
- teradataml/data/linreg_example.json +10 -0
- teradataml/data/load_example_data.py +350 -0
- teradataml/data/loan_prediction.csv +295 -0
- teradataml/data/lungcancer.csv +138 -0
- teradataml/data/mappingdata.csv +12 -0
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/milk_timeseries.csv +157 -0
- teradataml/data/min_max_titanic.csv +4 -0
- teradataml/data/minhash_example.json +6 -0
- teradataml/data/ml_ratings.csv +7547 -0
- teradataml/data/ml_ratings_10.csv +2445 -0
- teradataml/data/mobile_data.csv +13 -0
- teradataml/data/model1_table.csv +5 -0
- teradataml/data/model2_table.csv +5 -0
- teradataml/data/models/License_file.txt +1 -0
- teradataml/data/models/License_file_empty.txt +0 -0
- teradataml/data/models/dataiku_iris_data_ann_thin +0 -0
- teradataml/data/models/dr_iris_rf +0 -0
- teradataml/data/models/iris_db_dt_model_sklearn.onnx +0 -0
- teradataml/data/models/iris_db_dt_model_sklearn_floattensor.onnx +0 -0
- teradataml/data/models/iris_db_glm_model.pmml +57 -0
- teradataml/data/models/iris_db_xgb_model.pmml +4471 -0
- teradataml/data/models/iris_kmeans_model +0 -0
- teradataml/data/models/iris_mojo_glm_h2o_model +0 -0
- teradataml/data/models/iris_mojo_xgb_h2o_model +0 -0
- teradataml/data/modularity_example.json +12 -0
- teradataml/data/movavg_example.json +8 -0
- teradataml/data/mtx1.csv +7 -0
- teradataml/data/mtx2.csv +13 -0
- teradataml/data/multi_model_classification.csv +401 -0
- teradataml/data/multi_model_regression.csv +401 -0
- teradataml/data/mvdfft8.csv +9 -0
- teradataml/data/naivebayes_example.json +10 -0
- teradataml/data/naivebayespredict_example.json +19 -0
- teradataml/data/naivebayestextclassifier2_example.json +7 -0
- teradataml/data/naivebayestextclassifier_example.json +8 -0
- teradataml/data/naivebayestextclassifierpredict_example.json +32 -0
- teradataml/data/name_Find_configure.csv +10 -0
- teradataml/data/namedentityfinder_example.json +14 -0
- teradataml/data/namedentityfinderevaluator_example.json +10 -0
- teradataml/data/namedentityfindertrainer_example.json +6 -0
- teradataml/data/nb_iris_input_test.csv +31 -0
- teradataml/data/nb_iris_input_train.csv +121 -0
- teradataml/data/nbp_iris_model.csv +13 -0
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_extractor_text.csv +2 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/ner_sports_test2.csv +29 -0
- teradataml/data/ner_sports_train.csv +501 -0
- teradataml/data/nerevaluator_example.json +6 -0
- teradataml/data/nerextractor_example.json +18 -0
- teradataml/data/nermem_sports_test.csv +18 -0
- teradataml/data/nermem_sports_train.csv +51 -0
- teradataml/data/nertrainer_example.json +7 -0
- teradataml/data/ngrams_example.json +7 -0
- teradataml/data/notebooks/__init__.py +0 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Aggregate Functions using SQLAlchemy.ipynb +1455 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Arithmetic Functions Using SQLAlchemy.ipynb +1993 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Bit-Byte Manipulation Functions using SQLAlchemy.ipynb +1492 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Built-in functions using SQLAlchemy.ipynb +536 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Regular Expressions Using SQLAlchemy.ipynb +570 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage String Functions Using SQLAlchemy.ipynb +2559 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Window Aggregate Functions using SQLAlchemy.ipynb +2911 -0
- teradataml/data/notebooks/sqlalchemy/Using Generic SQLAlchemy ClauseElements teradataml DataFrame assign method.ipynb +698 -0
- teradataml/data/notebooks/sqlalchemy/__init__.py +0 -0
- teradataml/data/notebooks/sqlalchemy/teradataml filtering using SQLAlchemy ClauseElements.ipynb +784 -0
- teradataml/data/npath_example.json +23 -0
- teradataml/data/ntree_example.json +14 -0
- teradataml/data/numeric_strings.csv +5 -0
- teradataml/data/numerics.csv +4 -0
- teradataml/data/ocean_buoy.csv +17 -0
- teradataml/data/ocean_buoy2.csv +17 -0
- teradataml/data/ocean_buoys.csv +28 -0
- teradataml/data/ocean_buoys2.csv +10 -0
- teradataml/data/ocean_buoys_nonpti.csv +28 -0
- teradataml/data/ocean_buoys_seq.csv +29 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +92 -0
- teradataml/data/optional_event_table.csv +4 -0
- teradataml/data/orders1.csv +11 -0
- teradataml/data/orders1_12.csv +13 -0
- teradataml/data/orders_ex.csv +4 -0
- teradataml/data/pack_example.json +9 -0
- teradataml/data/package_tracking.csv +19 -0
- teradataml/data/package_tracking_pti.csv +19 -0
- teradataml/data/pagerank_example.json +13 -0
- teradataml/data/paragraphs_input.csv +6 -0
- teradataml/data/pathanalyzer_example.json +8 -0
- teradataml/data/pathgenerator_example.json +8 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/pattern_matching_data.csv +11 -0
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/phrases.csv +7 -0
- teradataml/data/pivot_example.json +9 -0
- teradataml/data/pivot_input.csv +22 -0
- teradataml/data/playerRating.csv +31 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/postagger_example.json +7 -0
- teradataml/data/posttagger_output.csv +44 -0
- teradataml/data/production_data.csv +17 -0
- teradataml/data/production_data2.csv +7 -0
- teradataml/data/randomsample_example.json +32 -0
- teradataml/data/randomwalksample_example.json +9 -0
- teradataml/data/rank_table.csv +6 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/ref_mobile_data.csv +4 -0
- teradataml/data/ref_mobile_data_dense.csv +2 -0
- teradataml/data/ref_url.csv +17 -0
- teradataml/data/restaurant_reviews.csv +7 -0
- teradataml/data/retail_churn_table.csv +27772 -0
- teradataml/data/river_data.csv +145 -0
- teradataml/data/roc_example.json +8 -0
- teradataml/data/roc_input.csv +101 -0
- teradataml/data/rule_inputs.csv +6 -0
- teradataml/data/rule_table.csv +2 -0
- teradataml/data/sales.csv +7 -0
- teradataml/data/sales_transaction.csv +501 -0
- teradataml/data/salesdata.csv +342 -0
- teradataml/data/sample_cities.csv +3 -0
- teradataml/data/sample_shapes.csv +11 -0
- teradataml/data/sample_streets.csv +3 -0
- teradataml/data/sampling_example.json +16 -0
- teradataml/data/sax_example.json +17 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +74 -0
- teradataml/data/scale_housing.csv +11 -0
- teradataml/data/scale_housing_test.csv +6 -0
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scale_stat.csv +11 -0
- teradataml/data/scalebypartition_example.json +13 -0
- teradataml/data/scalemap_example.json +13 -0
- teradataml/data/scalesummary_example.json +12 -0
- teradataml/data/score_category.csv +101 -0
- teradataml/data/score_summary.csv +4 -0
- teradataml/data/script_example.json +10 -0
- teradataml/data/scripts/deploy_script.py +84 -0
- teradataml/data/scripts/lightgbm/dataset.template +175 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +264 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +234 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +177 -0
- teradataml/data/scripts/mapper.R +20 -0
- teradataml/data/scripts/mapper.py +16 -0
- teradataml/data/scripts/mapper_replace.py +16 -0
- teradataml/data/scripts/sklearn/__init__.py +0 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +205 -0
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +148 -0
- teradataml/data/scripts/sklearn/sklearn_function.template +144 -0
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +166 -0
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +161 -0
- teradataml/data/scripts/sklearn/sklearn_score.py +145 -0
- teradataml/data/scripts/sklearn/sklearn_transform.py +327 -0
- teradataml/data/sdk/modelops/modelops_spec.json +101737 -0
- teradataml/data/seeds.csv +10 -0
- teradataml/data/sentenceextractor_example.json +7 -0
- teradataml/data/sentiment_extract_input.csv +11 -0
- teradataml/data/sentiment_train.csv +16 -0
- teradataml/data/sentiment_word.csv +20 -0
- teradataml/data/sentiment_word_input.csv +20 -0
- teradataml/data/sentimentextractor_example.json +24 -0
- teradataml/data/sentimenttrainer_example.json +8 -0
- teradataml/data/sequence_table.csv +10 -0
- teradataml/data/seriessplitter_example.json +8 -0
- teradataml/data/sessionize_example.json +17 -0
- teradataml/data/sessionize_table.csv +116 -0
- teradataml/data/setop_test1.csv +24 -0
- teradataml/data/setop_test2.csv +22 -0
- teradataml/data/soc_nw_edges.csv +11 -0
- teradataml/data/soc_nw_vertices.csv +8 -0
- teradataml/data/souvenir_timeseries.csv +168 -0
- teradataml/data/sparse_iris_attribute.csv +5 -0
- teradataml/data/sparse_iris_test.csv +121 -0
- teradataml/data/sparse_iris_train.csv +601 -0
- teradataml/data/star1.csv +6 -0
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/state_transition.csv +5 -0
- teradataml/data/stock_data.csv +53 -0
- teradataml/data/stock_movement.csv +11 -0
- teradataml/data/stock_vol.csv +76 -0
- teradataml/data/stop_words.csv +8 -0
- teradataml/data/store_sales.csv +37 -0
- teradataml/data/stringsimilarity_example.json +8 -0
- teradataml/data/strsimilarity_input.csv +13 -0
- teradataml/data/students.csv +101 -0
- teradataml/data/svm_iris_input_test.csv +121 -0
- teradataml/data/svm_iris_input_train.csv +481 -0
- teradataml/data/svm_iris_model.csv +7 -0
- teradataml/data/svmdense_example.json +10 -0
- teradataml/data/svmdensepredict_example.json +19 -0
- teradataml/data/svmsparse_example.json +8 -0
- teradataml/data/svmsparsepredict_example.json +14 -0
- teradataml/data/svmsparsesummary_example.json +8 -0
- teradataml/data/target_mobile_data.csv +13 -0
- teradataml/data/target_mobile_data_dense.csv +5 -0
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/templatedata.csv +1201 -0
- teradataml/data/templates/open_source_ml.json +11 -0
- teradataml/data/teradata_icon.ico +0 -0
- teradataml/data/teradataml_example.json +1473 -0
- teradataml/data/test_classification.csv +101 -0
- teradataml/data/test_loan_prediction.csv +53 -0
- teradataml/data/test_pacf_12.csv +37 -0
- teradataml/data/test_prediction.csv +101 -0
- teradataml/data/test_regression.csv +101 -0
- teradataml/data/test_river2.csv +109 -0
- teradataml/data/text_inputs.csv +6 -0
- teradataml/data/textchunker_example.json +8 -0
- teradataml/data/textclassifier_example.json +7 -0
- teradataml/data/textclassifier_input.csv +7 -0
- teradataml/data/textclassifiertrainer_example.json +7 -0
- teradataml/data/textmorph_example.json +11 -0
- teradataml/data/textparser_example.json +15 -0
- teradataml/data/texttagger_example.json +12 -0
- teradataml/data/texttokenizer_example.json +7 -0
- teradataml/data/texttrainer_input.csv +11 -0
- teradataml/data/tf_example.json +7 -0
- teradataml/data/tfidf_example.json +14 -0
- teradataml/data/tfidf_input1.csv +201 -0
- teradataml/data/tfidf_train.csv +6 -0
- teradataml/data/time_table1.csv +535 -0
- teradataml/data/time_table2.csv +14 -0
- teradataml/data/timeseriesdata.csv +1601 -0
- teradataml/data/timeseriesdatasetsd4.csv +105 -0
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic.csv +892 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/token_table.csv +696 -0
- teradataml/data/train_multiclass.csv +101 -0
- teradataml/data/train_regression.csv +101 -0
- teradataml/data/train_regression_multiple_labels.csv +101 -0
- teradataml/data/train_tracking.csv +28 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/transformation_table.csv +6 -0
- teradataml/data/transformation_table_new.csv +2 -0
- teradataml/data/tv_spots.csv +16 -0
- teradataml/data/twod_climate_data.csv +117 -0
- teradataml/data/uaf_example.json +529 -0
- teradataml/data/univariatestatistics_example.json +9 -0
- teradataml/data/unpack_example.json +10 -0
- teradataml/data/unpivot_example.json +25 -0
- teradataml/data/unpivot_input.csv +8 -0
- teradataml/data/url_data.csv +10 -0
- teradataml/data/us_air_pass.csv +37 -0
- teradataml/data/us_population.csv +624 -0
- teradataml/data/us_states_shapes.csv +52 -0
- teradataml/data/varmax_example.json +18 -0
- teradataml/data/vectordistance_example.json +30 -0
- teradataml/data/ville_climatedata.csv +121 -0
- teradataml/data/ville_tempdata.csv +12 -0
- teradataml/data/ville_tempdata1.csv +12 -0
- teradataml/data/ville_temperature.csv +11 -0
- teradataml/data/waveletTable.csv +1605 -0
- teradataml/data/waveletTable2.csv +1605 -0
- teradataml/data/weightedmovavg_example.json +9 -0
- teradataml/data/wft_testing.csv +5 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/wine_data.csv +1600 -0
- teradataml/data/word_embed_input_table1.csv +6 -0
- teradataml/data/word_embed_input_table2.csv +5 -0
- teradataml/data/word_embed_model.csv +23 -0
- teradataml/data/words_input.csv +13 -0
- teradataml/data/xconvolve_complex_left.csv +6 -0
- teradataml/data/xconvolve_complex_leftmulti.csv +6 -0
- teradataml/data/xgboost_example.json +36 -0
- teradataml/data/xgboostpredict_example.json +32 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/__init__.py +0 -0
- teradataml/dataframe/copy_to.py +2446 -0
- teradataml/dataframe/data_transfer.py +2840 -0
- teradataml/dataframe/dataframe.py +20908 -0
- teradataml/dataframe/dataframe_utils.py +2114 -0
- teradataml/dataframe/fastload.py +794 -0
- teradataml/dataframe/functions.py +2110 -0
- teradataml/dataframe/indexer.py +424 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +1171 -0
- teradataml/dataframe/sql.py +10904 -0
- teradataml/dataframe/sql_function_parameters.py +440 -0
- teradataml/dataframe/sql_functions.py +652 -0
- teradataml/dataframe/sql_interfaces.py +220 -0
- teradataml/dataframe/vantage_function_types.py +675 -0
- teradataml/dataframe/window.py +694 -0
- teradataml/dbutils/__init__.py +3 -0
- teradataml/dbutils/dbutils.py +2871 -0
- teradataml/dbutils/filemgr.py +318 -0
- teradataml/gen_ai/__init__.py +2 -0
- teradataml/gen_ai/convAI.py +473 -0
- teradataml/geospatial/__init__.py +4 -0
- teradataml/geospatial/geodataframe.py +1105 -0
- teradataml/geospatial/geodataframecolumn.py +392 -0
- teradataml/geospatial/geometry_types.py +926 -0
- teradataml/hyperparameter_tuner/__init__.py +1 -0
- teradataml/hyperparameter_tuner/optimizer.py +4115 -0
- teradataml/hyperparameter_tuner/utils.py +303 -0
- teradataml/lib/__init__.py +0 -0
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/lib/libaed_0_1_ppc64le.so +0 -0
- teradataml/opensource/__init__.py +1 -0
- teradataml/opensource/_base.py +1321 -0
- teradataml/opensource/_class.py +464 -0
- teradataml/opensource/_constants.py +61 -0
- teradataml/opensource/_lightgbm.py +949 -0
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +267 -0
- teradataml/options/__init__.py +148 -0
- teradataml/options/configure.py +489 -0
- teradataml/options/display.py +187 -0
- teradataml/plot/__init__.py +3 -0
- teradataml/plot/axis.py +1427 -0
- teradataml/plot/constants.py +15 -0
- teradataml/plot/figure.py +431 -0
- teradataml/plot/plot.py +810 -0
- teradataml/plot/query_generator.py +83 -0
- teradataml/plot/subplot.py +216 -0
- teradataml/scriptmgmt/UserEnv.py +4273 -0
- teradataml/scriptmgmt/__init__.py +3 -0
- teradataml/scriptmgmt/lls_utils.py +2157 -0
- teradataml/sdk/README.md +79 -0
- teradataml/sdk/__init__.py +4 -0
- teradataml/sdk/_auth_modes.py +422 -0
- teradataml/sdk/_func_params.py +487 -0
- teradataml/sdk/_json_parser.py +453 -0
- teradataml/sdk/_openapi_spec_constants.py +249 -0
- teradataml/sdk/_utils.py +236 -0
- teradataml/sdk/api_client.py +900 -0
- teradataml/sdk/constants.py +62 -0
- teradataml/sdk/modelops/__init__.py +98 -0
- teradataml/sdk/modelops/_client.py +409 -0
- teradataml/sdk/modelops/_constants.py +304 -0
- teradataml/sdk/modelops/models.py +2308 -0
- teradataml/sdk/spinner.py +107 -0
- teradataml/series/__init__.py +0 -0
- teradataml/series/series.py +537 -0
- teradataml/series/series_utils.py +71 -0
- teradataml/store/__init__.py +12 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +658 -0
- teradataml/store/feature_store/feature_store.py +4814 -0
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +7330 -0
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/Apply.py +979 -0
- teradataml/table_operators/Script.py +1739 -0
- teradataml/table_operators/TableOperator.py +1343 -0
- teradataml/table_operators/__init__.py +2 -0
- teradataml/table_operators/apply_query_generator.py +262 -0
- teradataml/table_operators/query_generator.py +493 -0
- teradataml/table_operators/table_operator_query_generator.py +462 -0
- teradataml/table_operators/table_operator_util.py +726 -0
- teradataml/table_operators/templates/dataframe_apply.template +184 -0
- teradataml/table_operators/templates/dataframe_map.template +176 -0
- teradataml/table_operators/templates/dataframe_register.template +73 -0
- teradataml/table_operators/templates/dataframe_udf.template +67 -0
- teradataml/table_operators/templates/script_executor.template +170 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +53 -0
- teradataml/utils/__init__.py +0 -0
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +943 -0
- teradataml/utils/internal_buffer.py +122 -0
- teradataml/utils/print_versions.py +206 -0
- teradataml/utils/utils.py +451 -0
- teradataml/utils/validators.py +3305 -0
- teradataml-20.0.0.8.dist-info/METADATA +2804 -0
- teradataml-20.0.0.8.dist-info/RECORD +1208 -0
- teradataml-20.0.0.8.dist-info/WHEEL +5 -0
- teradataml-20.0.0.8.dist-info/top_level.txt +1 -0
- teradataml-20.0.0.8.dist-info/zip-safe +1 -0
@@ -0,0 +1,2996 @@
+import re
+from datetime import date
+from teradataml.common.exceptions import TeradataMlException
+from teradataml.common.messages import Messages, MessageCodes
+from teradataml.common.utils import UtilFuncs
+from teradataml.dataframe.dataframe import DataFrame
+from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils
+from teradataml.utils.validators import _Validators
+from teradataml.utils.dtypes import _SuppArgTypes, _DtypesMappers
+from teradatasqlalchemy.types import BIGINT, BYTEINT, CHAR, DATE, DECIMAL, FLOAT, INTEGER, NUMBER, SMALLINT, TIME, \
+    TIMESTAMP, VARCHAR
+
+
+class _Transformations(object):
+    """ Class to represent different transformation techniques. """
+
+    def __init__(self, columns=None, out_columns=None, datatype=None,
+                 columns_optional=True):
+        """
+        DESCRIPTION:
+            Constructor for _Transformations.
+            Note:
+                It is intended to be used as a super() class for transformation
+                techniques.
+
+        PARAMETERS:
+            columns:
+                Optional Argument.
+                Required when "out_columns" is used or "columns_optional" is False.
+                Specifies the names of the columns.
+                Types: str or list of str
+
+            out_columns:
+                Optional Argument.
+                Specifies the names of the output columns.
+                Notes:
+                    1. "columns" argument must be used, when this argument is used.
+                    2. Number of elements in "columns" and "out_columns" must be
+                       the same.
+                Types: str or list of str
+
+            datatype:
+                Optional Argument.
+                Specifies the name of the intended datatype of the output column.
+                Intended data types for the output column can be specified using
+                either the teradatasqlalchemy types or the permitted strings
+                mentioned below:
+                     -------------------------------------------------------------------
+                    | If intended SQL Data Type is | Permitted Value to be passed is   |
+                    |------------------------------|-----------------------------------|
+                    | bigint                       | bigint                            |
+                    | byteint                      | byteint                           |
+                    | char(n)                      | char,n                            |
+                    | date                         | date                              |
+                    | decimal(m,n)                 | decimal,m,n                       |
+                    | float                        | float                             |
+                    | integer                      | integer                           |
+                    | number(*)                    | number                            |
+                    | number(n)                    | number,n                          |
+                    | number(*,n)                  | number,*,n                        |
+                    | number(n,n)                  | number,n,n                        |
+                    | smallint                     | smallint                          |
+                    | time(p)                      | time,p                            |
+                    | timestamp(p)                 | timestamp,p                       |
+                    | varchar(n)                   | varchar,n                         |
+                     -------------------------------------------------------------------
+                Notes:
+                    1. Argument is ignored if "columns" argument is not used.
+                    2. char without a size is not supported.
+                    3. number(*) does not include the * in its datatype format.
+                Examples:
+                    1. If intended datatype for the output column is "bigint", then
+                       pass string "bigint" to the argument as shown below:
+                       datatype="bigint"
+                    2. If intended datatype for the output column is "decimal(3,5)",
+                       then pass string "decimal,3,5" to the argument as shown below:
+                       datatype="decimal,3,5"
+                Types: str, BIGINT, BYTEINT, CHAR, DATE, DECIMAL, FLOAT, INTEGER,
+                       NUMBER, SMALLINT, TIME, TIMESTAMP, VARCHAR.
+
+            columns_optional:
+                Optional Argument.
+                Specifies whether to treat the "columns" argument as required or
+                optional.
+                Default Value: True ("columns" is optional)
+                Types: bool
+
+        RETURNS:
+            An instance of the _Transformations class.
+
+        RAISES:
+            TeradataMlException, TypeError, ValueError
+
+        EXAMPLE:
+            _Transformations(columns="col1")
+        """
+        self.columns = columns
+        self.out_columns = out_columns
+        self.datatype = datatype
+
+        # Validations
+        arg_info_matrix = []
+        arg_info_matrix.append(
+            ["columns", self.columns, columns_optional, (str, list), True])
+        arg_info_matrix.append(["out_columns", self.out_columns, True, (str, list), True])
+        arg_info_matrix.append(["datatype", self.datatype, True, _SuppArgTypes.VAL_ARG_DATATYPE, True])
+
+        # Validate for missing required arguments.
+        _Validators._validate_missing_required_arguments(arg_info_matrix)
+
+        # Argument validations.
+        _Validators._validate_function_arguments(arg_info_matrix)
+
+        if self.out_columns is not None:
+            # Raise error, if "columns" is not provided and "out_columns" is provided.
+            _Validators._validate_dependent_argument("out_columns", self.out_columns, "columns", self.columns)
+
+            if len(UtilFuncs._as_list(self.out_columns)) != len(
+                    UtilFuncs._as_list(self.columns)):
+                # Raise error, if the lengths of the input and output columns differ.
+                err_ = Messages.get_message(MessageCodes.INVALID_LENGTH_ARGS,
+                                            "columns and out_columns")
+                raise TeradataMlException(err_, MessageCodes.INVALID_LENGTH_ARGS)
+
+    def _val_transformation_fmt(self):
+        """
+        DESCRIPTION:
+            Internal function to return a string representation of the basic
+            Transformation technique arguments "columns", "out_columns" and
+            "datatype" as per the SQL syntax of the function.
+            The function returns an empty string if the "columns" argument is None.
+
+        PARAMETERS:
+            None.
+
+        RETURNS:
+            String representing SQL syntax for arguments "columns", "out_columns"
+            and "datatype".
+
+        RAISES:
+            None.
+
+        EXAMPLE:
+            self._val_transformation_fmt()
+        """
+        ret_value = ""
+        if self.columns is not None:
+            self.columns = UtilFuncs._as_list(self.columns)
+            columns_fmt = "columns({})"
+
+            columns_arg_values = self.columns
+            if self.out_columns:
+                self.out_columns = UtilFuncs._as_list(self.out_columns)
+                columns_arg_values = []
+                for col, out_col in dict(zip(self.columns, self.out_columns)).items():
+                    columns_arg_values.append("{}/{}".format(col, out_col))
+
+            ret_value = columns_fmt.format(", ".join(columns_arg_values))
+
+        if self.datatype:
+            if not isinstance(self.datatype, str):
+                self.datatype = _DtypesMappers.TDSQLALCHEMY_DATATYPE_TO_VAL_STRING_MAPPER[type(self.datatype)]\
+                    (self.datatype)
+            ret_value = "{}, datatype({})".format(ret_value, self.datatype)
+        return ret_value
+
+
+class FillNa(_Transformations):
+    """ Class to represent the null replacement transformation technique. """
+
+    def __init__(self, style="mean", value=None, columns=None, out_columns=None,
+                 datatype=None):
+        """
+        DESCRIPTION:
+            FillNa allows user to perform missing value/null replacement
+            transformations.
+            Note:
+                Output of this function is passed to the "fillna" argument of the
+                "Transform" function from Vantage Analytic Library.
+
+        PARAMETERS:
+            style:
+                Optional Argument.
+                Specifies the nullstyle for missing value/null value replacement.
+                A literal value, the mean, median, mode, or an imputed value joined
+                from another table can be used as the replacement value. The median
+                value can be requested with or without averaging of the two middle
+                values when there is an even number of values.
+                Literal value replacement is supported for numeric, character, and
+                date data types.
+                Mean value replacement is supported for columns of numeric type or
+                date type.
+                Median without averaging, mode, and imputed value replacement are
+                valid for any supported type. Median with averaging is supported
+                only for numeric and date type.
+                Permitted Values: 'literal', 'mean', 'median', 'mode',
+                                  'median_wo_mean', 'imputed'
+                Default Value: 'mean'
+                Types: str
+
+            value:
+                Optional Argument. Required when "style" is 'literal' or 'imputed'.
+                Specifies the value to be used for null replacement transformations.
+                Notes:
+                    1. When "style" is 'imputed', value must be of type teradataml
+                       DataFrame.
+                    2. When "style" is 'literal', value can be of any type.
+                    3. If date values are entered as string, the keyword 'DATE' must
+                       precede the date value, and do not enclose it in single
+                       quotes, OR pass a datetime.date object.
+                       For example,
+                           value='DATE 1987-06-09'
+                           value=date(1987, 6, 9)
+                Types: teradataml DataFrame, bool, int, str, float, datetime.date
+
+            columns:
+                Optional Argument.
+                Specifies the names of the columns.
+                Types: str or list of str
+
+            out_columns:
+                Optional Argument.
+                Specifies the names of the output columns.
+                Note:
+                    Number of elements in "columns" and "out_columns" must be
+                    the same.
+                Types: str or list of str
+
+            datatype:
+                Optional Argument.
+                Specifies the name of the intended datatype of the output column.
+                Intended data types for the output column can be specified using
+                either the teradatasqlalchemy types or the permitted strings
+                mentioned below:
+                     -------------------------------------------------------------------
+                    | If intended SQL Data Type is | Permitted Value to be passed is   |
+                    |------------------------------|-----------------------------------|
+                    | bigint                       | bigint                            |
+                    | byteint                      | byteint                           |
+                    | char(n)                      | char,n                            |
+                    | date                         | date                              |
+                    | decimal(m,n)                 | decimal,m,n                       |
+                    | float                        | float                             |
+                    | integer                      | integer                           |
+                    | number(*)                    | number                            |
+                    | number(n)                    | number,n                          |
+                    | number(*,n)                  | number,*,n                        |
+                    | number(n,n)                  | number,n,n                        |
+                    | smallint                     | smallint                          |
+                    | time(p)                      | time,p                            |
+                    | timestamp(p)                 | timestamp,p                       |
+                    | varchar(n)                   | varchar,n                         |
+                     -------------------------------------------------------------------
+                Notes:
+                    1. char without a size is not supported.
+                    2. number(*) does not include the * in its datatype format.
+                Examples:
+                    1. If intended datatype for the output column is "bigint", then
+                       pass string "bigint" to the argument as shown below:
+                       datatype="bigint"
+                    2. If intended datatype for the output column is "decimal(3,5)",
+                       then pass string "decimal,3,5" to the argument as shown below:
+                       datatype="decimal,3,5"
+                Types: str, BIGINT, BYTEINT, CHAR, DATE, DECIMAL, FLOAT, INTEGER,
+                       NUMBER, SMALLINT, TIME, TIMESTAMP, VARCHAR.
+
+        RETURNS:
+            An instance of the FillNa class.
+
+        RAISES:
+            TeradataMlException, TypeError, ValueError
+
+        EXAMPLE:
+            # Note:
+            #     To run any transformation, user needs to use Transform() function
+            #     from Vantage Analytic Library.
+            #     To do so, import valib first and set the "val_install_location".
+            >>> from teradataml import configure, DataFrame, FillNa, load_example_data, valib
+            >>> configure.val_install_location = "SYSLIB"
+            >>>
+
+            # Load example data.
+            >>> load_example_data("dataframe", ["sales", "employee_info"])
+            >>>
+
+            # Create the required DataFrames.
+            >>> sales = DataFrame("sales")
+            >>> sales
+                          Feb    Jan    Mar    Apr    datetime
+            accounts
+            Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
+            Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
+            Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
+            Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
+            Red Inc     200.0  150.0  140.0    NaN  04/01/2017
+            Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
+            >>>
+
+            # Example 1: Replace missing values in columns 'Jan' and 'Mar' with a
+            #            literal value 0. Output columns are named 'january' and
+            #            'march' respectively.
+            >>> fillna_literal = FillNa(style="literal", value=0, columns=["Jan", "Mar"],
+            ...                         out_columns=["january", "march"])
+            >>> obj = valib.Transform(data=sales, fillna=fillna_literal, key_columns="accounts")
+            >>> obj.result
+                 accounts  january  march
+            0    Blue Inc       50     95
+            1  Orange Inc        0      0
+            2     Red Inc      150    140
+            3  Yellow Inc        0      0
+            4   Jones LLC      150    140
+            5    Alpha Co      200    215
+            >>>
+
+
+            # Example 2: Replace missing values in column 'Jan' with the 'median'
+            #            value from that column. The output column is named 'Jan2'.
+            >>> fillna_median = FillNa(style="median", columns="Jan", out_columns="Jan2")
+            >>> obj = valib.Transform(data=sales, fillna=fillna_median, key_columns="accounts")
+            >>> obj.result
+                 accounts     Jan2
+            0    Alpha Co  200.000
+            1     Red Inc  150.000
+            2  Orange Inc  150.000
+            3   Jones LLC  150.000
+            4  Yellow Inc  150.000
+            5    Blue Inc   50.000
+            >>>
+
+
+            # Example 3: Replace missing values in column 'Apr' with a median value
+            #            without mean from that column.
+            >>> fillna_mwm = FillNa(style="median_wo_mean", columns="Apr")
+            >>> obj = valib.Transform(data=sales, fillna=fillna_mwm, key_columns="accounts")
+            >>> obj.result
+                 accounts  Apr
+            0    Alpha Co  250
+            1    Blue Inc  101
+            2  Yellow Inc  180
+            3   Jones LLC  180
+            4     Red Inc  180
+            5  Orange Inc  250
+            >>>
+
+
+            # Example 4: Replace missing values in column 'Apr' with the 'mode'
+            #            value from that column. The output column is named
+            #            'Apr2000'.
+            >>> fillna_mode = FillNa(style="mode", columns="Apr", out_columns="Apr2000")
+            >>> obj = valib.Transform(data=sales, fillna=fillna_mode, key_columns="accounts")
+            >>> obj.result
+                 accounts  Apr2000
+            0    Blue Inc      101
+            1  Orange Inc      250
+            2     Red Inc      250
+            3  Yellow Inc      250
+            4   Jones LLC      180
+            5    Alpha Co      250
+            >>>
+
+
+            # Example 5: Replace missing values in columns 'masters' and
+            #            'programming' using the 'imputed' style.
+            >>> load_example_data("dataframe", ["admissions_train_nulls", "admissions_train"])
+
+            # DataFrame to be used for 'imputed' style replacement.
+            >>> admissions_train = DataFrame("admissions_train")
+            >>> admissions_train
+               masters   gpa     stats programming  admitted
+            id
+            22     yes  3.46    Novice    Beginner         0
+            26     yes  3.57  Advanced    Advanced         1
+            5       no  3.44    Novice      Novice         0
+            17      no  3.83  Advanced    Advanced         1
+            13      no  4.00  Advanced      Novice         1
+            19     yes  1.98  Advanced    Advanced         0
+            36      no  3.00  Advanced      Novice         0
+            15     yes  4.00  Advanced    Advanced         1
+            34     yes  3.85  Advanced    Beginner         0
+            38     yes  2.65  Advanced    Beginner         1
+            >>>
+
+            # DataFrame containing NULL values in columns "programming" and "masters".
+            >>> admissions_train_nulls = DataFrame("admissions_train_nulls")
+            >>> admissions_train_nulls
+               masters   gpa     stats programming  admitted
+            id
+            5       no  3.44    Novice      Novice         0
+            7      yes  2.33    Novice      Novice         1
+            22    None  3.46    Novice        None         0
+            19     yes  1.98  Advanced    Advanced         0
+            15    None  4.00  Advanced    Advanced         1
+            17    None  3.83  Advanced    Advanced         1
+            34    None  3.85  Advanced    Beginner         0
+            13      no  4.00  Advanced      Novice         1
+            36      no  3.00  Advanced      Novice         0
+            40     yes  3.95    Novice    Beginner         0
+            >>>
+
+            # Replace NULL values in the columns "masters" and "programming" in the
+            # admissions_train_nulls dataframe with the corresponding columns'
+            # values in the admissions_train dataframe.
+            >>> fillna_imputed = FillNa(style="imputed",
+            ...                         columns=["masters", "programming"],
+            ...                         value=admissions_train)
+            >>> obj = valib.Transform(data=admissions_train_nulls,
+            ...                       fillna=fillna_imputed,
+            ...                       key_columns="id")
+            >>> obj.result
+               id masters programming
+            0  22     yes    Beginner
+            1  36      no      Novice
+            2  15     yes    Advanced
+            3  38     yes    Beginner
+            4   5      no      Novice
+            5  17      no    Advanced
+            6  34     yes    Beginner
+            7  13      no      Novice
+            8  26     yes    Advanced
+            9  19     yes    Advanced
+            >>>
+
+
+            # Example 6: This example shows how one can operate on date and
+            #            character columns. It also showcases using multiple
+            #            missing value treatment techniques in a single call for
+            #            variable transformation.
+            # Create the required DataFrames.
+            >>> einfo = DataFrame("employee_info")
+            >>> einfo
+                        first_name marks   dob joined_date
+            employee_no
+            100               abcd  None  None        None
+            112               None  None  None    18/12/05
+            101              abcde  None  None    02/12/05
+            >>>
+
+            # Using literal style for missing value treatment on a date type
+            # column "joined_date".
+            >>> fillna_1 = FillNa(style="literal", value="DATE 1995-12-23",
+            ...                   columns="joined_date", out_columns="date1")
+
+            # Using literal style for missing value treatment on a character type
+            # column "first_name". Replace missing values with 'FNU', i.e.,
+            # First Name Unknown.
+            >>> fillna_2 = FillNa(style="literal", value="FNU", columns="first_name",
+            ...                   out_columns="char1")
+
+            # Using mean value for missing value treatment on a date type
+            # column "joined_date".
+            >>> fillna_3 = FillNa(style="mean", columns="joined_date",
+            ...                   out_columns="date2")
+
+            # Using median value for missing value treatment on a date type
+            # column "joined_date".
+            >>> fillna_4 = FillNa(style="median", columns="joined_date",
+            ...                   out_columns="date2A")
+
+            # Using median value without mean for missing value treatment on
+            # a date type column "joined_date".
+            >>> fillna_5 = FillNa(style="median_wo_mean", columns="joined_date",
+            ...                   out_columns="date3")
+
+            # Using mode value for missing value treatment on a date type
+            # column "joined_date".
+            >>> fillna_6 = FillNa(style="mode", columns="joined_date",
+            ...                   out_columns="date4")
+
+            # Using median value without mean for missing value treatment on
+            # a character type column "first_name".
+            >>> fillna_7 = FillNa(style="median_wo_mean", columns="first_name",
+            ...                   out_columns="char2")
+
+            # Using mode value for missing value treatment on a character type
+            # column "first_name".
+            >>> fillna_8 = FillNa(style="mode", columns="first_name",
+            ...                   out_columns="char3")
+
+            # Perform the missing value transformations using the Transform()
+            # function from Vantage Analytic Library.
+            >>> obj = valib.Transform(data=einfo,
+            ...                       fillna=[fillna_1, fillna_2, fillna_3, fillna_4,
+            ...                               fillna_5, fillna_6, fillna_7, fillna_8],
+            ...                       key_columns="employee_no")
+            >>> obj.result
+               employee_no     date1  char1     date2    date2A     date3     date4  char2  char3
+            0          112  18/12/05    FNU  18/12/05  18/12/05  18/12/05  18/12/05   abcd   abcd
+            1          101  02/12/05  abcde  02/12/05  02/12/05  02/12/05  02/12/05  abcde  abcde
+            2          100  95/12/23   abcd  60/12/04  60/12/04  02/12/05  02/12/05   abcd   abcd
+            >>>
+        """
+        # Call super()
+        super().__init__(columns=columns, out_columns=out_columns, datatype=datatype, columns_optional=True)
+        # Initialize style and value as data members.
+        self.style = style
+        self.value = value
+
+        # Validations
+        arg_info_matrix = []
+        permitted_styles = ["LITERAL", "MEAN", "MEDIAN", "MEDIAN_WO_MEAN", "MODE",
+                            "IMPUTED"]
+        arg_info_matrix.append(["style", self.style, True, str, True, permitted_styles])
+        arg_info_matrix.append(["value", self.value, True, (DataFrame, bool, int, float, str, date)])
+
+        # Note:
+        #   Validations for "columns", "out_columns" and "datatype" are done by super().
+        # Other argument validations.
+        _Validators._validate_function_arguments(arg_info_matrix)
+
+        # If a date object is passed to the "value" argument,
+        # then convert the date object to string format 'DATE 1978-06-09'.
+        if isinstance(self.value, date):
+            self.value = UtilFuncs._convert_date_to_string(self.value)
+
+        # If style is 'MEDIAN_WO_MEAN', in SQL we will use 'medianwithoutaveraging'.
+        if self.style.upper() == "MEDIAN_WO_MEAN":
+            self.style = "medianwithoutaveraging"
+
+        # "value" should be provided when "style" is 'literal' or 'imputed'.
+        # "value" is ignored when style is other than 'literal' or 'imputed'.
+        if self.style.upper() in ["LITERAL", "IMPUTED"] and value is None:
+            err_ = Messages.get_message(MessageCodes.DEPENDENT_ARG_MISSING, "value",
+                                        "style={}".format(self.style))
+            raise TeradataMlException(err_, MessageCodes.DEPENDENT_ARG_MISSING)
+
+        if self.style.upper() == "IMPUTED":
+            if not isinstance(value, DataFrame):
+                err_ = Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE, "value",
+                                            "teradataml DataFrame")
+                raise TypeError(err_)
+            else:
+                if value._table_name is None:
+                    value._table_name = df_utils._execute_node_return_db_object_name(
+                        value._nodeid, value._metaexpr)
+                self.value = UtilFuncs._extract_table_name(value._table_name).replace(
+                    "\"", "")
+
+        # Add double single quotes when "value" contains any of the special VAL symbols.
+        # Note:
+        #   The special VAL symbols are {'{', '}', '(', ')', ',', '/'}.
+        if isinstance(self.value, str) and len(re.findall(r'[\{\}\(\),/]', self.value)) > 0:
+            self.value = """''{}''""".format(self.value)
+
+    def _val_nullstyle_fmt(self):
+        """
+        DESCRIPTION:
+            Internal function to return a string representation of the nullstyle
+            Transformation technique.
+
+        PARAMETERS:
+            None.
+
+        RETURNS:
+            String representing nullstyle SQL syntax.
+
+        RAISES:
+            None.
+
+        EXAMPLE:
+            self._val_nullstyle_fmt()
+        """
+        nullstyle_fmt = "nullstyle({})"
+
+        nullstyle_args = self.style.lower()
+        if self.style.upper() in ["LITERAL", "IMPUTED"]:
+            nullstyle_args = "{}, {}".format(self.style.lower(), '\'\'\'\'' if isinstance(self.value, str) and len(
+                self.value) == 0 else self.value)
+        return nullstyle_fmt.format(nullstyle_args)
+
+    def _val_sql_syntax(self):
+        """
+        DESCRIPTION:
+            Internal function to return a string representation of the null
+            replacement Transformation as required by SQL.
+
+        PARAMETERS:
+            None.
+
+        RETURNS:
+            String representing SQL syntax for the 'nullreplacement' SQL argument.
+
+        RAISES:
+            None.
+
+        EXAMPLE:
+            self._val_sql_syntax()
+        """
+        ret_value = self._val_nullstyle_fmt()
+        columns_fmt = self._val_transformation_fmt()
+        if columns_fmt:
+            ret_value = "{}, {}".format(ret_value, columns_fmt)
+
+        return "{" + ret_value + "}"
+
+class Binning(_Transformations):
+    """ Class to represent the binning transformation technique. """
+
+    def __init__(self, columns, style="bins", value=10, lbound=None, ubound=None,
+                 out_columns=None, datatype=None, fillna=None, **kwargs):
+        """
+        DESCRIPTION:
+            Binning allows user to perform bin coding to replace a continuous
+            numeric column with a categorical one to produce ordinal values
+            (for example, numeric categorical values where order is meaningful).
+            Binning uses the same techniques used in Histogram analysis, allowing
+            you to choose between:
+                1. equal-width bins,
+                2. equal-width bins with a user-specified minimum and maximum range,
+                3. bins with a user-specified width,
+                4. evenly distributed bins, or
+                5. bins with user-specified boundaries.
+
+            If the minimum and maximum are specified, all values less than the
+            minimum are put into bin 0, while all values greater than the maximum
+            are put into bin N+1. The same is true when the boundary option is
+            specified.
+
+            Bin Coding supports numeric and date type columns. If date values are
+            entered, the keyword DATE must precede the date value, and do not
+            enclose it in single quotes.
+
+            Note:
+                Output of this function is passed to the "bins" argument of the
+                "Transform" function from Vantage Analytic Library.
+
+        PARAMETERS:
+            columns:
+                Required Argument.
+                Specifies the names of the columns to perform transformation on.
+                Types: str or list of str
+
+            style:
+                Optional Argument.
+                Specifies the bin style to use.
+                Permitted Values:
+                    * "bins":
+                        This style allows user to specify equal-width bins without
+                        any boundaries. Argument "value" must be used when this
+                        style of binning is used.
+                    * "binswithboundaries":
+                        This style allows user to specify equal-width bins with a
+                        minimum and maximum range. Arguments "value", "lbound" and
+                        "ubound" must be used when this style of binning is used.
+                        All values less than the minimum are put into bin 0, while
+                        all values greater than the maximum are put into bin N+1.
+                    * "boundaries":
+                        This style allows user to specify bins with boundaries.
+                        To specify boundaries one should use keyword arguments as:
+                            b1 --> To specify first boundary.
+                            b2 --> To specify second boundary.
+                            b3 --> To specify third boundary.
+                            ...
+                            bN --> To specify Nth boundary.
+                        All values less than the first boundary value are put into
+                        bin 0, while all values greater than the last boundary
+                        value are put into bin N+1.
+                        See "kwargs" description below for more details on how
+                        these arguments must be used.
+                    * "quantiles":
+                        This style allows user to specify evenly-distributed bins.
+                        Argument "value" must be used when this style of binning
+                        is used.
+                    * "width":
+                        This style allows user to specify bins with widths.
+                        Argument "value" must be used when this style of binning
+                        is used.
+                Default Value: 'bins'
+                Types: str
+
+            value:
+                Optional Argument.
+                Specifies the value to be used for bin code transformations.
+                When bin style is:
+                    * 'bins' or 'binswithboundaries', argument specifies the number
+                      of bins.
+                    * 'quantiles', argument specifies the number of quantiles.
+                    * 'width', argument specifies the bin width.
+                Note:
+                    Ignored when style is 'boundaries'.
+                Default Value: 10
+                Types: int
+
+            lbound:
+                Optional Argument.
+                Specifies the minimum boundary value for 'binswithboundaries' style.
+                Notes:
+                    1. Ignored when style is not 'binswithboundaries'.
+                    2. If date values are entered as string, the keyword 'DATE' must
+                       precede the date value, and do not enclose it in single
+                       quotes, OR pass a datetime.date object.
+                       For example,
+                           value='DATE 1987-06-09'
+                           value=date(1987, 6, 9)
+                Types: int, float, str, datetime.date
+
+            ubound:
+                Optional Argument.
+                Specifies the maximum boundary value for 'binswithboundaries' style.
+                Notes:
+                    1. Ignored when style is not 'binswithboundaries'.
+                    2. If date values are entered as string, the keyword 'DATE' must
+                       precede the date value, and do not enclose it in single
+                       quotes, OR pass a datetime.date object.
+                       For example,
+                           value='DATE 1987-06-09'
+                           value=date(1987, 6, 9)
+                Types: int, float, str, datetime.date
+
+            out_columns:
+                Optional Argument.
+                Specifies the names of the output columns.
+                Note:
+                    Number of elements in "columns" and "out_columns" must be
+                    the same.
+                Types: str or list of str
+
+            datatype:
+                Optional Argument.
+                Specifies the name of the intended datatype of the output column.
+                Intended data types for the output column can be specified using
+                either the teradatasqlalchemy types or the permitted strings
+                mentioned below:
+                     -------------------------------------------------------------------
+                    | If intended SQL Data Type is | Permitted Value to be passed is   |
+                    |------------------------------|-----------------------------------|
+                    | bigint                       | bigint                            |
+                    | byteint                      | byteint                           |
+                    | char(n)                      | char,n                            |
+                    | date                         | date                              |
+                    | decimal(m,n)                 | decimal,m,n                       |
+                    | float                        | float                             |
+                    | integer                      | integer                           |
+                    | number(*)                    | number                            |
+                    | number(n)                    | number,n                          |
+                    | number(*,n)                  | number,*,n                        |
+                    | number(n,n)                  | number,n,n                        |
+                    | smallint                     | smallint                          |
+                    | time(p)                      | time,p                            |
+                    | timestamp(p)                 | timestamp,p                       |
+                    | varchar(n)                   | varchar,n                         |
+                     -------------------------------------------------------------------
+                Notes:
+                    1. Argument is ignored if "columns" argument is not used.
+                    2. char without a size is not supported.
+                    3. number(*) does not include the * in its datatype format.
+                Examples:
+                    1. If intended datatype for the output column is "bigint", then
+                       pass string "bigint" to the argument as shown below:
+                       datatype="bigint"
+                    2. If intended datatype for the output column is "decimal(3,5)",
+                       then pass string "decimal,3,5" to the argument as shown below:
+                       datatype="decimal,3,5"
+                Types: str, BIGINT, BYTEINT, CHAR, DATE, DECIMAL, FLOAT, INTEGER,
+                       NUMBER, SMALLINT, TIME, TIMESTAMP, VARCHAR.
+
+            fillna:
+                Optional Argument.
+                Specifies whether the null replacement/missing value treatment
+                should be performed with binning or not. Output of FillNa() can be
+                passed to this argument.
+                Note:
+                    If the FillNa object is created with its arguments "columns",
+                    "out_columns" and "datatype", then values passed in FillNa()
+                    arguments are ignored. Only nullstyle information is captured
+                    from the same.
+                Types: FillNa
+
+            kwargs:
+                Specifies the keyword arguments to provide the boundaries required
+                for binning with bin style 'boundaries'.
+                To specify boundaries one should use keyword arguments as:
+                    b1 --> To specify first boundary.
+                    b2 --> To specify second boundary.
+                    b3 --> To specify third boundary.
+                    ...
+                    bN --> To specify Nth boundary.
+                Notes:
+                    1. When keyword arguments are used, make sure to specify
+                       boundaries in sequence, i.e., b1, b2, b3, ...
+                       In case a sequential keyword argument is missing, an error
+                       is raised.
+                    2. Keyword arguments specified for the boundaries must start
+                       with 'b'.
+                    3. First boundary must always be specified with "b1" argument.
+                Types: int, float, str, datetime.date
+
+        RETURNS:
+            An instance of the Binning class.
+
+        RAISES:
+            TeradataMlException, TypeError, ValueError
+
+        EXAMPLE:
+            # Note:
+            #     To run any transformation, user needs to use Transform() function
+            #     from Vantage Analytic Library.
+            #     To do so, import valib first and set the "val_install_location".
+            >>> from teradataml import configure, DataFrame, Binning, FillNa, load_example_data, valib
+            >>> configure.val_install_location = "SYSLIB"
+            >>>
+
+            # Load example data.
+            >>> load_example_data("movavg", "ibm_stock")
+            >>>
+
+            # Create the required teradataml DataFrame.
+            >>> ibm_stock = DataFrame.from_table("ibm_stock")
+            >>> ibm_stock
+                name    period  stockprice
+            id
+            183  ibm  62/02/07       552.0
+            202  ibm  62/03/07       548.0
+            181  ibm  62/02/05       551.0
+            242  ibm  62/05/02       482.0
+            364  ibm  62/10/25       331.0
+            221  ibm  62/04/03       513.0
+            38   ibm  61/07/11       473.0
+            366  ibm  62/10/29       352.0
+            326  ibm  62/08/30       387.0
+            61   ibm  61/08/11       497.0
+            >>>
+
+            # Example 1: Binning is carried out with "bins" style, i.e. equal-width
+            #            binning, with 5 bins. Null replacement is also combined
+            #            with binning.
+            #            "key_columns" argument must be used with Transform()
+            #            function, when null replacement is being done.
+            >>> fn = FillNa(style="literal", value=0)
+            >>> bins = Binning(style="bins", value=5, columns="stockprice", fillna=fn)
+
+            # Execute Transform() function.
+            >>> obj = valib.Transform(data=ibm_stock,
+            ...                       bins=bins,
+            ...                       key_columns="id")
+            >>> obj.result
+                id  stockprice
+            0  263           1
+            1  324           2
+            2  303           2
+            3   99           5
+            4   36           3
+            5   97           5
+            6  160           5
+            7   59           4
+            8   19           4
+            9  122           5
+            >>>
+
+
+            # Example 2: Binning is carried out with multiple styles.
+
+            # 'binswithboundaries' style:
+            # Equal-width bins with a user-specified minimum and maximum range on
+            # the 'period' column. Resultant output returns the column with the
+            # same name. Number of bins created is 5.
+            >>> bins_1 = Binning(style="binswithboundaries",
+            ...                  value=5,
+            ...                  lbound="DATE 1962-01-01",
+            ...                  ubound="DATE 1962-06-01",
+            ...                  columns="period")
+            >>>
+
+            # 'boundaries' style:
+            # Bins created with user-specified boundaries on the 'period' column.
+            # Resultant column is named 'period2'. Three boundaries are specified
+            # with arguments "b1", "b2" and "b3". When using this style, keyword
+            # argument names must start with 'b' and they should be in sequence
+            # b1, b2, ..., bN.
+            >>> bins_2 = Binning(style="boundaries",
+            ...                  b1="DATE 1962-01-01",
+            ...                  b2="DATE 1962-06-01",
+            ...                  b3="DATE 1962-12-31",
+            ...                  columns="period",
+            ...                  out_columns="period2")
+            >>>
+
+            # Execute Transform() function.
+            >>> obj = valib.Transform(data=ibm_stock,
+            ...                       bins=[bins_1, bins_2])
+            >>> obj.result
+                id  period  period2
+            0  223       4        1
+            1  345       6        2
+            2  120       0        0
+            3  343       6        2
+            4   57       0        0
+            5  118       0        0
+            6  200       3        1
+            7   80       0        0
+            8  162       1        1
+            9   40       0        0
+            >>>
+
+
+            # Example 3: Binning is carried out with multiple styles 'quantiles'
+            #            and 'width'.
+
+            # 'quantiles' style:
+            # Evenly distributed bins on the 'stockprice' column. Resultant output
+            # returns the column with name 'stockprice_q'. Number of quantiles
+            # considered here is 4.
+            >>> bins_1 = Binning(style="quantiles",
+            ...                  value=4,
+            ...                  out_columns="stockprice_q",
+            ...                  columns="stockprice")
+            >>>
+
+            # 'width' style:
+            # Bins with user-specified width on the 'stockprice' column. Resultant
+            # output returns the column with name 'stockprice_w'. Width considered
+            # for binning is 5.
+            >>> bins_2 = Binning(style="width",
+            ...                  value=5,
+            ...                  out_columns="stockprice_w",
+            ...                  columns="stockprice")
+            >>>
+
+            # Execute Transform() function.
+            >>> obj = valib.Transform(data=ibm_stock,
+            ...                       bins=[bins_1, bins_2])
+            >>> obj.result
+                id  stockprice_q  stockprice_w
+            0  183             4            50
+            1  202             3            49
+            2  181             4            50
+            3  242             2            36
+            4  364             1             6
+            5  221             3            42
+            6   38             2            34
+            7  366             1            10
+            8  326             1            17
+            9   61             3            39
+            >>>
+        """
+        # Call super()
+        super().__init__(columns=columns, out_columns=out_columns, datatype=datatype,
+                         columns_optional=False)
+
+        # Initialize style and value as data members.
+        self.style = style
+        self.value = value
+        self.lbound = lbound
+        self.ubound = ubound
+        self.fillna = fillna
+        self.kwargs = kwargs
+
+        # Validations
+        arg_info_matrix = []
+        permitted_styles = ["BINS", "BINSWITHBOUNDARIES", "BOUNDARIES", "QUANTILES",
+                            "WIDTH"]
+        arg_info_matrix.append(["style", self.style, True, str, True, permitted_styles])
+        arg_info_matrix.append(["value", self.value, True, int])
+        arg_info_matrix.append(["lbound", self.lbound, True, (bool, int, float, str, date)])
+        arg_info_matrix.append(["ubound", self.ubound, True, (bool, int, float, str, date)])
+        arg_info_matrix.append(["fillna", self.fillna, True, FillNa])
+        # Note:
+        #   Validations for "columns", "out_columns" and "datatype" are done by super().
+        # Other argument validations.
+        _Validators._validate_function_arguments(arg_info_matrix)
+
+        # "value" should be provided when "style" is 'bins', 'quantiles' or 'width'.
+        if self.style.upper() in ["BINS", "QUANTILES", "WIDTH"] and self.value is None:
+            err_ = Messages.get_message(MessageCodes.DEPENDENT_ARG_MISSING, "value",
+                                        "style={}".format(self.style))
+            raise TeradataMlException(err_, MessageCodes.DEPENDENT_ARG_MISSING)
+
+        # "value", "lbound" and "ubound" should be provided when "style" is
+        # 'binswithboundaries'.
+        if self.style.upper() == "BINSWITHBOUNDARIES":
+            if self.value is None:
+                err_ = Messages.get_message(MessageCodes.DEPENDENT_ARG_MISSING, "value",
+                                            "style={}".format(self.style))
+                raise TeradataMlException(err_, MessageCodes.DEPENDENT_ARG_MISSING)
+
+            if self.lbound is None:
+                err_ = Messages.get_message(MessageCodes.DEPENDENT_ARG_MISSING, "lbound",
+                                            "style={}".format(self.style))
+                raise TeradataMlException(err_, MessageCodes.DEPENDENT_ARG_MISSING)
+
+            if self.ubound is None:
+                err_ = Messages.get_message(MessageCodes.DEPENDENT_ARG_MISSING, "ubound",
+                                            "style={}".format(self.style))
+                raise TeradataMlException(err_, MessageCodes.DEPENDENT_ARG_MISSING)
+
+        if self.style.upper() == "BOUNDARIES":
+            # Parse kwargs now for the "boundaries" style argument.
+            # Expected arguments are b1, b2, ..., bN.
+            # We start extracting each boundary argument one by one and store
+            # its corresponding value so that it can be used later to generate
+            # the correct binstyle syntax.
+            parse_kwargs = True
+            key_num = 1
+            self.__boundaries = []
+            while parse_kwargs:
+                value = self.kwargs.pop("b{}".format(str(key_num)), None)
+                kwarg_info_matrix = [["b{}".format(str(key_num)), value,
+                                      True, (int, float, str, date)]]
+
+                _Validators._validate_function_arguments(kwarg_info_matrix)
+
+                key_num = key_num + 1
+                if value is None:
+                    parse_kwargs = False
+                else:
+                    if isinstance(value, date):
+                        value = UtilFuncs._convert_date_to_string(value)
+                    self.__boundaries.append(value)
+
+            # If kwargs still has some extra arguments, that means the user has
+            # passed an incorrect argument.
+            if len(kwargs) != 0:
+                err_ = "Boundary keyword arguments for \"boundaries\" binning style " \
+                       "must be in sequence as b1, b2, ..., bN. Found: " \
+                       "{}".format(list(kwargs.keys()))
+                raise TypeError(err_)
+
+            # After parsing kwargs, if the length of self.__boundaries is 0,
+            # then raise an error, as boundary values are missing for
+            # this binning style.
+            if len(self.__boundaries) == 0:
+                err_ = Messages.get_message(MessageCodes.DEPENDENT_ARG_MISSING,
+                                            "b1, b2, ..., bN",
+                                            "style={}".format(self.style))
+                raise TeradataMlException(err_, MessageCodes.DEPENDENT_ARG_MISSING)
+
+    def _val_sql_syntax(self):
+        """
+        DESCRIPTION:
+            Internal function to return a string representation of the binning
+            Transformation as required by SQL.
+
+        PARAMETERS:
+            None.
+
+        RETURNS:
+            String representing SQL syntax for the 'bincode' SQL argument.
+
+        RAISES:
+            None.
+
+        EXAMPLE:
+            self._val_sql_syntax()
+        """
+        # Generate and add syntax for the "binstyle" SQL argument.
+        if self.style.upper() in ["BINS", "QUANTILES", "WIDTH"]:
+            binstyle_arg2 = self.value
+        elif self.style.upper() == "BINSWITHBOUNDARIES":
+            # If 'lbound' is provided as a date object, convert it to str format.
+            if isinstance(self.lbound, date):
+                self.lbound = UtilFuncs._convert_date_to_string(self.lbound)
+
+            # If 'ubound' is provided as a date object, convert it to str format.
+            if isinstance(self.ubound, date):
+                self.ubound = UtilFuncs._convert_date_to_string(self.ubound)
+
+            binstyle_arg2 = "{}, {}, {}".format(self.value, self.lbound, self.ubound)
+        else:
+            binstyle_arg2 = ", ".join([str(v) for v in self.__boundaries])
+
+        ret_value = "binstyle({}, {})".format(self.style.lower(), binstyle_arg2)
+
+        # Generate and add syntax for the "columns" and "datatype" SQL arguments.
+        columns_fmt = self._val_transformation_fmt()
+        ret_value = "{}, {}".format(ret_value, columns_fmt)
+
+        if self.fillna:
+            ret_value = "{}, {}".format(ret_value, self.fillna._val_nullstyle_fmt())
+
+        return "{" + ret_value + "}"
+
+
1055
|
+
class Derive(object):
    """ Class to represent derive transformation technique. """

    def __init__(self, formula, columns, out_column, datatype=None, fillna=None):
        """
        DESCRIPTION:
            The Derive transformation requires the free-form transformation to be
            specified as a formula using the following operators, arguments, and
            functions:
                +, -, **, *, /, %, (, ), x, y, z, abs, exp, ln, log, sqrt
            The arguments x, y, and z can only assume the value of an input column.
            An implied multiply operator is automatically inserted when a number,
            argument (x, y, z), or parenthesis is immediately followed by an
            argument or parenthesis.
            For example,
                4x means 4*x, xy means x*y, and x(x+1) is equivalent to x*(x+1).

            An example formula for the quadratic equation is below.
                formula="(-y+sqrt(y**2-4xz))/(2x)"

            Note:
                Output of this function is passed to "derive" argument of "Transform"
                function from Vantage Analytic Library.

        PARAMETERS:
            formula:
                Required Argument.
                Specifies the free-form transformation required for Derive.
                The arithmetic formula can be specified as a string using the
                following operators, arguments, and functions:
                    +, -, **, *, /, %, (, ), x, y, z, abs, exp, ln, log, sqrt
                Types: str

            columns:
                Required Argument.
                Specifies the names of the columns to use for the formula.
                Types: str or list of str

            out_column:
                Required Argument.
                Specifies the name of the output column.
                Types: str

            datatype:
                Optional Argument.
                Specifies the name of the intended datatype of the output column.
                Intended data types for the output column can be specified using either the
                teradatasqlalchemy types or the permitted strings mentioned below:
                -------------------------------------------------------------------
                | If intended SQL Data Type is | Permitted Value to be passed is  |
                |------------------------------------------------------------------|
                | bigint                       | bigint                           |
                | byteint                      | byteint                          |
                | char(n)                      | char,n                           |
                | date                         | date                             |
                | decimal(m,n)                 | decimal,m,n                      |
                | float                        | float                            |
                | integer                      | integer                          |
                | number(*)                    | number                           |
                | number(n)                    | number,n                         |
                | number(*,n)                  | number,*,n                       |
                | number(n,n)                  | number,n,n                       |
                | smallint                     | smallint                         |
                | time(p)                      | time,p                           |
                | timestamp(p)                 | timestamp,p                      |
                | varchar(n)                   | varchar,n                        |
                --------------------------------------------------------------------
                Notes:
                    1. Argument is ignored if "columns" argument is not used.
                    2. char without a size is not supported.
                    3. number(*) does not include the * in its datatype format.
                Examples:
                    1. If intended datatype for the output column is "bigint", then
                       pass string "bigint" to the argument as shown below:
                           datatype="bigint"
                    2. If intended datatype for the output column is "decimal(3,5)", then
                       pass string "decimal,3,5" to the argument as shown below:
                           datatype="decimal,3,5"
                Types: str, BIGINT, BYTEINT, CHAR, DATE, DECIMAL, FLOAT, INTEGER, NUMBER,
                       SMALLINT, TIME, TIMESTAMP, VARCHAR.

            fillna:
                Optional Argument.
                Specifies whether the null replacement/missing value treatment should
                be performed with derive or not. Output of FillNa() can be passed to
                this argument.
                Note:
                    If the FillNa object is created with its arguments "columns",
                    "out_columns" and "datatype", then values passed in FillNa() arguments
                    are ignored. Only nullstyle information is captured from the same.
                Types: FillNa

        RETURNS:
            An instance of Derive class.

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLE:
            # Note:
            # To run any transformation, user needs to use Transform() function from
            # Vantage Analytic Library.
            # To do so import valib first and set the "val_install_location".
            >>> from teradataml import configure, DataFrame, Derive, FillNa, load_example_data, valib
            >>> configure.val_install_location = "SYSLIB"
            >>>

            # Load example data.
            >>> load_example_data("dataframe", "sales")
            >>>

            # Create the required DataFrame.
            >>> sales = DataFrame("sales")
            >>> sales
                          Feb    Jan    Mar    Apr    datetime
            accounts
            Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
            Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
            Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
            Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
            Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
            Red Inc     200.0  150.0  140.0    NaN  04/01/2017
            >>>

            # Example: Includes multiple derive transformations.
            # Derive transformation 1 is done with 3 variables, x, y, z, to calculate
            # the total sales for the first quarter for each account.
            >>> fn_1 = FillNa(style='literal', value=0)
            >>> dr_1 = Derive(formula="x+y+z", columns=["Jan", "Feb", "Mar"],
            ...               out_column="q1_sales", fillna=fn_1)
            >>>

            # Derive transformation 2 is done with 2 variables, x, y, to calculate
            # the sales growth from the month of Jan to Feb.
            >>> fn_2 = FillNa(style='median')
            >>> dr_2 = Derive(formula="((y-x)/x)*100", columns=["Jan", "Feb"],
            ...               out_column="feb_growth", fillna=fn_2, datatype='bigint')
            >>>

            # Execute Transform() function.
            >>> obj = valib.Transform(data=sales, derive=[dr_1, dr_2], key_columns="accounts")
            >>> obj.result
                 accounts  q1_sales  feb_growth
            0    Alpha Co     625.0           4
            1     Red Inc     490.0          33
            2  Orange Inc       NaN          40
            3   Jones LLC     490.0          33
            4  Yellow Inc       NaN         -40
            5    Blue Inc     235.0          79
            >>>
        """
        # Initialize arguments as data members.
        self.formula = formula
        self.columns = columns
        self.out_column = out_column
        self.datatype = datatype
        self.fillna = fillna

        # Validations
        arg_info_matrix = []
        arg_info_matrix.append(["formula", self.formula, False, str, True])
        arg_info_matrix.append(["columns", self.columns, False, (str, list), True])
        arg_info_matrix.append(["out_column", self.out_column, False, str, True])
        arg_info_matrix.append(["datatype", self.datatype, True, _SuppArgTypes.VAL_ARG_DATATYPE, True])
        arg_info_matrix.append(["fillna", self.fillna, True, FillNa])

        _Validators._validate_function_arguments(arg_info_matrix)

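    # Note: each row of arg_info_matrix above appears to follow the layout
    #     [arg_name, arg_value, is_optional, allowed_types, ...optional flags]
    # (compare OneHotEncoder.__init__ below, which also appends a list of
    # permitted values). This reading is inferred from usage in this module
    # and is an assumption about _Validators._validate_function_arguments(),
    # not a documented contract.
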
    def _val_sql_syntax(self):
        """
        DESCRIPTION:
            Internal function to return a string representation of derive
            Transformation as required by SQL.

        PARAMETERS:
            None.

        RETURNS:
            String representing SQL syntax for 'derive' SQL argument.

        RAISES:
            None.

        EXAMPLE:
            self._val_sql_syntax()
        """
        derive_fmt = "formula(''{}''), arguments({}), outputname({})"
        arguments = ", ".join(UtilFuncs._as_list(self.columns))
        ret_value = derive_fmt.format(self.formula, arguments, self.out_column)

        # Generate and add syntax for "datatype" SQL argument.
        if self.datatype is not None:
            self.datatype = self.datatype if isinstance(self.datatype, str) else \
                _DtypesMappers.TDSQLALCHEMY_DATATYPE_TO_VAL_STRING_MAPPER\
                    [type(self.datatype)](self.datatype)
            ret_value = "{}, datatype({})".format(ret_value, self.datatype)

        # Generate and add syntax for "nullstyle", a SQL argument.
        if self.fillna:
            ret_value = "{}, {}".format(ret_value, self.fillna._val_nullstyle_fmt())

        # Return the SQL syntax for "derive", a SQL argument.
        return "{" + ret_value + "}"

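# A minimal sketch of the output of Derive._val_sql_syntax(), derived from the
# "derive_fmt" format string above; it assumes UtilFuncs._as_list() simply
# wraps a single str into a list:
#
#     dr = Derive(formula="x+y+z", columns=["Jan", "Feb", "Mar"],
#                 out_column="q1_sales")
#     dr._val_sql_syntax()
#     # -> "{formula(''x+y+z''), arguments(Jan, Feb, Mar), outputname(q1_sales)}"
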
class OneHotEncoder(_Transformations):
    """ Class to represent one hot encoding transformation technique. """

    def __init__(self, values, columns, style="dummy", reference_value=None,
                 out_columns=None, datatype=None, fillna=None):
        """
        DESCRIPTION:
            One hot encoding is useful when a categorical data element must be
            re-expressed as one or more numeric data elements, creating a binary
            numeric field for each categorical data value. One hot encoding supports
            character, numeric, and date type columns.
            One hot encoding is offered in two forms: dummy-coding and contrast-coding.
            * In dummy-coding, a new column is produced for each listed value, with
              a value of 0 or 1 depending on whether that value is assumed by the
              original column. If a column assumes n values, new columns can be
              created for all n values (or for only n-1 values, because the nth
              column is perfectly correlated with the first n-1 columns).
            * Alternately, given a list of values to contrast-code along with a
              reference value, a new column is produced for each listed value, with
              a value of 0 or 1 depending on whether that value is assumed by the
              original column, or a value of -1 if that original value is equal to
              the reference value.

            Note:
                Output of this function is passed to "one_hot_encode" argument of
                "Transform" function from Vantage Analytic Library.

        PARAMETERS:
            values:
                Required Argument.
                Specifies the values to code and optionally the name of the
                resulting output column.
                Notes:
                    1. If date values are entered as string, the keyword 'DATE' must
                       precede the date value, and do not enclose it in single
                       quotes, OR pass a datetime.date object.
                       For example,
                           value='DATE 1987-06-09'
                           value=date(1987, 6, 9)
                    2. Use a dict to pass the value when the resulting output column
                       is to be named. The key of the dictionary must be the value to
                       code and the value must be either None, in case the resulting
                       output column is not to be named, or a string if it is to be
                       named.
                       For example,
                           values = {"Male": "M", "Female": None}
                       In the example above,
                           - we would like to name the output column 'M' for the one
                             hot encoded values of "Male" and
                           - for the one hot encoded values of "Female" we would like
                             the output name to be derived from "Female" itself, thus
                             None is passed as the value.
                Types: bool, float, int, str, dict, datetime.date or list of booleans,
                       floats, integers, strings, datetime.date

            columns:
                Required Argument.
                Specifies the name of the column. Value passed to this argument
                also plays a crucial role in determining the output column name.
                Types: str

            style:
                Optional Argument.
                Specifies the one hot encoding style to use.
                Permitted Values: 'dummy', 'contrast'
                Default Value: 'dummy'
                Types: str

            reference_value:
                Required Argument when "style" is 'contrast', ignored otherwise.
                Specifies the reference value to use for the 'contrast' style. If the
                original value in the column is equal to the reference value, then -1
                is returned for it.
                Types: bool, int, float, str, datetime.date

            out_columns:
                Optional Argument.
                Specifies the name of the output column. Value passed to this argument
                also plays a crucial role in determining the output column name.
                Types: str

            datatype:
                Optional Argument.
                Specifies the name of the intended datatype of the output column.
                Intended data types for the output column can be specified using either the
                teradatasqlalchemy types or the permitted strings mentioned below:
                -------------------------------------------------------------------
                | If intended SQL Data Type is | Permitted Value to be passed is  |
                |------------------------------------------------------------------|
                | bigint                       | bigint                           |
                | byteint                      | byteint                          |
                | char(n)                      | char,n                           |
                | date                         | date                             |
                | decimal(m,n)                 | decimal,m,n                      |
                | float                        | float                            |
                | integer                      | integer                          |
                | number(*)                    | number                           |
                | number(n)                    | number,n                         |
                | number(*,n)                  | number,*,n                       |
                | number(n,n)                  | number,n,n                       |
                | smallint                     | smallint                         |
                | time(p)                      | time,p                           |
                | timestamp(p)                 | timestamp,p                      |
                | varchar(n)                   | varchar,n                        |
                --------------------------------------------------------------------
                Notes:
                    1. Argument is ignored if "columns" argument is not used.
                    2. char without a size is not supported.
                    3. number(*) does not include the * in its datatype format.
                Examples:
                    1. If intended datatype for the output column is "bigint", then
                       pass string "bigint" to the argument as shown below:
                           datatype="bigint"
                    2. If intended datatype for the output column is "decimal(3,5)", then
                       pass string "decimal,3,5" to the argument as shown below:
                           datatype="decimal,3,5"
                Types: str, BIGINT, BYTEINT, CHAR, DATE, DECIMAL, FLOAT, INTEGER, NUMBER,
                       SMALLINT, TIME, TIMESTAMP, VARCHAR.

            fillna:
                Optional Argument.
                Specifies whether the null replacement/missing value treatment should
                be performed with one hot encoding or not. Output of FillNa() can be
                passed to this argument.
                Note:
                    If the FillNa object is created with its arguments "columns",
                    "out_columns" and "datatype", then values passed in FillNa() arguments
                    are ignored. Only nullstyle information is captured from the same.
                Types: FillNa

        NOTES:
            Output column names for the transformation using the Transform() function
            depend on the "values", "columns" and "out_columns" arguments. Here is how
            output column names are determined:
            1. If "values" is not a dictionary:
               a. If "out_columns" is not passed, then the output column is formed
                  using the value in "values" and the column name passed to "columns".
                  For example,
                      If values=["val1", "val2"] and columns="col"
                      then, output column names are:
                          'val1_col' and 'val2_col'
               b. If "out_columns" is passed, then the output column is formed
                  using the value in "values" and the column name passed to "out_columns".
                  For example,
                      If values=["val1", "val2"], columns="col", and
                      out_columns="ocol" then, output column names are:
                          'val1_ocol' and 'val2_ocol'
            2. If "values" is a dictionary:
               a. If the value in the dictionary is not None, then that value is used
                  as the output column name.
                  For example:
                      If values = {"val1": "v1"} then the output column name is "v1".
               b. If the value in the dictionary is None, then the rules specified in
                  point 1 are applied to determine the output column name.

        RETURNS:
            An instance of OneHotEncoder class.

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLE:
            # Note:
            # To run any transformation, user needs to use Transform() function from
            # Vantage Analytic Library.
            # To do so import valib first and set the "val_install_location".
            >>> from teradataml import configure, DataFrame, OneHotEncoder, FillNa, load_example_data, valib
            >>> configure.val_install_location = "SYSLIB"
            >>>

            # Load example data.
            >>> load_example_data("dataframe", "admissions_train")
            >>>

            # Create the required DataFrame.
            >>> df = DataFrame("admissions_train")
            >>> df
               masters   gpa     stats programming  admitted
            id
            13      no  4.00  Advanced      Novice         1
            26     yes  3.57  Advanced    Advanced         1
            5       no  3.44    Novice      Novice         0
            19     yes  1.98  Advanced    Advanced         0
            15     yes  4.00  Advanced    Advanced         1
            40     yes  3.95    Novice    Beginner         0
            7      yes  2.33    Novice      Novice         1
            22     yes  3.46    Novice    Beginner         0
            36      no  3.00  Advanced      Novice         0
            38     yes  2.65  Advanced    Beginner         1
            >>>

            # Example 1: Encode all values 'Novice', 'Advanced', and 'Beginner'
            # in "programming" column using "dummy" style.
            >>> dc = OneHotEncoder(values=["Novice", "Advanced", "Beginner"], columns="programming")

            # Execute Transform() function.
            >>> obj = valib.Transform(data=df, one_hot_encode=dc, key_columns="id")
            >>> obj.result
               id  Novice_programming  Advanced_programming  Beginner_programming
            0   5                   1                     0                     0
            1  34                   0                     0                     1
            2  13                   1                     0                     0
            3  40                   0                     0                     1
            4  22                   0                     0                     1
            5  19                   0                     1                     0
            6  36                   1                     0                     0
            7  15                   0                     1                     0
            8   7                   1                     0                     0
            9  17                   0                     1                     0
            >>>


            # Example 2: Encode all values 'Novice', 'Advanced', and 'Beginner'
            # in "programming" column using "dummy" style. Also, pass
            # "out_columns" argument, to control the name of the output column.
            >>> dc = OneHotEncoder(style="dummy", values=["Novice", "Advanced", "Beginner"],
            ...                    columns="programming", out_columns="prog")

            # Execute Transform() function.
            >>> obj = valib.Transform(data=df, one_hot_encode=dc, key_columns="id")
            >>> obj.result
               id  Novice_prog  Advanced_prog  Beginner_prog
            0  15            0              1              0
            1   7            1              0              0
            2  22            0              0              1
            3  17            0              1              0
            4  13            1              0              0
            5  38            0              0              1
            6  26            0              1              0
            7   5            1              0              0
            8  34            0              0              1
            9  40            0              0              1
            >>>


            # Example 3: Encode all values 'Novice', 'Advanced', and 'Beginner'
            # in "programming" column using "dummy" style. Example shows
            # why and how to pass values using a dictionary. By passing a
            # dictionary, we can control the names of the output columns.
            # In this example, we would like to name the output column for
            # value 'Advanced' as 'Adv', 'Beginner' as 'Beg' and for 'Novice'
            # we would like to use the default mechanism.
            >>> values = {"Novice": None, "Advanced": "Adv", "Beginner": "Beg"}
            >>> dc = OneHotEncoder(style="dummy", values=values, columns="programming")

            # Execute Transform() function.
            >>> obj = valib.Transform(data=df, one_hot_encode=dc, key_columns="id")
            >>> obj.result
               id  Novice_programming  Adv  Beg
            0  13                   1    0    0
            1  26                   0    1    0
            2   5                   1    0    0
            3  19                   0    1    0
            4  15                   0    1    0
            5  40                   0    0    1
            6   7                   1    0    0
            7  22                   0    0    1
            8  36                   1    0    0
            9  38                   0    0    1
            >>>


            # Example 4: Encode all values 'Novice', 'Advanced', and 'Beginner'
            # in "programming" column using "dummy" style.
            # Example shows controlling of the output column name with a dictionary
            # and the "out_columns" argument.
            # In this example, we would like to name the output column for
            # value 'Advanced' as 'Adv', 'Beginner' as 'Beg', 'Novice' as 'Novice_prog'.
            >>> values = {"Novice": None, "Advanced": "Adv", "Beginner": "Beg"}
            >>> dc = OneHotEncoder(style="dummy", values=values, columns="programming",
            ...                    out_columns="prog")

            # Execute Transform() function.
            >>> obj = valib.Transform(data=df, one_hot_encode=dc, key_columns="id")
            >>> obj.result
               id  Novice_prog  Adv  Beg
            0  15            0    1    0
            1   7            1    0    0
            2  22            0    0    1
            3  17            0    1    0
            4  13            1    0    0
            5  38            0    0    1
            6  26            0    1    0
            7   5            1    0    0
            8  34            0    0    1
            9  40            0    0    1
            >>>


            # Example 5: Encode 'yes' value in "masters" column using "contrast" style
            # with reference value as 0.
            >>> dc = OneHotEncoder(style="contrast", values="yes", reference_value=0,
            ...                    columns="masters")

            # Execute Transform() function.
            >>> obj = valib.Transform(data=df, one_hot_encode=dc, key_columns="id")
            >>> obj.result
               id  yes_masters
            0  15            1
            1   7            1
            2  22            1
            3  17            0
            4  13            0
            5  38            1
            6  26            1
            7   5            0
            8  34            1
            9  40            1
            >>>


            # Example 6: Encode all values in "programming" column using "contrast" style
            # with reference_value as 'Advanced'.
            >>> values = {"Advanced": "Adv", "Beginner": "Beg", "Novice": "Nov"}
            >>> dc = OneHotEncoder(style="contrast", values=values, reference_value="Advanced",
            ...                    columns="programming")

            # Execute Transform() function.
            >>> obj = valib.Transform(data=df, one_hot_encode=dc, key_columns="id")
            >>> obj.result
               id  Adv  Beg  Nov
            0  15    1   -1   -1
            1   7    0    0    1
            2  22    0    1    0
            3  17    1   -1   -1
            4  13    0    0    1
            5  38    0    1    0
            6  26    1   -1   -1
            7   5    0    0    1
            8  34    0    1    0
            9  40    0    1    0
            >>>


            # Example 7: Example shows combining multiple one hot encoding styles on
            # different columns.

            # Encode all values in 'programming' column using 'dummy' encoding style.
            >>> dc_prog_dummy = OneHotEncoder(values=["Novice", "Advanced", "Beginner"],
            ...                               columns="programming", out_columns="prog")
            >>>

            # Encode all values in 'stats' column using 'dummy' encoding style.
            # Also, combine it with null replacement.
            >>> values = {"Advanced": "Adv", "Beginner": "Beg"}
            >>> fillna = FillNa("literal", "Advanced")
            >>> dc_stats_dummy = OneHotEncoder(values=values, columns="stats", fillna=fillna)
            >>>

            # Encode 'yes' in 'masters' column using 'contrast' encoding style.
            # Reference value used is 'no'.
            >>> dc_mast_contrast = OneHotEncoder(style="contrast", values="yes", reference_value="no",
            ...                                  columns="masters")
            >>>

            # Encode all values in 'programming' column using 'contrast' encoding style.
            # Reference value used is 'Advanced'.
            >>> dc_prog_contrast = OneHotEncoder(style="contrast",
            ...                                  values=["Novice", "Advanced", "Beginner"],
            ...                                  reference_value="Advanced",
            ...                                  columns="programming")
            >>>

            # Execute Transform() function.
            >>> obj = valib.Transform(data=df,
            ...                       one_hot_encode=[dc_prog_dummy, dc_stats_dummy,
            ...                                       dc_mast_contrast, dc_prog_contrast],
            ...                       key_columns="id")
            >>> obj.result
               id  Novice_prog  Advanced_prog  Beginner_prog  Adv  Beg  yes_masters  Novice_programming  Advanced_programming  Beginner_programming
            0  13            1              0              0    1    0           -1                   1                     0                     0
            1  26            0              1              0    1    0            1                  -1                     1                    -1
            2   5            1              0              0    0    0           -1                   1                     0                     0
            3  19            0              1              0    1    0            1                  -1                     1                    -1
            4  15            0              1              0    1    0            1                  -1                     1                    -1
            5  40            0              0              1    0    0            1                   0                     0                     1
            6   7            1              0              0    0    0            1                   1                     0                     0
            7  22            0              0              1    0    0            1                   0                     0                     1
            8  36            1              0              0    1    0           -1                   1                     0                     0
            9  38            0              0              1    1    0            1                   0                     0                     1
            >>>
        """
        # Initialize style and values as data members.
        self.style = style
        self.values = values
        self.reference_value = reference_value
        self.fillna = fillna
        self.columns = columns
        self.out_columns = out_columns

        # Validations
        arg_info_matrix = []
        permitted_styles = ["DUMMY", "CONTRAST"]
        arg_info_matrix.append(["style", self.style, True, str, True, permitted_styles])
        arg_info_matrix.append(["values", self.values, False,
                                (bool, float, int, str, list, dict, date)])
        arg_info_matrix.append(["reference_value", self.reference_value, True,
                                (bool, int, float, str, date)])
        arg_info_matrix.append(["fillna", self.fillna, True, FillNa])
        # "columns" and "out_columns" can only accept a string here, hence they
        # are validated separately.
        arg_info_matrix.append(["columns", self.columns, False, str])
        arg_info_matrix.append(["out_columns", self.out_columns, True, str])
        # Other argument validations.
        _Validators._validate_function_arguments(arg_info_matrix)

        # Call super()
        super().__init__(columns=columns, out_columns=out_columns, datatype=datatype)
        # Note:
        #     Validation for "datatype" is done by super().

        # "reference_value" should be provided when "style" is 'contrast'.
        if self.style.upper() == "CONTRAST" and self.reference_value is None:
            err_ = Messages.get_message(MessageCodes.DEPENDENT_ARG_MISSING,
                                        "reference_value",
                                        "style={}".format(self.style))
            raise TeradataMlException(err_, MessageCodes.DEPENDENT_ARG_MISSING)

        if isinstance(self.reference_value, date):
            self.reference_value = UtilFuncs._convert_date_to_string(self.reference_value)

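    # A small illustrative sketch of the dependent-argument check in __init__
    # above (a hypothetical call, not taken from the shipped examples):
    #
    #     OneHotEncoder(style="contrast", values="yes", columns="masters")
    #     # raises TeradataMlException, since "reference_value" is required
    #     # whenever style='contrast'; the exact message text comes from
    #     # MessageCodes.DEPENDENT_ARG_MISSING and is not reproduced here.
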
    def _val_sql_syntax(self):
        """
        DESCRIPTION:
            Internal function to return a string representation of designcode
            Transformation as required by SQL.

        PARAMETERS:
            None.

        RETURNS:
            String representing SQL syntax for 'designcode' SQL argument.

        RAISES:
            None.

        EXAMPLE:
            self._val_sql_syntax()
        """
        # Generate syntax for "designstyle" and "designvalues" SQL arguments.
        design_style = "dummycode"
        if self.style.upper() == "CONTRAST":
            design_style = "contrastcode, {}".format(self.reference_value)

        if isinstance(self.values, list):
            self.values = [
                UtilFuncs._convert_date_to_string(val) if isinstance(val, date) else val
                for val in self.values]
            design_values = [str(val) if not isinstance(val, str) else val for val in self.values]
            design_values = ", ".join(design_values)
        elif isinstance(self.values, dict):
            values = []
            for val in self.values:
                if self.values[val] is not None:
                    if isinstance(self.values[val], date):
                        self.values[val] = UtilFuncs._convert_date_to_string(self.values[val])
                    values.append("{}/{}".format(val, self.values[val]))
                else:
                    values.append(str(val))
            design_values = ", ".join(values)
        elif isinstance(self.values, date):
            design_values = UtilFuncs._convert_date_to_string(self.values)
        else:
            design_values = self.values

        ret_value = "designstyle({}), designvalues({})".format(design_style,
                                                               design_values)

        # Generate and add syntax for "columns" and "datatype" SQL arguments.
        columns_fmt = self._val_transformation_fmt()
        ret_value = "{}, {}".format(ret_value, columns_fmt)

        # Generate and add syntax for "nullstyle", a SQL argument.
        if self.fillna:
            ret_value = "{}, {}".format(ret_value, self.fillna._val_nullstyle_fmt())

        # Return the SQL syntax for "designcode", a SQL argument.
        return "{" + ret_value + "}"

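# A minimal sketch of the output of OneHotEncoder._val_sql_syntax(), based on
# the "designstyle({}), designvalues({})" format string above; the rendering
# of the trailing columns clause (from _val_transformation_fmt(), defined
# elsewhere in this module) is an assumption:
#
#     dc = OneHotEncoder(style="contrast", values="yes", reference_value=0,
#                        columns="masters")
#     dc._val_sql_syntax()
#     # -> '{designstyle(contrastcode, 0), designvalues(yes), columns(masters)}'
#     #    (approximate shape)
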
class LabelEncoder(_Transformations):
    """
    Class to represent label encoding, i.e., variable recoding transformation technique.
    """

    def __init__(self, values, columns, default=None, out_columns=None, datatype=None,
                 fillna=None):
        """
        DESCRIPTION:
            Label encoding a categorical data column is done to re-express existing values
            of a column (variable) into a new coding scheme or to correct data quality
            problems and focus an analysis on a particular value. It allows for mapping
            individual values, NULL values, or any number of remaining values (ELSE
            option) to a new value, a NULL value or the same value.
            Label encoding supports character, numeric, and date type columns.

            Note:
                Output of this function is passed to "label_encode" argument of "Transform"
                function from Vantage Analytic Library.

        PARAMETERS:
            values:
                Required Argument.
                Specifies the values to be label encoded. Values can be specified in
                two formats:
                    1. A list of two-tuples, where the first value in the tuple is an
                       old value and the second value is a new value.
                       For example,
                           values = [(old_val1, new_val1), (old_val2, new_val2)]
                    2. A dictionary with key as old value and value as new value.
                       For example,
                           values = {old_val1: new_val1, old_val2: new_val2}
                Notes:
                    1. If date values are entered as string, the keyword 'DATE' must
                       precede the date value, and do not enclose it in single
                       quotes, OR pass a datetime.date object.
                       For example,
                           value='DATE 1987-06-09'
                           value=date(1987, 6, 9)
                    2. To keep the old value as is, one can pass 'same' as its new value.
                    3. To use NULL values for the old or new value, one can use either
                       the string 'null' or None.
                Types: two-tuple, list of two-tuples, dict

            columns:
                Required Argument.
                Specifies the names of the columns containing values to be label encoded.
                Types: str or list of str

            default:
                Optional Argument.
                Specifies the value assumed for all other cases.
                Permitted Values: None, 'SAME', 'NULL', a literal
                Default Value: None
                Types: bool, float, int, str

            out_columns:
                Optional Argument.
                Specifies the names of the output columns. Value passed to this argument
                also plays a crucial role in determining the output column name.
                Note:
                    Number of elements in "columns" and "out_columns" must be same.
                Types: str or list of str

            datatype:
                Optional Argument.
                Specifies the name of the intended datatype of the output column.
                Intended data types for the output column can be specified using either the
                teradatasqlalchemy types or the permitted strings mentioned below:
                -------------------------------------------------------------------
                | If intended SQL Data Type is | Permitted Value to be passed is  |
                |------------------------------------------------------------------|
                | bigint                       | bigint                           |
                | byteint                      | byteint                          |
                | char(n)                      | char,n                           |
                | date                         | date                             |
                | decimal(m,n)                 | decimal,m,n                      |
                | float                        | float                            |
                | integer                      | integer                          |
                | number(*)                    | number                           |
                | number(n)                    | number,n                         |
                | number(*,n)                  | number,*,n                       |
                | number(n,n)                  | number,n,n                       |
                | smallint                     | smallint                         |
                | time(p)                      | time,p                           |
                | timestamp(p)                 | timestamp,p                      |
                | varchar(n)                   | varchar,n                        |
                --------------------------------------------------------------------
                Notes:
                    1. Argument is ignored if "columns" argument is not used.
                    2. char without a size is not supported.
                    3. number(*) does not include the * in its datatype format.
                Examples:
                    1. If intended datatype for the output column is "bigint", then
                       pass string "bigint" to the argument as shown below:
                           datatype="bigint"
                    2. If intended datatype for the output column is "decimal(3,5)", then
                       pass string "decimal,3,5" to the argument as shown below:
                           datatype="decimal,3,5"
                Types: str, BIGINT, BYTEINT, CHAR, DATE, DECIMAL, FLOAT, INTEGER, NUMBER,
                       SMALLINT, TIME, TIMESTAMP, VARCHAR.

            fillna:
                Optional Argument.
                Specifies whether the null replacement/missing value treatment should
                be performed with recoding or not. Output of FillNa() can be passed to
                this argument.
                Note:
                    If the FillNa object is created with its arguments "columns",
                    "out_columns" and "datatype", then values passed in FillNa() arguments
                    are ignored. Only nullstyle information is captured from the same.
                Types: FillNa

        RETURNS:
            An instance of LabelEncoder class.

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLE:
            # Note:
            # To run any transformation, user needs to use Transform() function from
            # Vantage Analytic Library.
            # To do so import valib first and set the "val_install_location".
            >>> from teradataml import configure, DataFrame, LabelEncoder, FillNa, load_example_data, valib
            >>> configure.val_install_location = "SYSLIB"
            >>>

            # Load example data.
            >>> load_example_data("dataframe", "admissions_train")
            >>>

            # Create the required DataFrame.
            >>> admissions_train = DataFrame("admissions_train")
            >>> admissions_train
               masters   gpa     stats programming  admitted
            id
            13      no  4.00  Advanced      Novice         1
            26     yes  3.57  Advanced    Advanced         1
            5       no  3.44    Novice      Novice         0
            19     yes  1.98  Advanced    Advanced         0
            15     yes  4.00  Advanced    Advanced         1
            40     yes  3.95    Novice    Beginner         0
            7      yes  2.33    Novice      Novice         1
            22     yes  3.46    Novice    Beginner         0
            36      no  3.00  Advanced      Novice         0
            38     yes  2.65  Advanced    Beginner         1
            >>>

            # Example 1: Recode all values 'Novice', 'Advanced', and 'Beginner'
            # in "programming" and "stats" columns.
            # We will pass values to "label_encode" as a dictionary.
            >>> rc = LabelEncoder(values={"Novice": 1, "Advanced": 2, "Beginner": 3}, columns=["stats", "programming"])

            # Execute Transform() function.
            >>> obj = valib.Transform(data=admissions_train, label_encode=rc)
            >>> obj.result
               id  stats  programming
            0  22      1            3
            1  36      2            1
            2  15      2            2
            3  38      2            3
            4   5      1            1
            5  17      2            2
            6  34      2            3
            7  13      2            1
            8  26      2            2
            9  19      2            2
            >>>

            # Example 2: Recode the value 'Novice' as 1 by passing a tuple to the
            # "values" argument, and encode all other values as 0 by passing 0 to
            # the "default" argument, in "programming" and "stats" columns.
            >>> rc = LabelEncoder(values=("Novice", 1), columns=["stats", "programming"], default=0)

            # Execute Transform() function.
            >>> obj = valib.Transform(data=admissions_train, label_encode=rc)
            >>> obj.result
               id  stats  programming
            0  15      0            0
            1   7      1            1
            2  22      1            0
            3  17      0            0
            4  13      0            1
            5  38      0            0
            6  26      0            0
            7   5      1            1
            8  34      0            0
            9  40      1            0
            >>>

            # Example 3: In this example we encode values differently for multiple columns.

            # For values in "programming" column, recoding will be done as follows:
            #     Novice --> 0
            #     Advanced --> 1 and
            #     Rest of the values as --> NULL
            >>> rc_prog = LabelEncoder(values=[("Novice", 0), ("Advanced", 1)], columns="programming",
            ...                        default=None)
            >>>

            # For values in "stats" column, recoding will be done as follows:
            #     Novice --> 0
            #     Advanced --> keep it as is and
            #     Beginner --> NULL
            >>> rc_stats = LabelEncoder(values={"Novice": 0, "Advanced": "same", "Beginner": None},
            ...                         columns="stats")
            >>>

            # For values in "masters" column, recoding will be done as follows:
            #     yes --> 1 and others as 0
            >>> rc_yes = LabelEncoder(values=("yes", 1), columns="masters", default=0,
            ...                       out_columns="masters_yes")
            >>>

            # For values in "masters" column, label encoding will be done as follows:
            #     no --> 1 and others as 0
            >>> rc_no = LabelEncoder(values=("no", 1), columns="masters", default=0,
            ...                      out_columns="masters_no")
            >>>

            # Execute Transform() function.
            >>> obj = valib.Transform(data=admissions_train, label_encode=[rc_prog, rc_stats, rc_yes,
            ...                                                            rc_no])
            >>> obj.result
               id programming     stats  masters_yes  masters_no
            0  13           0  Advanced            0           1
            1  26           1  Advanced            1           0
            2   5           0         0            0           1
            3  19           1  Advanced            1           0
            4  15           1  Advanced            1           0
            5  40        None         0            1           0
            6   7           0         0            1           0
            7  22        None         0            1           0
            8  36           0  Advanced            0           1
            9  38        None  Advanced            1           0
            >>>
        """
        # Call super()
        super().__init__(columns=columns, out_columns=out_columns, datatype=datatype,
                         columns_optional=False)

        # Initialize arguments as data members.
        self.values = values
        self.default = default
        self.fillna = fillna

        # Validations
        if isinstance(self.values, tuple):
            if len(self.values) != 2:
                raise ValueError("Number of values in a tuple can only be 2.")
        elif isinstance(self.values, list):
            for tup in self.values:
                if not isinstance(tup, tuple):
                    err_ = Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE)
                    raise TypeError(err_.format("values", ['tuple or dict or list of tuples']))

                if len(tup) != 2:
                    raise ValueError("Number of values in a tuple can only be 2.")

        elif not isinstance(self.values, dict):
            err_ = Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE)
            raise TypeError(err_.format("values", ['tuple or dict or list of tuples']))

        arg_info_matrix = []
        arg_info_matrix.append(["values", self.values, False, (tuple, list, dict)])
        arg_info_matrix.append(["default", self.default, True, (bool, int, float, str)])
        arg_info_matrix.append(["fillna", self.fillna, True, FillNa])

        # Other argument validations.
        _Validators._validate_function_arguments(arg_info_matrix)
        # Note:
        #     Validations for "columns", "out_columns" and "datatype" are done by super().

    def _val_sql_syntax(self):
        """
        DESCRIPTION:
            Internal function to return a string representation of LabelEncoder
            Transformation as required by SQL.

        PARAMETERS:
            None.

        RETURNS:
            String representing SQL syntax for 'recode' SQL argument.

        RAISES:
            None.

        EXAMPLE:
            self._val_sql_syntax()
        """
        # Generate syntax for "recodevalues".
        if isinstance(self.values, tuple):
            old_val = self._get_value_string_repr(self.values[0])
            new_val = self._get_value_string_repr(self.values[1])
            recode_values = "{}/{}".format(old_val, new_val)
        elif isinstance(self.values, list):
            recode_values = []
            for val in self.values:
                old_val = self._get_value_string_repr(val[0])
                new_val = self._get_value_string_repr(val[1])
                recode_values.append("{}/{}".format(old_val, new_val))
            recode_values = ", ".join(recode_values)
        else:
            recode_values = []
            for key in self.values:
                old_val = self._get_value_string_repr(key)
                new_val = self._get_value_string_repr(self.values[key])
                recode_values.append("{}/{}".format(old_val, new_val))
            recode_values = ", ".join(recode_values)

        recode_other = "NULL" if self.default is None else self.default

        ret_value = "recodevalues({}), recodeother({})".format(recode_values,
                                                               recode_other)

        # Generate and add syntax for "columns" and "datatype" SQL arguments.
        columns_fmt = self._val_transformation_fmt()
        ret_value = "{}, {}".format(ret_value, columns_fmt)

        # Generate and add syntax for "nullstyle", a SQL argument.
        if self.fillna:
            ret_value = "{}, {}".format(ret_value, self.fillna._val_nullstyle_fmt())
        # Return the SQL syntax for "recode", a SQL argument.
        return "{" + ret_value + "}"

    def _get_value_string_repr(self, value):
        """
        DESCRIPTION:
            Internal function to return a string representation of the given value,
            if required.

        PARAMETERS:
            value:
                Required Argument.
                Specifies the value to perform conversion on.
                Types: str, bool, float, None, datetime.date

        RETURNS:
            String representation of the passed argument.

        RAISES:
            None.

        EXAMPLE:
            self._get_value_string_repr(key)

        """
        if value is not None and isinstance(value, date):
            updated_val = UtilFuncs._convert_date_to_string(value)
        elif value is None:
            updated_val = "NULL"
        elif value == "":
            updated_val = "\"\""
        else:
            updated_val = value
        return updated_val

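# A minimal sketch of the output of LabelEncoder._val_sql_syntax(), following
# the "recodevalues({}), recodeother({})" format string above; the trailing
# columns clause rendering is an assumption:
#
#     rc = LabelEncoder(values=("yes", 1), columns="masters", default=0)
#     rc._val_sql_syntax()
#     # -> '{recodevalues(yes/1), recodeother(0), columns(masters)}'
#     #    (approximate shape; None values are rendered as NULL by
#     #    _get_value_string_repr above)
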
class MinMaxScalar(_Transformations):
    """ Class to represent rescale transformation technique. """

    def __init__(self, columns, lbound=0, ubound=1, out_columns=None, datatype=None,
                 fillna=None):
        """
        DESCRIPTION:
            MinMaxScalar allows rescaling that limits the upper and lower boundaries of the
            data in a continuous numeric column using a linear rescaling function based on
            maximum and minimum data values. MinMaxScalar is useful with algorithms that require
            or work better with data within a certain range. MinMaxScalar is only valid on numeric
            columns, and not columns of type date.

            The rescale transformation formulas are shown below, where l denotes the
            lower bound and r denotes the upper bound.
            * When both the lower and upper bounds are specified:
                  f(x,l,r) = l + ((x - min(x)) * (r - l)) / (max(x) - min(x))
            * When only the lower bound is specified:
                  f(x,l) = x - min(x) + l
            * When only the upper bound is specified:
                  f(x,r) = x - max(x) + r
            Rescaling supports only numeric type columns.

            Note:
                Output of this function is passed to "rescale" argument of "Transform"
                function from Vantage Analytic Library.

        PARAMETERS:
            columns:
                Required Argument.
                Specifies the names of the columns to perform transformation on.
                Types: str or list of str

            lbound:
                Optional Argument.
                Specifies the lowerbound value required for rescaling the numeric data.
                If only the lower boundary is supplied, the variable is aligned to this
                value. This can be achieved by passing None to the "ubound" argument.
                Default Value: 0
                Types: float, int

            ubound:
                Optional Argument.
                Specifies the upperbound value required for rescaling the numeric data.
                If only an upper boundary value is specified, the variable is aligned to
                this value. This can be achieved by passing None to the "lbound" argument.
                Default Value: 1
                Types: float, int

            out_columns:
                Optional Argument.
                Specifies the names of the output columns.
                Note:
                    Number of elements in "columns" and "out_columns" must be same.
                Types: str or list of str

            datatype:
                Optional Argument.
                Specifies the name of the intended datatype of the output column.
                Intended data types for the output column can be specified using either the
                teradatasqlalchemy types or the permitted strings mentioned below:
                -------------------------------------------------------------------
                | If intended SQL Data Type is | Permitted Value to be passed is  |
                |------------------------------------------------------------------|
                | bigint                       | bigint                           |
                | byteint                      | byteint                          |
                | char(n)                      | char,n                           |
                | date                         | date                             |
                | decimal(m,n)                 | decimal,m,n                      |
                | float                        | float                            |
                | integer                      | integer                          |
                | number(*)                    | number                           |
                | number(n)                    | number,n                         |
                | number(*,n)                  | number,*,n                       |
                | number(n,n)                  | number,n,n                       |
                | smallint                     | smallint                         |
                | time(p)                      | time,p                           |
                | timestamp(p)                 | timestamp,p                      |
                | varchar(n)                   | varchar,n                        |
                --------------------------------------------------------------------
                Notes:
                    1. Argument is ignored if "columns" argument is not used.
                    2. char without a size is not supported.
                    3. number(*) does not include the * in its datatype format.
                Examples:
                    1. If intended datatype for the output column is "bigint", then
                       pass string "bigint" to the argument as shown below:
                           datatype="bigint"
                    2. If intended datatype for the output column is "decimal(3,5)", then
                       pass string "decimal,3,5" to the argument as shown below:
                           datatype="decimal,3,5"
                Types: str, BIGINT, BYTEINT, CHAR, DATE, DECIMAL, FLOAT, INTEGER, NUMBER,
                       SMALLINT, TIME, TIMESTAMP, VARCHAR.

            fillna:
                Optional Argument.
                Specifies whether the null replacement/missing value treatment should
                be performed with rescaling or not. Output of 'FillNa()' can be passed to
                this argument.
                Note:
                    If the FillNa object is created with its arguments "columns",
                    "out_columns" and "datatype", then values passed in FillNa() arguments
                    are ignored. Only nullstyle information is captured from the same.
                Types: FillNa

        RETURNS:
            An instance of MinMaxScalar class.

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLE:
            # Note:
            # To run any transformation, user needs to use Transform() function from
            # Vantage Analytic Library.
            # To do so import valib first and set the "val_install_location".
            >>> from teradataml import configure, DataFrame, MinMaxScalar, FillNa, load_example_data, valib
            >>> configure.val_install_location = "SYSLIB"
            >>>

            # Load example data.
            >>> load_example_data("dataframe", "sales")
            >>>

            # Create the required DataFrame.
            >>> df = DataFrame("sales")
            >>> df
                          Feb    Jan    Mar    Apr    datetime
            accounts
            Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
            Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
            Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
            Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
            Red Inc     200.0  150.0  140.0    NaN  04/01/2017
            Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
            >>>

            # Example 1: Rescale values in column "Feb", using the default bounds, which is
            # with lowerbound as 0 and upperbound as 1.
            >>> rs = MinMaxScalar(columns="Feb")

            # Execute Transform() function.
            >>> obj = valib.Transform(data=df, rescale=rs)
            >>> obj.result
                 accounts       Feb
            0    Blue Inc  0.000000
            1    Alpha Co  1.000000
            2   Jones LLC  0.916667
            3  Yellow Inc  0.000000
            4  Orange Inc  1.000000
            5     Red Inc  0.916667
            >>>

            # Example 2: Rescale values in column "Feb", using only lowerbound as -1.
            # To use only lowerbound, one must pass None to "ubound".
            >>> rs = MinMaxScalar(columns="Feb", lbound=-1, ubound=None)

            # Execute Transform() function.
            >>> obj = valib.Transform(data=df, rescale=rs)
            >>> obj.result
                 accounts    Feb
            0   Jones LLC  109.0
            1  Yellow Inc   -1.0
            2     Red Inc  109.0
            3    Blue Inc   -1.0
            4    Alpha Co  119.0
            5  Orange Inc  119.0
            >>>

            # Example 3: Rescale values in columns "Jan" and "Apr", using only upperbound as 10.
            # To use only upperbound, one must pass None to "lbound".
            # We shall also combine this with missing value treatment. We shall replace
            # missing values with "mode" null style replacement.
            >>> fn = FillNa(style="mode")
            >>> rs = MinMaxScalar(columns=["Jan", "Apr"], lbound=None, ubound=10, fillna=fn)

            # Execute Transform() function.
            >>> obj = valib.Transform(data=df, rescale=rs, key_columns="accounts")
            >>> obj.result
                 accounts    Jan    Apr
            0    Alpha Co   10.0   10.0
            1    Blue Inc -140.0 -139.0
            2  Yellow Inc  -40.0   10.0
            3   Jones LLC  -40.0  -60.0
            4     Red Inc  -40.0   10.0
            5  Orange Inc  -40.0   10.0
            >>>

            # Example 4: This example shows combining multiple ways of rescaling in one
            # Transform() call.

            # Rescale values in column "Feb" using lowerbound as -1 and upperbound as 1.
            # Name the output column as "Feb1".
            >>> rs_1 = MinMaxScalar(columns="Feb", lbound=-1, ubound=1, out_columns="Feb1")
            >>>

            # Rescale values in column "Feb" using only upperbound as 1.
            # Name the output column as "FebU".
            >>> rs_2 = MinMaxScalar(columns="Feb", lbound=None, ubound=1, out_columns="FebU")
            >>>

            # Rescale values in column "Feb" using only lowerbound as 0 (default value).
            # Name the output column as "FebL".
            >>> rs_3 = MinMaxScalar(columns="Feb", ubound=None, out_columns="FebL")
            >>>

            # Rescale values in columns "Jan" and "Apr" using default bounds.
            # Name the output columns as "Jan1" and "Apr1".
            # Combine with missing value treatment, with literal null replacement.
            >>> fn_1 = FillNa(style="literal", value=0)
            >>> rs_4 = MinMaxScalar(columns=["Jan", "Apr"], out_columns=["Jan1", "Apr1"], fillna=fn_1)
            >>>

            # Rescale values in columns "Jan" and "Apr" using default bounds.
            # Name the output columns as "Jan2" and "Apr2".
            # Combine with missing value treatment, with median null replacement.
            >>> fn_2 = FillNa(style="median")
            >>> rs_5 = MinMaxScalar(columns=["Jan", "Apr"], out_columns=["Jan2", "Apr2"], fillna=fn_2)
            >>>

            # Execute Transform() function.
            >>> obj = valib.Transform(data=df, rescale=[rs_1, rs_2, rs_3, rs_4, rs_5],
            ...                       key_columns="accounts")
            >>> obj.result
                 accounts      Feb1   FebU   FebL  Jan1   Apr1      Jan2      Apr2
            0    Blue Inc -1.000000 -119.0    0.0  0.25  0.404  0.000000  0.000000
            1    Alpha Co  1.000000    1.0  120.0  1.00  1.000  1.000000  1.000000
            2   Jones LLC  0.833333   -9.0  110.0  0.75  0.720  0.666667  0.530201
            3  Yellow Inc -1.000000 -119.0    0.0  0.00  0.000  0.666667  0.765101
            4  Orange Inc  1.000000    1.0  120.0  0.00  1.000  0.666667  1.000000
            5     Red Inc  0.833333   -9.0  110.0  0.75  0.000  0.666667  0.765101
            >>>
        """
        # Call super()
        super().__init__(columns=columns, out_columns=out_columns, datatype=datatype,
                         columns_optional=False)

        # Initialize arguments as data members.
        self.lbound = lbound
        self.ubound = ubound
        self.fillna = fillna

        # Validations
        arg_info_matrix = []
        arg_info_matrix.append(["lbound", self.lbound, True, (float, int)])
        arg_info_matrix.append(["ubound", self.ubound, True, (float, int)])
        arg_info_matrix.append(["fillna", self.fillna, True, FillNa])
        # Note:
        #     Validations for "columns", "out_columns" and "datatype" are done by super().
        # Other argument validations.
        _Validators._validate_function_arguments(arg_info_matrix)

        if self.lbound is None and self.ubound is None:
            raise TeradataMlException(
                Messages.get_message(MessageCodes.SPECIFY_AT_LEAST_ONE_ARG,
                                     "lbound", "ubound"),
                MessageCodes.SPECIFY_AT_LEAST_ONE_ARG)

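    # A worked instance of the rescale formula from the class docstring,
    # using the "Feb" column of the example data (min=90, max=210) with the
    # default bounds l=0, r=1:
    #
    #     f(200, 0, 1) = 0 + ((200 - 90) * (1 - 0)) / (210 - 90)
    #                  = 110 / 120
    #                  = 0.916667   (matches Example 1 in the docstring above)
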
    def _val_sql_syntax(self):
        """
        DESCRIPTION:
            Internal function to return a string representation of rescale
            Transformation as required by SQL.

        PARAMETERS:
            None.

        RETURNS:
            String representing SQL syntax for 'rescale' SQL argument.

        RAISES:
            None.

        EXAMPLE:
            self._val_sql_syntax()
        """
        # Generate syntax for "rescale" SQL argument.
        rescale_values = []
        if self.lbound is not None:
            rescale_values.append("lowerbound/{}".format(self.lbound))

        if self.ubound is not None:
            rescale_values.append("upperbound/{}".format(self.ubound))

        ret_value = "rescalebounds({})".format(", ".join(rescale_values))

        # Generate and add syntax for "columns" and "datatype" SQL arguments.
        columns_fmt = self._val_transformation_fmt()
        ret_value = "{}, {}".format(ret_value, columns_fmt)

        # Generate and add syntax for the "nullstyle" SQL argument.
        if self.fillna:
            ret_value = "{}, {}".format(ret_value, self.fillna._val_nullstyle_fmt())

        return "{" + ret_value + "}"
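        # Sketch of the string assembled above (illustration only; the exact
        # "columns" clause comes from _val_transformation_fmt()): for
        # rs_1 = MinMaxScalar(columns="Feb", lbound=-1, ubound=1, out_columns="Feb1"),
        # the return value would look roughly like
        #     {rescalebounds(lowerbound/-1, upperbound/1), columns(Feb/Feb1)}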


class Retain(_Transformations):
    """
    Class to represent Retain transformation technique to retain or copy columns
    from input to output.
    """

    def __init__(self, columns, out_columns=None, datatype=None):
        """
        DESCRIPTION:
            Retain option allows you to copy one or more columns into the final
            analytic data set. By default, the result column name is the same as
            the input column name, but this can be changed. If a specific type is
            specified, it results in casting the retained column.
            The Retain transformation is supported for all valid data types.

            Note:
                Output of this function is passed to "retain" argument of "Transform"
                function from Vantage Analytic Library.

        PARAMETERS:
            columns:
                Required Argument.
                Specifies the names of the columns to retain.
                Types: str or list of str

            out_columns:
                Optional Argument.
                Specifies the names of the output columns.
                Note:
                    Number of elements in "columns" and "out_columns" must be same.
                Types: str or list of str

            datatype:
                Optional Argument.
                Specifies the name of the intended datatype of the output column.
                Intended data types for the output column can be specified using either the
                teradatasqlalchemy types or the permitted strings mentioned below:
                 -------------------------------------------------------------------
                | If intended SQL Data Type is | Permitted Value to be passed is  |
                |-----------------------------------------------------------------|
                | bigint                       | bigint                           |
                | byteint                      | byteint                          |
                | char(n)                      | char,n                           |
                | date                         | date                             |
                | decimal(m,n)                 | decimal,m,n                      |
                | float                        | float                            |
                | integer                      | integer                          |
                | number(*)                    | number                           |
                | number(n)                    | number,n                         |
                | number(*,n)                  | number,*,n                       |
                | number(n,n)                  | number,n,n                       |
                | smallint                     | smallint                         |
                | time(p)                      | time,p                           |
                | timestamp(p)                 | timestamp,p                      |
                | varchar(n)                   | varchar,n                        |
                 -------------------------------------------------------------------
                Notes:
                    1. Argument is ignored if "columns" argument is not used.
                    2. char without a size is not supported.
                    3. number(*) does not include the * in its datatype format.
                Examples:
                    1. If intended datatype for the output column is "bigint", then
                       pass string "bigint" to the argument as shown below:
                       datatype="bigint"
                    2. If intended datatype for the output column is "decimal(3,5)", then
                       pass string "decimal,3,5" to the argument as shown below:
                       datatype="decimal,3,5"
                Types: str, BIGINT, BYTEINT, CHAR, DATE, DECIMAL, FLOAT, INTEGER, NUMBER,
                       SMALLINT, TIME, TIMESTAMP, VARCHAR.

        RETURNS:
            An instance of Retain class.

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLE:
            # Note:
            # To run any transformation, user needs to use Transform() function from
            # Vantage Analytic Library.
            # To do so, import valib first and set the "val_install_location".
            >>> from teradataml import configure, DataFrame, load_example_data, valib, Retain
            >>> configure.val_install_location = "SYSLIB"
            >>>

            # Load example data.
            >>> load_example_data("dataframe", "sales")
            >>>

            # Create the required DataFrame.
            >>> sales = DataFrame("sales")
            >>> sales
                          Feb    Jan    Mar    Apr    datetime
            accounts
            Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
            Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
            Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
            Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
            Red Inc     200.0  150.0  140.0    NaN  04/01/2017
            Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
            >>>

            # Example: Shows retaining some columns unchanged and some with a name
            # or datatype change.

            # Retain columns "accounts" and "Feb" as is.
            >>> rt_1 = Retain(columns=["accounts", "Feb"])
            >>>

            # Retain column "Jan" with name as "january".
            >>> rt_2 = Retain(columns="Jan", out_columns="january")
            >>>

            # Retain columns "Mar" and "Apr" with names "march" and "april", with the
            # datatype changed to 'bigint'.
            >>> rt_3 = Retain(columns=["Mar", "Apr"], out_columns=["march", "april"],
            ...               datatype="bigint")
            >>>

            # Execute Transform() function.
            >>> obj = valib.Transform(data=sales, retain=[rt_1, rt_2, rt_3])
            >>> obj.result
                 accounts   accounts1    Feb  january  march  april
            0    Alpha Co    Alpha Co  210.0    200.0  215.0  250.0
            1    Blue Inc    Blue Inc   90.0     50.0   95.0  101.0
            2  Yellow Inc  Yellow Inc   90.0      NaN    NaN    NaN
            3   Jones LLC   Jones LLC  200.0    150.0  140.0  180.0
            4     Red Inc     Red Inc  200.0    150.0  140.0    NaN
            5  Orange Inc  Orange Inc  210.0      NaN    NaN  250.0
            >>>
        """
        # Call super()
        super().__init__(columns=columns, out_columns=out_columns, datatype=datatype,
                         columns_optional=False)

    def _val_sql_syntax(self):
        """
        DESCRIPTION:
            Internal function to return a string representation of retain
            Transformation as required by SQL.

        PARAMETERS:
            None.

        RETURNS:
            String representing SQL syntax for 'retain' SQL argument.

        RAISES:
            None.

        EXAMPLE:
            self._val_sql_syntax()
        """
        # Generate and return syntax for "columns" and "datatype" SQL arguments.
        return "{" + self._val_transformation_fmt() + "}"
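        # Sketch (illustration only; the exact clause comes from
        # _val_transformation_fmt()): for rt_3 in the class example above, the
        # return value would look roughly like
        #     {columns(Mar/march, Apr/april), datatype(bigint)}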


class Sigmoid(_Transformations):
    """
    Class to represent sigmoid transformation technique for rescaling of continuous
    numeric data.
    """

    def __init__(self, columns, style="logit", out_columns=None, datatype=None,
                 fillna=None):
        """
        DESCRIPTION:
            Sigmoid transformation allows rescaling of continuous numeric data in a more
            sophisticated way than the Rescaling transformation function. In a Sigmoid
            transformation, a numeric column is transformed using a type of sigmoid or
            s-shaped function.

            These non-linear transformations are more useful in data mining than a linear
            Rescaling transformation. The Sigmoid transformation is supported for numeric
            columns only.

            For absolute values of x greater than or equal to 36, the value of the
            sigmoid function is effectively 1 for positive arguments or 0 for negative
            arguments, within about 15 digits of significance.

            Note:
                Output of this function is passed to "sigmoid" argument of "Transform"
                function from Vantage Analytic Library.

        PARAMETERS:
            columns:
                Required Argument.
                Specifies the names of the columns to scale.
                Types: str or list of str

            style:
                Optional Argument.
                Specifies the style of sigmoid function to use.
                Permitted Values:
                    * "logit":
                        The logit function produces a continuously increasing value
                        between 0 and 1.
                    * "modifiedlogit":
                        The modified logit function is twice the logit minus 1 and
                        produces a value between -1 and 1.
                    * "tanh":
                        The hyperbolic tangent function also produces a value between
                        -1 and 1.
                Default Value: 'logit'
                Types: str

            out_columns:
                Optional Argument.
                Specifies the names of the output columns.
                Note:
                    Number of elements in "columns" and "out_columns" must be same.
                Types: str or list of str

            datatype:
                Optional Argument.
                Specifies the name of the intended datatype of the output column.
                Intended data types for the output column can be specified using either the
                teradatasqlalchemy types or the permitted strings mentioned below:
                 -------------------------------------------------------------------
                | If intended SQL Data Type is | Permitted Value to be passed is  |
                |-----------------------------------------------------------------|
                | bigint                       | bigint                           |
                | byteint                      | byteint                          |
                | char(n)                      | char,n                           |
                | date                         | date                             |
                | decimal(m,n)                 | decimal,m,n                      |
                | float                        | float                            |
                | integer                      | integer                          |
                | number(*)                    | number                           |
                | number(n)                    | number,n                         |
                | number(*,n)                  | number,*,n                       |
                | number(n,n)                  | number,n,n                       |
                | smallint                     | smallint                         |
                | time(p)                      | time,p                           |
                | timestamp(p)                 | timestamp,p                      |
                | varchar(n)                   | varchar,n                        |
                 -------------------------------------------------------------------
                Notes:
                    1. Argument is ignored if "columns" argument is not used.
                    2. char without a size is not supported.
                    3. number(*) does not include the * in its datatype format.
                Examples:
                    1. If intended datatype for the output column is "bigint", then
                       pass string "bigint" to the argument as shown below:
                       datatype="bigint"
                    2. If intended datatype for the output column is "decimal(3,5)", then
                       pass string "decimal,3,5" to the argument as shown below:
                       datatype="decimal,3,5"
                Types: str, BIGINT, BYTEINT, CHAR, DATE, DECIMAL, FLOAT, INTEGER, NUMBER,
                       SMALLINT, TIME, TIMESTAMP, VARCHAR.

            fillna:
                Optional Argument.
                Specifies whether the null replacement/missing value treatment should
                be performed with sigmoid transformation or not. Output of FillNa() can be
                passed to this argument.
                Note:
                    If the FillNa object is created with its arguments "columns",
                    "out_columns" and "datatype", then values passed in FillNa() arguments
                    are ignored. Only nullstyle information is captured from the same.
                Types: FillNa

        RETURNS:
            An instance of Sigmoid class.

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLE:
            # Note:
            # To run any transformation, user needs to use Transform() function from
            # Vantage Analytic Library.
            # To do so, import valib first and set the "val_install_location".
            >>> from teradataml import configure, DataFrame, FillNa, Sigmoid, load_example_data, valib
            >>> configure.val_install_location = "SYSLIB"
            >>>

            # Load example data.
            >>> load_example_data("dataframe", "sales")
            >>>

            # Create the required teradataml DataFrame.
            >>> sales = DataFrame("sales")
            >>> sales
                          Feb    Jan    Mar    Apr    datetime
            accounts
            Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
            Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
            Red Inc     200.0  150.0  140.0    NaN  04/01/2017
            Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
            Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
            Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
            >>>

            # Example 1: Scale values in columns "Jan" and "Mar" using sigmoid function "tanh".
            # Combine the scaling with null replacement.
            >>> fn = FillNa(style="literal", value=0)
            >>> sig = Sigmoid(style="tanh", columns=["Jan", "Mar"], fillna=fn)

            # Execute Transform() function.
            >>> obj = valib.Transform(data=sales, sigmoid=sig, key_columns="accounts")
            >>> obj.result
                 accounts  Jan  Mar
            0    Alpha Co  1.0  1.0
            1     Red Inc  1.0  1.0
            2  Orange Inc  0.0  0.0
            3   Jones LLC  1.0  1.0
            4  Yellow Inc  0.0  0.0
            5    Blue Inc  1.0  1.0
            >>>

            # Example 2: Rescaling with Sigmoid is carried out with multiple styles.
            >>> load_example_data("dataframe", "iris_test")
            >>> df = DataFrame("iris_test")
            >>> df
                 sepal_length  sepal_width  petal_length  petal_width  species
            id
            5             5.0          3.6           1.4          0.2        1
            60            5.2          2.7           3.9          1.4        2
            15            5.8          4.0           1.2          0.2        1
            30            4.7          3.2           1.6          0.2        1
            40            5.1          3.4           1.5          0.2        1
            80            5.7          2.6           3.5          1.0        2
            120           6.0          2.2           5.0          1.5        3
            70            5.6          2.5           3.9          1.1        2
            20            5.1          3.8           1.5          0.3        1
            65            5.6          2.9           3.6          1.3        2
            >>>

            # Rescale values in columns "sepal_length", "sepal_width", "petal_length"
            # and "petal_width" with 'logit' (default) sigmoid function.
            >>> sig_1 = Sigmoid(columns=["sepal_length", "sepal_width", "petal_length",
            ...                          "petal_width"],
            ...                 out_columns=["sl", "sw", "pl", "pw"])
            >>>

            # Rescale values in columns "sepal_length", "sepal_width", "petal_length"
            # and "petal_width" with 'tanh' sigmoid function.
            >>> sig_2 = Sigmoid(style="tanh",
            ...                 columns=["sepal_length", "sepal_width", "petal_length",
            ...                          "petal_width"],
            ...                 out_columns=["sl_t", "sw_t", "pl_t", "pw_t"])
            >>>

            # Rescale values in columns "sepal_length" and "sepal_width" with 'modifiedlogit'
            # sigmoid function.
            # Combine it with null replacement using 'median' style.
            >>> fn = FillNa(style="median")
            >>> sig_3 = Sigmoid(style="modifiedlogit", columns=["sepal_length", "sepal_width"],
            ...                 out_columns=["sl_ml", "sw_ml"], fillna=fn)
            >>>

            # Execute Transform() function.
            >>> obj = valib.Transform(data=df, sigmoid=[sig_1, sig_2, sig_3],
            ...                       key_columns="id")
            >>> obj.result
                id        sl        sw        pl        pw      sl_t      sw_t      pl_t      pw_t     sl_ml     sw_ml
            0    5  0.993307  0.973403  0.802184  0.549834  0.999909  0.998508  0.885352  0.197375  0.986614  0.946806
            1   60  0.994514  0.937027  0.980160  0.802184  0.999939  0.991007  0.999181  0.885352  0.989027  0.874053
            2   15  0.996982  0.982014  0.768525  0.549834  0.999982  0.999329  0.833655  0.197375  0.993963  0.964028
            3   30  0.990987  0.960834  0.832018  0.549834  0.999835  0.996682  0.921669  0.197375  0.981973  0.921669
            4   40  0.993940  0.967705  0.817574  0.549834  0.999926  0.997775  0.905148  0.197375  0.987880  0.935409
            5   80  0.996665  0.930862  0.970688  0.731059  0.999978  0.989027  0.998178  0.761594  0.993330  0.861723
            6  120  0.997527  0.900250  0.993307  0.817574  0.999988  0.975743  0.999909  0.905148  0.995055  0.800499
            7   70  0.996316  0.924142  0.980160  0.750260  0.999973  0.986614  0.999181  0.800499  0.992632  0.848284
            8   20  0.993940  0.978119  0.817574  0.574443  0.999926  0.999000  0.905148  0.291313  0.987880  0.956237
            9   65  0.996316  0.947846  0.973403  0.785835  0.999973  0.993963  0.998508  0.861723  0.992632  0.895693
            >>>
        """
        # Call super()
        super().__init__(columns=columns, out_columns=out_columns, datatype=datatype,
                         columns_optional=False)
        # Initialize style and fillna as data members.
        self.style = style
        self.fillna = fillna

        # Validations
        arg_info_matrix = []
        permitted_styles = ["LOGIT", "MODIFIEDLOGIT", "TANH"]
        arg_info_matrix.append(["style", self.style, True, str, True, permitted_styles])
        arg_info_matrix.append(["fillna", self.fillna, True, FillNa])
        # Note:
        # Validations for "columns", "out_columns" and "datatype" are done by super().
        # Other argument validations.
        _Validators._validate_function_arguments(arg_info_matrix)

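    # Math sketch (illustration only, not part of the class API): the three
    # permitted styles reduce to the following pure-Python functions.
    #
    #     import math
    #
    #     def logit(x):
    #         return 1.0 / (1.0 + math.exp(-x))
    #
    #     def modifiedlogit(x):
    #         return 2.0 * logit(x) - 1.0
    #
    #     # "tanh" is math.tanh(x).
    #
    # For sepal_length 5.0 in Example 2: logit(5.0) ~ 0.993307 ("sl"),
    # math.tanh(5.0) ~ 0.999909 ("sl_t") and modifiedlogit(5.0) ~ 0.986614
    # ("sl_ml"), matching the output shown above. It also follows that
    # |x| >= 36 saturates to 0 or 1 within double precision, as noted in the
    # DESCRIPTION.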
    def _val_sql_syntax(self):
        """
        DESCRIPTION:
            Internal function to return a string representation of sigmoid
            Transformation as required by SQL.

        PARAMETERS:
            None.

        RETURNS:
            String representing SQL syntax for 'sigmoidstyle' SQL argument.

        RAISES:
            None.

        EXAMPLE:
            self._val_sql_syntax()
        """
        # Generate and add syntax for "sigmoidstyle" SQL argument.
        ret_value = "sigmoidstyle({})".format(self.style.lower())

        # Generate and add syntax for "columns" and "datatype" SQL arguments.
        columns_fmt = self._val_transformation_fmt()
        ret_value = "{}, {}".format(ret_value, columns_fmt)

        if self.fillna:
            ret_value = "{}, {}".format(ret_value, self.fillna._val_nullstyle_fmt())

        return "{" + ret_value + "}"
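        # Sketch (illustration only; the "columns" and "nullstyle" clauses come
        # from _val_transformation_fmt() and FillNa._val_nullstyle_fmt()): for
        # sig = Sigmoid(style="tanh", columns=["Jan", "Mar"], fillna=fn) as in
        # Example 1, the return value would look roughly like
        #     {sigmoidstyle(tanh), columns(Jan, Mar), nullstyle(literal, 0)}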


class ZScore(_Transformations):
    """ Class to represent Z-Score transformation technique for rescaling. """

    def __init__(self, columns, out_columns=None, datatype=None, fillna=None):
        """
        DESCRIPTION:
            ZScore allows rescaling of continuous numeric data in a more
            sophisticated way than a Rescaling transformation. In a Z-Score
            transformation, a numeric column is transformed into its Z-score based
            on the mean value and standard deviation of the data in the column.
            Z-Score transforms each column value into the number of standard
            deviations from the mean value of the column. This non-linear
            transformation is more useful in data mining than a linear Rescaling
            transformation. The Z-Score transformation supports both numeric and
            date type input data.

            Note:
                Output of this function is passed to "zscore" argument of "Transform"
                function from Vantage Analytic Library.

        PARAMETERS:
            columns:
                Required Argument.
                Specifies the name(s) of the column(s) to perform transformation on.
                Types: str or list of str

            out_columns:
                Optional Argument.
                Specifies the names of the output columns.
                Note:
                    Number of elements in "columns" and "out_columns" must be same.
                Types: str or list of str

            datatype:
                Optional Argument.
                Specifies the name of the intended datatype of the output column.
                Intended data types for the output column can be specified using either the
                teradatasqlalchemy types or the permitted strings mentioned below:
                 -------------------------------------------------------------------
                | If intended SQL Data Type is | Permitted Value to be passed is  |
                |-----------------------------------------------------------------|
                | bigint                       | bigint                           |
                | byteint                      | byteint                          |
                | char(n)                      | char,n                           |
                | date                         | date                             |
                | decimal(m,n)                 | decimal,m,n                      |
                | float                        | float                            |
                | integer                      | integer                          |
                | number(*)                    | number                           |
                | number(n)                    | number,n                         |
                | number(*,n)                  | number,*,n                       |
                | number(n,n)                  | number,n,n                       |
                | smallint                     | smallint                         |
                | time(p)                      | time,p                           |
                | timestamp(p)                 | timestamp,p                      |
                | varchar(n)                   | varchar,n                        |
                 -------------------------------------------------------------------
                Notes:
                    1. Argument is ignored if "columns" argument is not used.
                    2. char without a size is not supported.
                    3. number(*) does not include the * in its datatype format.
                Examples:
                    1. If intended datatype for the output column is "bigint", then
                       pass string "bigint" to the argument as shown below:
                       datatype="bigint"
                    2. If intended datatype for the output column is "decimal(3,5)", then
                       pass string "decimal,3,5" to the argument as shown below:
                       datatype="decimal,3,5"
                Types: str, BIGINT, BYTEINT, CHAR, DATE, DECIMAL, FLOAT, INTEGER, NUMBER,
                       SMALLINT, TIME, TIMESTAMP, VARCHAR.

            fillna:
                Optional Argument.
                Specifies whether the null replacement/missing value treatment should
                be performed with Z-Score transformation or not. Output of 'FillNa()'
                can be passed to this argument.
                Note:
                    If the FillNa object is created with its arguments "columns",
                    "out_columns" and "datatype", then values passed in FillNa() arguments
                    are ignored. Only nullstyle information is captured from the same.
                Types: FillNa

        RETURNS:
            An instance of ZScore class.

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLE:
            # Note:
            # To run any transformation, user needs to use Transform() function from
            # Vantage Analytic Library.
            # To do so, import valib first and set the "val_install_location".
            >>> from teradataml import configure, DataFrame, FillNa, load_example_data, valib, ZScore
            >>> configure.val_install_location = "SYSLIB"
            >>>

            # Load example data.
            >>> load_example_data("dataframe", "sales")
            >>>

            # Create the required DataFrame.
            >>> sales = DataFrame("sales")
            >>> sales
                          Feb    Jan    Mar    Apr    datetime
            accounts
            Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
            Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
            Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
            Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
            Red Inc     200.0  150.0  140.0    NaN  04/01/2017
            Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
            >>>

            # Example 1: Rescaling with ZScore is carried out on the "Feb" column.
            >>> zs = ZScore(columns="Feb")

            # Execute Transform() function.
            >>> obj = valib.Transform(data=sales, zscore=zs)
            >>> obj.result
                 accounts       Feb
            0    Blue Inc -1.410220
            1    Alpha Co  0.797081
            2   Jones LLC  0.613139
            3  Yellow Inc -1.410220
            4  Orange Inc  0.797081
            5     Red Inc  0.613139
            >>>

            # Example 2: Rescaling with ZScore is carried out on multiple columns "Jan"
            # and "Apr", with null replacement using the "mode" style.
            >>> fn = FillNa(style="mode")
            >>> zs = ZScore(columns=["Jan", "Apr"], out_columns=["january", "april"], fillna=fn)

            # Execute Transform() function.
            >>> obj = valib.Transform(data=sales, zscore=zs, key_columns="accounts")
            >>> obj.result
                 accounts   january     april
            0    Blue Inc -2.042649 -1.993546
            1    Alpha Co  1.299867  0.646795
            2   Jones LLC  0.185695 -0.593634
            3  Yellow Inc  0.185695  0.646795
            4  Orange Inc  0.185695  0.646795
            5     Red Inc  0.185695  0.646795
            >>>
        """
        # Call super()
        super().__init__(columns=columns, out_columns=out_columns, datatype=datatype,
                         columns_optional=False)
        self.fillna = fillna

        # Validations
        arg_info_matrix = []
        arg_info_matrix.append(["fillna", self.fillna, True, FillNa])
        # Note:
        # Validations for "columns", "out_columns" and "datatype" are done by super().
        # Other argument validations.
        _Validators._validate_function_arguments(arg_info_matrix)

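    # Arithmetic sketch (illustration only): the "Feb" values in the example
    # above are 90, 210, 90, 200, 200, 210, so mean ~ 166.67 and the population
    # standard deviation ~ 54.37. Then (90 - 166.67) / 54.37 ~ -1.410220 and
    # (210 - 166.67) / 54.37 ~ 0.797081, matching the Example 1 output; the
    # match suggests the population (not sample) standard deviation is used.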
    def _val_sql_syntax(self):
        """
        DESCRIPTION:
            Internal function to return a string representation of zscore
            Transformation as required by SQL.

        PARAMETERS:
            None.

        RETURNS:
            String representing SQL syntax for 'zscore' SQL argument.

        RAISES:
            None.

        EXAMPLE:
            self._val_sql_syntax()
        """
        # Generate and add syntax for "columns" and "datatype" SQL arguments.
        ret_value = self._val_transformation_fmt()

        # Generate and add syntax for the "nullstyle" SQL argument.
        if self.fillna:
            ret_value = "{}, {}".format(ret_value, self.fillna._val_nullstyle_fmt())

        return "{" + ret_value + "}"
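        # Sketch (illustration only): for zs = ZScore(columns=["Jan", "Apr"],
        # out_columns=["january", "april"], fillna=FillNa(style="mode")) as in
        # Example 2, the return value would look roughly like
        #     {columns(Jan/january, Apr/april), nullstyle(mode)}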