teradataml 20.0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +2762 -0
- teradataml/__init__.py +78 -0
- teradataml/_version.py +11 -0
- teradataml/analytics/Transformations.py +2996 -0
- teradataml/analytics/__init__.py +82 -0
- teradataml/analytics/analytic_function_executor.py +2416 -0
- teradataml/analytics/analytic_query_generator.py +1050 -0
- teradataml/analytics/byom/H2OPredict.py +514 -0
- teradataml/analytics/byom/PMMLPredict.py +437 -0
- teradataml/analytics/byom/__init__.py +16 -0
- teradataml/analytics/json_parser/__init__.py +133 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +1805 -0
- teradataml/analytics/json_parser/json_store.py +191 -0
- teradataml/analytics/json_parser/metadata.py +1666 -0
- teradataml/analytics/json_parser/utils.py +805 -0
- teradataml/analytics/meta_class.py +236 -0
- teradataml/analytics/sqle/DecisionTreePredict.py +456 -0
- teradataml/analytics/sqle/NaiveBayesPredict.py +420 -0
- teradataml/analytics/sqle/__init__.py +128 -0
- teradataml/analytics/sqle/json/decisiontreepredict_sqle.json +78 -0
- teradataml/analytics/sqle/json/naivebayespredict_sqle.json +62 -0
- teradataml/analytics/table_operator/__init__.py +11 -0
- teradataml/analytics/uaf/__init__.py +82 -0
- teradataml/analytics/utils.py +828 -0
- teradataml/analytics/valib.py +1617 -0
- teradataml/automl/__init__.py +5835 -0
- teradataml/automl/autodataprep/__init__.py +493 -0
- teradataml/automl/custom_json_utils.py +1625 -0
- teradataml/automl/data_preparation.py +1384 -0
- teradataml/automl/data_transformation.py +1254 -0
- teradataml/automl/feature_engineering.py +2273 -0
- teradataml/automl/feature_exploration.py +1873 -0
- teradataml/automl/model_evaluation.py +488 -0
- teradataml/automl/model_training.py +1407 -0
- teradataml/catalog/__init__.py +2 -0
- teradataml/catalog/byom.py +1759 -0
- teradataml/catalog/function_argument_mapper.py +859 -0
- teradataml/catalog/model_cataloging_utils.py +491 -0
- teradataml/clients/__init__.py +0 -0
- teradataml/clients/auth_client.py +137 -0
- teradataml/clients/keycloak_client.py +165 -0
- teradataml/clients/pkce_client.py +481 -0
- teradataml/common/__init__.py +1 -0
- teradataml/common/aed_utils.py +2078 -0
- teradataml/common/bulk_exposed_utils.py +113 -0
- teradataml/common/constants.py +1669 -0
- teradataml/common/deprecations.py +166 -0
- teradataml/common/exceptions.py +147 -0
- teradataml/common/formula.py +743 -0
- teradataml/common/garbagecollector.py +666 -0
- teradataml/common/logger.py +1261 -0
- teradataml/common/messagecodes.py +518 -0
- teradataml/common/messages.py +262 -0
- teradataml/common/pylogger.py +67 -0
- teradataml/common/sqlbundle.py +764 -0
- teradataml/common/td_coltype_code_to_tdtype.py +48 -0
- teradataml/common/utils.py +3166 -0
- teradataml/common/warnings.py +36 -0
- teradataml/common/wrapper_utils.py +625 -0
- teradataml/config/__init__.py +0 -0
- teradataml/config/dummy_file1.cfg +5 -0
- teradataml/config/dummy_file2.cfg +3 -0
- teradataml/config/sqlengine_alias_definitions_v1.0 +14 -0
- teradataml/config/sqlengine_alias_definitions_v1.1 +20 -0
- teradataml/config/sqlengine_alias_definitions_v1.3 +19 -0
- teradataml/context/__init__.py +0 -0
- teradataml/context/aed_context.py +223 -0
- teradataml/context/context.py +1462 -0
- teradataml/data/A_loan.csv +19 -0
- teradataml/data/BINARY_REALS_LEFT.csv +11 -0
- teradataml/data/BINARY_REALS_RIGHT.csv +11 -0
- teradataml/data/B_loan.csv +49 -0
- teradataml/data/BuoyData2.csv +17 -0
- teradataml/data/CONVOLVE2_COMPLEX_LEFT.csv +5 -0
- teradataml/data/CONVOLVE2_COMPLEX_RIGHT.csv +5 -0
- teradataml/data/Convolve2RealsLeft.csv +5 -0
- teradataml/data/Convolve2RealsRight.csv +5 -0
- teradataml/data/Convolve2ValidLeft.csv +11 -0
- teradataml/data/Convolve2ValidRight.csv +11 -0
- teradataml/data/DFFTConv_Real_8_8.csv +65 -0
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/Mall_customer_data.csv +201 -0
- teradataml/data/Orders1_12mf.csv +25 -0
- teradataml/data/Pi_loan.csv +7 -0
- teradataml/data/SMOOTHED_DATA.csv +7 -0
- teradataml/data/TestDFFT8.csv +9 -0
- teradataml/data/TestRiver.csv +109 -0
- teradataml/data/Traindata.csv +28 -0
- teradataml/data/__init__.py +0 -0
- teradataml/data/acf.csv +17 -0
- teradataml/data/adaboost_example.json +34 -0
- teradataml/data/adaboostpredict_example.json +24 -0
- teradataml/data/additional_table.csv +11 -0
- teradataml/data/admissions_test.csv +21 -0
- teradataml/data/admissions_train.csv +41 -0
- teradataml/data/admissions_train_nulls.csv +41 -0
- teradataml/data/advertising.csv +201 -0
- teradataml/data/ageandheight.csv +13 -0
- teradataml/data/ageandpressure.csv +31 -0
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/antiselect_example.json +36 -0
- teradataml/data/antiselect_input.csv +8 -0
- teradataml/data/antiselect_input_mixed_case.csv +8 -0
- teradataml/data/applicant_external.csv +7 -0
- teradataml/data/applicant_reference.csv +7 -0
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/arima_example.json +9 -0
- teradataml/data/assortedtext_input.csv +8 -0
- teradataml/data/attribution_example.json +34 -0
- teradataml/data/attribution_sample_table.csv +27 -0
- teradataml/data/attribution_sample_table1.csv +6 -0
- teradataml/data/attribution_sample_table2.csv +11 -0
- teradataml/data/bank_churn.csv +10001 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bank_web_clicks1.csv +43 -0
- teradataml/data/bank_web_clicks2.csv +91 -0
- teradataml/data/bank_web_url.csv +85 -0
- teradataml/data/barrier.csv +2 -0
- teradataml/data/barrier_new.csv +3 -0
- teradataml/data/betweenness_example.json +14 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/bin_breaks.csv +8 -0
- teradataml/data/bin_fit_ip.csv +4 -0
- teradataml/data/binary_complex_left.csv +11 -0
- teradataml/data/binary_complex_right.csv +11 -0
- teradataml/data/binary_matrix_complex_left.csv +21 -0
- teradataml/data/binary_matrix_complex_right.csv +21 -0
- teradataml/data/binary_matrix_real_left.csv +21 -0
- teradataml/data/binary_matrix_real_right.csv +21 -0
- teradataml/data/blood2ageandweight.csv +26 -0
- teradataml/data/bmi.csv +501 -0
- teradataml/data/boston.csv +507 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/buoydata_mix.csv +11 -0
- teradataml/data/burst_data.csv +5 -0
- teradataml/data/burst_example.json +21 -0
- teradataml/data/byom_example.json +34 -0
- teradataml/data/bytes_table.csv +4 -0
- teradataml/data/cal_housing_ex_raw.csv +70 -0
- teradataml/data/callers.csv +7 -0
- teradataml/data/calls.csv +10 -0
- teradataml/data/cars_hist.csv +33 -0
- teradataml/data/cat_table.csv +25 -0
- teradataml/data/ccm_example.json +32 -0
- teradataml/data/ccm_input.csv +91 -0
- teradataml/data/ccm_input2.csv +13 -0
- teradataml/data/ccmexample.csv +101 -0
- teradataml/data/ccmprepare_example.json +9 -0
- teradataml/data/ccmprepare_input.csv +91 -0
- teradataml/data/cfilter_example.json +12 -0
- teradataml/data/changepointdetection_example.json +18 -0
- teradataml/data/changepointdetectionrt_example.json +8 -0
- teradataml/data/chi_sq.csv +3 -0
- teradataml/data/churn_data.csv +14 -0
- teradataml/data/churn_emission.csv +35 -0
- teradataml/data/churn_initial.csv +3 -0
- teradataml/data/churn_state_transition.csv +5 -0
- teradataml/data/citedges_2.csv +745 -0
- teradataml/data/citvertices_2.csv +1210 -0
- teradataml/data/clicks2.csv +16 -0
- teradataml/data/clickstream.csv +13 -0
- teradataml/data/clickstream1.csv +11 -0
- teradataml/data/closeness_example.json +16 -0
- teradataml/data/complaints.csv +21 -0
- teradataml/data/complaints_mini.csv +3 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_testtoken.csv +224 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/complaints_tokens_test.csv +353 -0
- teradataml/data/complaints_traintoken.csv +472 -0
- teradataml/data/computers_category.csv +1001 -0
- teradataml/data/computers_test1.csv +1252 -0
- teradataml/data/computers_train1.csv +5009 -0
- teradataml/data/computers_train1_clustered.csv +5009 -0
- teradataml/data/confusionmatrix_example.json +9 -0
- teradataml/data/conversion_event_table.csv +3 -0
- teradataml/data/corr_input.csv +17 -0
- teradataml/data/correlation_example.json +11 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/coxhazardratio_example.json +39 -0
- teradataml/data/coxph_example.json +15 -0
- teradataml/data/coxsurvival_example.json +28 -0
- teradataml/data/cpt.csv +41 -0
- teradataml/data/credit_ex_merged.csv +45 -0
- teradataml/data/creditcard_data.csv +1001 -0
- teradataml/data/customer_loyalty.csv +301 -0
- teradataml/data/customer_loyalty_newseq.csv +31 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +173 -0
- teradataml/data/decisionforest_example.json +37 -0
- teradataml/data/decisionforestpredict_example.json +38 -0
- teradataml/data/decisiontree_example.json +21 -0
- teradataml/data/decisiontreepredict_example.json +45 -0
- teradataml/data/dfft2_size4_real.csv +17 -0
- teradataml/data/dfft2_test_matrix16.csv +17 -0
- teradataml/data/dfft2conv_real_4_4.csv +65 -0
- teradataml/data/diabetes.csv +443 -0
- teradataml/data/diabetes_test.csv +89 -0
- teradataml/data/dict_table.csv +5 -0
- teradataml/data/docperterm_table.csv +4 -0
- teradataml/data/docs/__init__.py +1 -0
- teradataml/data/docs/byom/__init__.py +0 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +180 -0
- teradataml/data/docs/byom/docs/DataikuPredict.py +217 -0
- teradataml/data/docs/byom/docs/H2OPredict.py +325 -0
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +283 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/byom/docs/PMMLPredict.py +278 -0
- teradataml/data/docs/byom/docs/__init__.py +0 -0
- teradataml/data/docs/sqle/__init__.py +0 -0
- teradataml/data/docs/sqle/docs_17_10/Antiselect.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/Attribution.py +200 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeFit.py +172 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeTransform.py +131 -0
- teradataml/data/docs/sqle/docs_17_10/CategoricalSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_10/ChiSq.py +90 -0
- teradataml/data/docs/sqle/docs_17_10/ColumnSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_10/ConvertTo.py +96 -0
- teradataml/data/docs/sqle/docs_17_10/DecisionForestPredict.py +139 -0
- teradataml/data/docs/sqle/docs_17_10/DecisionTreePredict.py +152 -0
- teradataml/data/docs/sqle/docs_17_10/FTest.py +161 -0
- teradataml/data/docs/sqle/docs_17_10/FillRowId.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/Fit.py +88 -0
- teradataml/data/docs/sqle/docs_17_10/GLMPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_10/GetRowsWithMissingValues.py +85 -0
- teradataml/data/docs/sqle/docs_17_10/GetRowsWithoutMissingValues.py +82 -0
- teradataml/data/docs/sqle/docs_17_10/Histogram.py +165 -0
- teradataml/data/docs/sqle/docs_17_10/MovingAverage.py +134 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +209 -0
- teradataml/data/docs/sqle/docs_17_10/NPath.py +266 -0
- teradataml/data/docs/sqle/docs_17_10/NaiveBayesPredict.py +116 -0
- teradataml/data/docs/sqle/docs_17_10/NaiveBayesTextClassifierPredict.py +176 -0
- teradataml/data/docs/sqle/docs_17_10/NumApply.py +147 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +135 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +109 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterFit.py +166 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/Pack.py +128 -0
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesFit.py +112 -0
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesTransform.py +102 -0
- teradataml/data/docs/sqle/docs_17_10/QQNorm.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/RoundColumns.py +110 -0
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeFit.py +118 -0
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeTransform.py +99 -0
- teradataml/data/docs/sqle/docs_17_10/SVMSparsePredict.py +153 -0
- teradataml/data/docs/sqle/docs_17_10/ScaleFit.py +197 -0
- teradataml/data/docs/sqle/docs_17_10/ScaleTransform.py +99 -0
- teradataml/data/docs/sqle/docs_17_10/Sessionize.py +114 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeFit.py +116 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeTransform.py +98 -0
- teradataml/data/docs/sqle/docs_17_10/StrApply.py +187 -0
- teradataml/data/docs/sqle/docs_17_10/StringSimilarity.py +146 -0
- teradataml/data/docs/sqle/docs_17_10/Transform.py +105 -0
- teradataml/data/docs/sqle/docs_17_10/UnivariateStatistics.py +142 -0
- teradataml/data/docs/sqle/docs_17_10/Unpack.py +214 -0
- teradataml/data/docs/sqle/docs_17_10/WhichMax.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/WhichMin.py +83 -0
- teradataml/data/docs/sqle/docs_17_10/ZTest.py +155 -0
- teradataml/data/docs/sqle/docs_17_10/__init__.py +0 -0
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +186 -0
- teradataml/data/docs/sqle/docs_17_20/Antiselect.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/Attribution.py +201 -0
- teradataml/data/docs/sqle/docs_17_20/BincodeFit.py +172 -0
- teradataml/data/docs/sqle/docs_17_20/BincodeTransform.py +139 -0
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/CategoricalSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_20/ChiSq.py +90 -0
- teradataml/data/docs/sqle/docs_17_20/ClassificationEvaluator.py +166 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnSummary.py +86 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +246 -0
- teradataml/data/docs/sqle/docs_17_20/ConvertTo.py +113 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionForest.py +280 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionForestPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/DecisionTreePredict.py +136 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +240 -0
- teradataml/data/docs/sqle/docs_17_20/FillRowId.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/Fit.py +88 -0
- teradataml/data/docs/sqle/docs_17_20/GLM.py +541 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPerSegment.py +415 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/GLMPredictPerSegment.py +233 -0
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +125 -0
- teradataml/data/docs/sqle/docs_17_20/GetRowsWithMissingValues.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/GetRowsWithoutMissingValues.py +106 -0
- teradataml/data/docs/sqle/docs_17_20/Histogram.py +224 -0
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +251 -0
- teradataml/data/docs/sqle/docs_17_20/KMeansPredict.py +144 -0
- teradataml/data/docs/sqle/docs_17_20/KNN.py +215 -0
- teradataml/data/docs/sqle/docs_17_20/MovingAverage.py +134 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +209 -0
- teradataml/data/docs/sqle/docs_17_20/NPath.py +266 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesPredict.py +116 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierPredict.py +177 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierTrainer.py +127 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineTransform.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/NumApply.py +147 -0
- teradataml/data/docs/sqle/docs_17_20/OneClassSVM.py +307 -0
- teradataml/data/docs/sqle/docs_17_20/OneClassSVMPredict.py +185 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +231 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/OrdinalEncodingFit.py +220 -0
- teradataml/data/docs/sqle/docs_17_20/OrdinalEncodingTransform.py +127 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +191 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterTransform.py +117 -0
- teradataml/data/docs/sqle/docs_17_20/Pack.py +128 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesFit.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesTransform.py +112 -0
- teradataml/data/docs/sqle/docs_17_20/QQNorm.py +105 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +164 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionFit.py +155 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionMinComponents.py +106 -0
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionTransform.py +120 -0
- teradataml/data/docs/sqle/docs_17_20/RegressionEvaluator.py +211 -0
- teradataml/data/docs/sqle/docs_17_20/RoundColumns.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeFit.py +118 -0
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeTransform.py +111 -0
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/SVM.py +414 -0
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +213 -0
- teradataml/data/docs/sqle/docs_17_20/SVMSparsePredict.py +153 -0
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +315 -0
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +202 -0
- teradataml/data/docs/sqle/docs_17_20/SentimentExtractor.py +206 -0
- teradataml/data/docs/sqle/docs_17_20/Sessionize.py +114 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +225 -0
- teradataml/data/docs/sqle/docs_17_20/Silhouette.py +153 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeFit.py +116 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeTransform.py +109 -0
- teradataml/data/docs/sqle/docs_17_20/StrApply.py +187 -0
- teradataml/data/docs/sqle/docs_17_20/StringSimilarity.py +146 -0
- teradataml/data/docs/sqle/docs_17_20/TDDecisionForestPredict.py +207 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +333 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingFit.py +267 -0
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingTransform.py +141 -0
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +224 -0
- teradataml/data/docs/sqle/docs_17_20/TrainTestSplit.py +160 -0
- teradataml/data/docs/sqle/docs_17_20/Transform.py +123 -0
- teradataml/data/docs/sqle/docs_17_20/UnivariateStatistics.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpack.py +214 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/VectorDistance.py +169 -0
- teradataml/data/docs/sqle/docs_17_20/WhichMax.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/WhichMin.py +83 -0
- teradataml/data/docs/sqle/docs_17_20/WordEmbeddings.py +237 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +362 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +281 -0
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +220 -0
- teradataml/data/docs/sqle/docs_17_20/__init__.py +0 -0
- teradataml/data/docs/tableoperator/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_00/ReadNOS.py +430 -0
- teradataml/data/docs/tableoperator/docs_17_00/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_05/ReadNOS.py +430 -0
- teradataml/data/docs/tableoperator/docs_17_05/WriteNOS.py +348 -0
- teradataml/data/docs/tableoperator/docs_17_05/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_10/ReadNOS.py +429 -0
- teradataml/data/docs/tableoperator/docs_17_10/WriteNOS.py +348 -0
- teradataml/data/docs/tableoperator/docs_17_10/__init__.py +0 -0
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/tableoperator/docs_17_20/ReadNOS.py +440 -0
- teradataml/data/docs/tableoperator/docs_17_20/WriteNOS.py +387 -0
- teradataml/data/docs/tableoperator/docs_17_20/__init__.py +0 -0
- teradataml/data/docs/uaf/__init__.py +0 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +186 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +370 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +172 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +161 -0
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BinaryMatrixOp.py +248 -0
- teradataml/data/docs/uaf/docs_17_20/BinarySeriesOp.py +252 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +178 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +175 -0
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +230 -0
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +218 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +185 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +204 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +216 -0
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +216 -0
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +192 -0
- teradataml/data/docs/uaf/docs_17_20/DIFF.py +175 -0
- teradataml/data/docs/uaf/docs_17_20/DTW.py +180 -0
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +217 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +142 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +184 -0
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +185 -0
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/FitMetrics.py +172 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesFormula.py +206 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +143 -0
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +198 -0
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +260 -0
- teradataml/data/docs/uaf/docs_17_20/IDFFT.py +165 -0
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +191 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/InputValidator.py +121 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +156 -0
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +215 -0
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +174 -0
- teradataml/data/docs/uaf/docs_17_20/MInfo.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +191 -0
- teradataml/data/docs/uaf/docs_17_20/PACF.py +157 -0
- teradataml/data/docs/uaf/docs_17_20/Portman.py +217 -0
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +203 -0
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +155 -0
- teradataml/data/docs/uaf/docs_17_20/Resample.py +237 -0
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SInfo.py +123 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +173 -0
- teradataml/data/docs/uaf/docs_17_20/SelectionCriteria.py +174 -0
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/SignifResidmean.py +164 -0
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +180 -0
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +208 -0
- teradataml/data/docs/uaf/docs_17_20/TrackingOp.py +151 -0
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/Unnormalize.py +202 -0
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +171 -0
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/docs/uaf/docs_17_20/__init__.py +0 -0
- teradataml/data/dtw_example.json +18 -0
- teradataml/data/dtw_t1.csv +11 -0
- teradataml/data/dtw_t2.csv +4 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt2d_example.json +16 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_example.json +15 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/dwt_filter_dim.csv +5 -0
- teradataml/data/emission.csv +9 -0
- teradataml/data/emp_table_by_dept.csv +19 -0
- teradataml/data/employee_info.csv +4 -0
- teradataml/data/employee_table.csv +6 -0
- teradataml/data/excluding_event_table.csv +2 -0
- teradataml/data/finance_data.csv +6 -0
- teradataml/data/finance_data2.csv +61 -0
- teradataml/data/finance_data3.csv +93 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/fish.csv +160 -0
- teradataml/data/fm_blood2ageandweight.csv +26 -0
- teradataml/data/fmeasure_example.json +12 -0
- teradataml/data/followers_leaders.csv +10 -0
- teradataml/data/fpgrowth_example.json +12 -0
- teradataml/data/frequentpaths_example.json +29 -0
- teradataml/data/friends.csv +9 -0
- teradataml/data/fs_input.csv +33 -0
- teradataml/data/fs_input1.csv +33 -0
- teradataml/data/genData.csv +513 -0
- teradataml/data/geodataframe_example.json +40 -0
- teradataml/data/glass_types.csv +215 -0
- teradataml/data/glm_admissions_model.csv +12 -0
- teradataml/data/glm_example.json +56 -0
- teradataml/data/glml1l2_example.json +28 -0
- teradataml/data/glml1l2predict_example.json +54 -0
- teradataml/data/glmpredict_example.json +54 -0
- teradataml/data/gq_t1.csv +21 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/hconvolve_complex_right.csv +5 -0
- teradataml/data/hconvolve_complex_rightmulti.csv +5 -0
- teradataml/data/histogram_example.json +12 -0
- teradataml/data/hmmdecoder_example.json +79 -0
- teradataml/data/hmmevaluator_example.json +25 -0
- teradataml/data/hmmsupervised_example.json +10 -0
- teradataml/data/hmmunsupervised_example.json +8 -0
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/house_values.csv +12 -0
- teradataml/data/house_values2.csv +13 -0
- teradataml/data/housing_cat.csv +7 -0
- teradataml/data/housing_data.csv +9 -0
- teradataml/data/housing_test.csv +47 -0
- teradataml/data/housing_test_binary.csv +47 -0
- teradataml/data/housing_train.csv +493 -0
- teradataml/data/housing_train_attribute.csv +5 -0
- teradataml/data/housing_train_binary.csv +437 -0
- teradataml/data/housing_train_parameter.csv +2 -0
- teradataml/data/housing_train_response.csv +493 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/ibm_stock.csv +370 -0
- teradataml/data/ibm_stock1.csv +370 -0
- teradataml/data/identitymatch_example.json +22 -0
- teradataml/data/idf_table.csv +4 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/impressions.csv +101 -0
- teradataml/data/inflation.csv +21 -0
- teradataml/data/initial.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/insect_sprays.csv +13 -0
- teradataml/data/insurance.csv +1339 -0
- teradataml/data/interpolator_example.json +13 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/iris_altinput.csv +481 -0
- teradataml/data/iris_attribute_output.csv +8 -0
- teradataml/data/iris_attribute_test.csv +121 -0
- teradataml/data/iris_attribute_train.csv +481 -0
- teradataml/data/iris_category_expect_predict.csv +31 -0
- teradataml/data/iris_data.csv +151 -0
- teradataml/data/iris_input.csv +151 -0
- teradataml/data/iris_response_train.csv +121 -0
- teradataml/data/iris_test.csv +31 -0
- teradataml/data/iris_train.csv +121 -0
- teradataml/data/join_table1.csv +4 -0
- teradataml/data/join_table2.csv +4 -0
- teradataml/data/jsons/anly_function_name.json +7 -0
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/byom/dataikupredict.json +148 -0
- teradataml/data/jsons/byom/datarobotpredict.json +147 -0
- teradataml/data/jsons/byom/h2opredict.json +195 -0
- teradataml/data/jsons/byom/onnxembeddings.json +267 -0
- teradataml/data/jsons/byom/onnxpredict.json +187 -0
- teradataml/data/jsons/byom/pmmlpredict.json +147 -0
- teradataml/data/jsons/paired_functions.json +450 -0
- teradataml/data/jsons/sqle/16.20/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/16.20/Attribution.json +249 -0
- teradataml/data/jsons/sqle/16.20/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/16.20/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/16.20/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/16.20/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/16.20/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/16.20/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/16.20/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/16.20/Pack.json +98 -0
- teradataml/data/jsons/sqle/16.20/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/16.20/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/16.20/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/16.20/Unpack.json +166 -0
- teradataml/data/jsons/sqle/16.20/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.00/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.00/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.00/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/17.00/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/17.00/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/17.00/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.00/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.00/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/17.00/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/17.00/Pack.json +98 -0
- teradataml/data/jsons/sqle/17.00/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/17.00/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.00/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.00/Unpack.json +166 -0
- teradataml/data/jsons/sqle/17.00/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.05/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.05/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.05/DecisionForestPredict.json +156 -0
- teradataml/data/jsons/sqle/17.05/DecisionTreePredict.json +170 -0
- teradataml/data/jsons/sqle/17.05/GLMPredict.json +122 -0
- teradataml/data/jsons/sqle/17.05/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.05/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.05/NaiveBayesPredict.json +136 -0
- teradataml/data/jsons/sqle/17.05/NaiveBayesTextClassifierPredict.json +235 -0
- teradataml/data/jsons/sqle/17.05/Pack.json +98 -0
- teradataml/data/jsons/sqle/17.05/SVMSparsePredict.json +162 -0
- teradataml/data/jsons/sqle/17.05/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.05/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.05/Unpack.json +166 -0
- teradataml/data/jsons/sqle/17.05/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.10/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.10/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.10/DecisionForestPredict.json +185 -0
- teradataml/data/jsons/sqle/17.10/DecisionTreePredict.json +172 -0
- teradataml/data/jsons/sqle/17.10/GLMPredict.json +151 -0
- teradataml/data/jsons/sqle/17.10/MovingAverage.json +368 -0
- teradataml/data/jsons/sqle/17.10/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.10/NaiveBayesPredict.json +149 -0
- teradataml/data/jsons/sqle/17.10/NaiveBayesTextClassifierPredict.json +288 -0
- teradataml/data/jsons/sqle/17.10/Pack.json +133 -0
- teradataml/data/jsons/sqle/17.10/SVMSparsePredict.json +193 -0
- teradataml/data/jsons/sqle/17.10/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.10/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.10/TD_BinCodeFit.json +239 -0
- teradataml/data/jsons/sqle/17.10/TD_BinCodeTransform.json +70 -0
- teradataml/data/jsons/sqle/17.10/TD_CategoricalSummary.json +54 -0
- teradataml/data/jsons/sqle/17.10/TD_Chisq.json +68 -0
- teradataml/data/jsons/sqle/17.10/TD_ColumnSummary.json +54 -0
- teradataml/data/jsons/sqle/17.10/TD_ConvertTo.json +69 -0
- teradataml/data/jsons/sqle/17.10/TD_FTest.json +187 -0
- teradataml/data/jsons/sqle/17.10/TD_FillRowID.json +52 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionFit.json +46 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionTransform.json +72 -0
- teradataml/data/jsons/sqle/17.10/TD_GetRowsWithMissingValues.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_GetRowsWithoutMissingValues.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_Histogram.json +133 -0
- teradataml/data/jsons/sqle/17.10/TD_NumApply.json +147 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingFit.json +183 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingTransform.json +66 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterFit.json +197 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterTransform.json +48 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesFit.json +114 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesTransform.json +72 -0
- teradataml/data/jsons/sqle/17.10/TD_QQNorm.json +112 -0
- teradataml/data/jsons/sqle/17.10/TD_RoundColumns.json +93 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeFit.json +128 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleFit.json +157 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleTransform.json +71 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeFit.json +148 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeTransform.json +48 -0
- teradataml/data/jsons/sqle/17.10/TD_StrApply.json +240 -0
- teradataml/data/jsons/sqle/17.10/TD_UnivariateStatistics.json +119 -0
- teradataml/data/jsons/sqle/17.10/TD_WhichMax.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_WhichMin.json +53 -0
- teradataml/data/jsons/sqle/17.10/TD_ZTest.json +171 -0
- teradataml/data/jsons/sqle/17.10/Unpack.json +188 -0
- teradataml/data/jsons/sqle/17.10/nPath.json +269 -0
- teradataml/data/jsons/sqle/17.20/Antiselect.json +56 -0
- teradataml/data/jsons/sqle/17.20/Attribution.json +249 -0
- teradataml/data/jsons/sqle/17.20/DecisionForestPredict.json +185 -0
- teradataml/data/jsons/sqle/17.20/DecisionTreePredict.json +172 -0
- teradataml/data/jsons/sqle/17.20/GLMPredict.json +151 -0
- teradataml/data/jsons/sqle/17.20/MovingAverage.json +367 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +239 -0
- teradataml/data/jsons/sqle/17.20/NaiveBayesPredict.json +149 -0
- teradataml/data/jsons/sqle/17.20/NaiveBayesTextClassifierPredict.json +287 -0
- teradataml/data/jsons/sqle/17.20/Pack.json +133 -0
- teradataml/data/jsons/sqle/17.20/SVMSparsePredict.json +192 -0
- teradataml/data/jsons/sqle/17.20/Sessionize.json +105 -0
- teradataml/data/jsons/sqle/17.20/StringSimilarity.json +86 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +149 -0
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_BinCodeFit.json +239 -0
- teradataml/data/jsons/sqle/17.20/TD_BinCodeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_CategoricalSummary.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_Chisq.json +68 -0
- teradataml/data/jsons/sqle/17.20/TD_ClassificationEvaluator.json +146 -0
- teradataml/data/jsons/sqle/17.20/TD_ColumnSummary.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_ColumnTransformer.json +218 -0
- teradataml/data/jsons/sqle/17.20/TD_ConvertTo.json +92 -0
- teradataml/data/jsons/sqle/17.20/TD_DecisionForest.json +260 -0
- teradataml/data/jsons/sqle/17.20/TD_DecisionForestPredict.json +139 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +269 -0
- teradataml/data/jsons/sqle/17.20/TD_FillRowID.json +52 -0
- teradataml/data/jsons/sqle/17.20/TD_FunctionFit.json +46 -0
- teradataml/data/jsons/sqle/17.20/TD_FunctionTransform.json +72 -0
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +507 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +168 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPerSegment.json +411 -0
- teradataml/data/jsons/sqle/17.20/TD_GLMPredictPerSegment.json +146 -0
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +93 -0
- teradataml/data/jsons/sqle/17.20/TD_GetRowsWithMissingValues.json +76 -0
- teradataml/data/jsons/sqle/17.20/TD_GetRowsWithoutMissingValues.json +76 -0
- teradataml/data/jsons/sqle/17.20/TD_Histogram.json +152 -0
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +232 -0
- teradataml/data/jsons/sqle/17.20/TD_KMeansPredict.json +87 -0
- teradataml/data/jsons/sqle/17.20/TD_KNN.json +262 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesTextClassifierTrainer.json +137 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +102 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_NumApply.json +147 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +316 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVMPredict.json +124 -0
- teradataml/data/jsons/sqle/17.20/TD_OneHotEncodingFit.json +271 -0
- teradataml/data/jsons/sqle/17.20/TD_OneHotEncodingTransform.json +65 -0
- teradataml/data/jsons/sqle/17.20/TD_OrdinalEncodingFit.json +229 -0
- teradataml/data/jsons/sqle/17.20/TD_OrdinalEncodingTransform.json +75 -0
- teradataml/data/jsons/sqle/17.20/TD_OutlierFilterFit.json +217 -0
- teradataml/data/jsons/sqle/17.20/TD_OutlierFilterTransform.json +48 -0
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_PolynomialFeaturesFit.json +114 -0
- teradataml/data/jsons/sqle/17.20/TD_PolynomialFeaturesTransform.json +72 -0
- teradataml/data/jsons/sqle/17.20/TD_QQNorm.json +111 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +179 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionFit.json +179 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionMinComponents.json +74 -0
- teradataml/data/jsons/sqle/17.20/TD_RandomProjectionTransform.json +74 -0
- teradataml/data/jsons/sqle/17.20/TD_RegressionEvaluator.json +138 -0
- teradataml/data/jsons/sqle/17.20/TD_RoundColumns.json +93 -0
- teradataml/data/jsons/sqle/17.20/TD_RowNormalizeFit.json +128 -0
- teradataml/data/jsons/sqle/17.20/TD_RowNormalizeTransform.json +71 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +389 -0
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +142 -0
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +310 -0
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +120 -0
- teradataml/data/jsons/sqle/17.20/TD_SentimentExtractor.json +194 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +221 -0
- teradataml/data/jsons/sqle/17.20/TD_Silhouette.json +143 -0
- teradataml/data/jsons/sqle/17.20/TD_SimpleImputeFit.json +147 -0
- teradataml/data/jsons/sqle/17.20/TD_SimpleImputeTransform.json +48 -0
- teradataml/data/jsons/sqle/17.20/TD_StrApply.json +240 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_TargetEncodingFit.json +248 -0
- teradataml/data/jsons/sqle/17.20/TD_TargetEncodingTransform.json +75 -0
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +297 -0
- teradataml/data/jsons/sqle/17.20/TD_TrainTestSplit.json +142 -0
- teradataml/data/jsons/sqle/17.20/TD_UnivariateStatistics.json +117 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_VectorDistance.json +183 -0
- teradataml/data/jsons/sqle/17.20/TD_WhichMax.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_WhichMin.json +53 -0
- teradataml/data/jsons/sqle/17.20/TD_WordEmbeddings.json +241 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +330 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +195 -0
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +247 -0
- teradataml/data/jsons/sqle/17.20/Unpack.json +188 -0
- teradataml/data/jsons/sqle/17.20/nPath.json +269 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +370 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +460 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +385 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +369 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +400 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +401 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +384 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +384 -0
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/tableoperator/17.00/read_nos.json +198 -0
- teradataml/data/jsons/tableoperator/17.05/read_nos.json +198 -0
- teradataml/data/jsons/tableoperator/17.05/write_nos.json +195 -0
- teradataml/data/jsons/tableoperator/17.10/read_nos.json +184 -0
- teradataml/data/jsons/tableoperator/17.10/write_nos.json +195 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/tableoperator/17.20/read_nos.json +183 -0
- teradataml/data/jsons/tableoperator/17.20/write_nos.json +224 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +132 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +396 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +77 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +153 -0
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +107 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +106 -0
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +89 -0
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +104 -0
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +66 -0
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +87 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +134 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +144 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +108 -0
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +108 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_DIFF.json +92 -0
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +114 -0
- teradataml/data/jsons/uaf/17.20/TD_DURBIN_WATSON.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_EXTRACT_RESULTS.json +39 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_GENSERIES4FORMULA.json +85 -0
- teradataml/data/jsons/uaf/17.20/TD_GENSERIES4SINUSOIDS.json +71 -0
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +139 -0
- teradataml/data/jsons/uaf/17.20/TD_HOLT_WINTERS_FORECASTER.json +313 -0
- teradataml/data/jsons/uaf/17.20/TD_IDFFT.json +58 -0
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +81 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_INPUTVALIDATOR.json +64 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +182 -0
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +103 -0
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +181 -0
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_MATRIXMULTIPLY.json +68 -0
- teradataml/data/jsons/uaf/17.20/TD_MINFO.json +67 -0
- teradataml/data/jsons/uaf/17.20/TD_MULTIVAR_REGR.json +179 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +114 -0
- teradataml/data/jsons/uaf/17.20/TD_PORTMAN.json +119 -0
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +175 -0
- teradataml/data/jsons/uaf/17.20/TD_POWERTRANSFORM.json +98 -0
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +194 -0
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +143 -0
- teradataml/data/jsons/uaf/17.20/TD_SELECTION_CRITERIA.json +90 -0
- teradataml/data/jsons/uaf/17.20/TD_SIGNIF_PERIODICITIES.json +80 -0
- teradataml/data/jsons/uaf/17.20/TD_SIGNIF_RESIDMEAN.json +68 -0
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +184 -0
- teradataml/data/jsons/uaf/17.20/TD_SINFO.json +58 -0
- teradataml/data/jsons/uaf/17.20/TD_SMOOTHMA.json +163 -0
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +101 -0
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +112 -0
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +95 -0
- teradataml/data/jsons/uaf/17.20/TD_WHITES_GENERAL.json +78 -0
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
- teradataml/data/kmeans_example.json +23 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/kmeans_us_arrests_data.csv +51 -0
- teradataml/data/knn_example.json +19 -0
- teradataml/data/knnrecommender_example.json +7 -0
- teradataml/data/knnrecommenderpredict_example.json +12 -0
- teradataml/data/lar_example.json +17 -0
- teradataml/data/larpredict_example.json +30 -0
- teradataml/data/lc_new_predictors.csv +5 -0
- teradataml/data/lc_new_reference.csv +9 -0
- teradataml/data/lda_example.json +9 -0
- teradataml/data/ldainference_example.json +15 -0
- teradataml/data/ldatopicsummary_example.json +9 -0
- teradataml/data/levendist_input.csv +13 -0
- teradataml/data/levenshteindistance_example.json +10 -0
- teradataml/data/linreg_example.json +10 -0
- teradataml/data/load_example_data.py +350 -0
- teradataml/data/loan_prediction.csv +295 -0
- teradataml/data/lungcancer.csv +138 -0
- teradataml/data/mappingdata.csv +12 -0
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/milk_timeseries.csv +157 -0
- teradataml/data/min_max_titanic.csv +4 -0
- teradataml/data/minhash_example.json +6 -0
- teradataml/data/ml_ratings.csv +7547 -0
- teradataml/data/ml_ratings_10.csv +2445 -0
- teradataml/data/mobile_data.csv +13 -0
- teradataml/data/model1_table.csv +5 -0
- teradataml/data/model2_table.csv +5 -0
- teradataml/data/models/License_file.txt +1 -0
- teradataml/data/models/License_file_empty.txt +0 -0
- teradataml/data/models/dataiku_iris_data_ann_thin +0 -0
- teradataml/data/models/dr_iris_rf +0 -0
- teradataml/data/models/iris_db_dt_model_sklearn.onnx +0 -0
- teradataml/data/models/iris_db_dt_model_sklearn_floattensor.onnx +0 -0
- teradataml/data/models/iris_db_glm_model.pmml +57 -0
- teradataml/data/models/iris_db_xgb_model.pmml +4471 -0
- teradataml/data/models/iris_kmeans_model +0 -0
- teradataml/data/models/iris_mojo_glm_h2o_model +0 -0
- teradataml/data/models/iris_mojo_xgb_h2o_model +0 -0
- teradataml/data/modularity_example.json +12 -0
- teradataml/data/movavg_example.json +8 -0
- teradataml/data/mtx1.csv +7 -0
- teradataml/data/mtx2.csv +13 -0
- teradataml/data/multi_model_classification.csv +401 -0
- teradataml/data/multi_model_regression.csv +401 -0
- teradataml/data/mvdfft8.csv +9 -0
- teradataml/data/naivebayes_example.json +10 -0
- teradataml/data/naivebayespredict_example.json +19 -0
- teradataml/data/naivebayestextclassifier2_example.json +7 -0
- teradataml/data/naivebayestextclassifier_example.json +8 -0
- teradataml/data/naivebayestextclassifierpredict_example.json +32 -0
- teradataml/data/name_Find_configure.csv +10 -0
- teradataml/data/namedentityfinder_example.json +14 -0
- teradataml/data/namedentityfinderevaluator_example.json +10 -0
- teradataml/data/namedentityfindertrainer_example.json +6 -0
- teradataml/data/nb_iris_input_test.csv +31 -0
- teradataml/data/nb_iris_input_train.csv +121 -0
- teradataml/data/nbp_iris_model.csv +13 -0
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_extractor_text.csv +2 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/ner_sports_test2.csv +29 -0
- teradataml/data/ner_sports_train.csv +501 -0
- teradataml/data/nerevaluator_example.json +6 -0
- teradataml/data/nerextractor_example.json +18 -0
- teradataml/data/nermem_sports_test.csv +18 -0
- teradataml/data/nermem_sports_train.csv +51 -0
- teradataml/data/nertrainer_example.json +7 -0
- teradataml/data/ngrams_example.json +7 -0
- teradataml/data/notebooks/__init__.py +0 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Aggregate Functions using SQLAlchemy.ipynb +1455 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Arithmetic Functions Using SQLAlchemy.ipynb +1993 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Bit-Byte Manipulation Functions using SQLAlchemy.ipynb +1492 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Built-in functions using SQLAlchemy.ipynb +536 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Regular Expressions Using SQLAlchemy.ipynb +570 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage String Functions Using SQLAlchemy.ipynb +2559 -0
- teradataml/data/notebooks/sqlalchemy/Teradata Vantage Window Aggregate Functions using SQLAlchemy.ipynb +2911 -0
- teradataml/data/notebooks/sqlalchemy/Using Generic SQLAlchemy ClauseElements teradataml DataFrame assign method.ipynb +698 -0
- teradataml/data/notebooks/sqlalchemy/__init__.py +0 -0
- teradataml/data/notebooks/sqlalchemy/teradataml filtering using SQLAlchemy ClauseElements.ipynb +784 -0
- teradataml/data/npath_example.json +23 -0
- teradataml/data/ntree_example.json +14 -0
- teradataml/data/numeric_strings.csv +5 -0
- teradataml/data/numerics.csv +4 -0
- teradataml/data/ocean_buoy.csv +17 -0
- teradataml/data/ocean_buoy2.csv +17 -0
- teradataml/data/ocean_buoys.csv +28 -0
- teradataml/data/ocean_buoys2.csv +10 -0
- teradataml/data/ocean_buoys_nonpti.csv +28 -0
- teradataml/data/ocean_buoys_seq.csv +29 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +92 -0
- teradataml/data/optional_event_table.csv +4 -0
- teradataml/data/orders1.csv +11 -0
- teradataml/data/orders1_12.csv +13 -0
- teradataml/data/orders_ex.csv +4 -0
- teradataml/data/pack_example.json +9 -0
- teradataml/data/package_tracking.csv +19 -0
- teradataml/data/package_tracking_pti.csv +19 -0
- teradataml/data/pagerank_example.json +13 -0
- teradataml/data/paragraphs_input.csv +6 -0
- teradataml/data/pathanalyzer_example.json +8 -0
- teradataml/data/pathgenerator_example.json +8 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/pattern_matching_data.csv +11 -0
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/phrases.csv +7 -0
- teradataml/data/pivot_example.json +9 -0
- teradataml/data/pivot_input.csv +22 -0
- teradataml/data/playerRating.csv +31 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/postagger_example.json +7 -0
- teradataml/data/posttagger_output.csv +44 -0
- teradataml/data/production_data.csv +17 -0
- teradataml/data/production_data2.csv +7 -0
- teradataml/data/randomsample_example.json +32 -0
- teradataml/data/randomwalksample_example.json +9 -0
- teradataml/data/rank_table.csv +6 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/ref_mobile_data.csv +4 -0
- teradataml/data/ref_mobile_data_dense.csv +2 -0
- teradataml/data/ref_url.csv +17 -0
- teradataml/data/restaurant_reviews.csv +7 -0
- teradataml/data/retail_churn_table.csv +27772 -0
- teradataml/data/river_data.csv +145 -0
- teradataml/data/roc_example.json +8 -0
- teradataml/data/roc_input.csv +101 -0
- teradataml/data/rule_inputs.csv +6 -0
- teradataml/data/rule_table.csv +2 -0
- teradataml/data/sales.csv +7 -0
- teradataml/data/sales_transaction.csv +501 -0
- teradataml/data/salesdata.csv +342 -0
- teradataml/data/sample_cities.csv +3 -0
- teradataml/data/sample_shapes.csv +11 -0
- teradataml/data/sample_streets.csv +3 -0
- teradataml/data/sampling_example.json +16 -0
- teradataml/data/sax_example.json +17 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +74 -0
- teradataml/data/scale_housing.csv +11 -0
- teradataml/data/scale_housing_test.csv +6 -0
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scale_stat.csv +11 -0
- teradataml/data/scalebypartition_example.json +13 -0
- teradataml/data/scalemap_example.json +13 -0
- teradataml/data/scalesummary_example.json +12 -0
- teradataml/data/score_category.csv +101 -0
- teradataml/data/score_summary.csv +4 -0
- teradataml/data/script_example.json +10 -0
- teradataml/data/scripts/deploy_script.py +84 -0
- teradataml/data/scripts/lightgbm/dataset.template +175 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +264 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +234 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +177 -0
- teradataml/data/scripts/mapper.R +20 -0
- teradataml/data/scripts/mapper.py +16 -0
- teradataml/data/scripts/mapper_replace.py +16 -0
- teradataml/data/scripts/sklearn/__init__.py +0 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +205 -0
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +148 -0
- teradataml/data/scripts/sklearn/sklearn_function.template +144 -0
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +166 -0
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +161 -0
- teradataml/data/scripts/sklearn/sklearn_score.py +145 -0
- teradataml/data/scripts/sklearn/sklearn_transform.py +327 -0
- teradataml/data/sdk/modelops/modelops_spec.json +101737 -0
- teradataml/data/seeds.csv +10 -0
- teradataml/data/sentenceextractor_example.json +7 -0
- teradataml/data/sentiment_extract_input.csv +11 -0
- teradataml/data/sentiment_train.csv +16 -0
- teradataml/data/sentiment_word.csv +20 -0
- teradataml/data/sentiment_word_input.csv +20 -0
- teradataml/data/sentimentextractor_example.json +24 -0
- teradataml/data/sentimenttrainer_example.json +8 -0
- teradataml/data/sequence_table.csv +10 -0
- teradataml/data/seriessplitter_example.json +8 -0
- teradataml/data/sessionize_example.json +17 -0
- teradataml/data/sessionize_table.csv +116 -0
- teradataml/data/setop_test1.csv +24 -0
- teradataml/data/setop_test2.csv +22 -0
- teradataml/data/soc_nw_edges.csv +11 -0
- teradataml/data/soc_nw_vertices.csv +8 -0
- teradataml/data/souvenir_timeseries.csv +168 -0
- teradataml/data/sparse_iris_attribute.csv +5 -0
- teradataml/data/sparse_iris_test.csv +121 -0
- teradataml/data/sparse_iris_train.csv +601 -0
- teradataml/data/star1.csv +6 -0
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/state_transition.csv +5 -0
- teradataml/data/stock_data.csv +53 -0
- teradataml/data/stock_movement.csv +11 -0
- teradataml/data/stock_vol.csv +76 -0
- teradataml/data/stop_words.csv +8 -0
- teradataml/data/store_sales.csv +37 -0
- teradataml/data/stringsimilarity_example.json +8 -0
- teradataml/data/strsimilarity_input.csv +13 -0
- teradataml/data/students.csv +101 -0
- teradataml/data/svm_iris_input_test.csv +121 -0
- teradataml/data/svm_iris_input_train.csv +481 -0
- teradataml/data/svm_iris_model.csv +7 -0
- teradataml/data/svmdense_example.json +10 -0
- teradataml/data/svmdensepredict_example.json +19 -0
- teradataml/data/svmsparse_example.json +8 -0
- teradataml/data/svmsparsepredict_example.json +14 -0
- teradataml/data/svmsparsesummary_example.json +8 -0
- teradataml/data/target_mobile_data.csv +13 -0
- teradataml/data/target_mobile_data_dense.csv +5 -0
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/templatedata.csv +1201 -0
- teradataml/data/templates/open_source_ml.json +11 -0
- teradataml/data/teradata_icon.ico +0 -0
- teradataml/data/teradataml_example.json +1473 -0
- teradataml/data/test_classification.csv +101 -0
- teradataml/data/test_loan_prediction.csv +53 -0
- teradataml/data/test_pacf_12.csv +37 -0
- teradataml/data/test_prediction.csv +101 -0
- teradataml/data/test_regression.csv +101 -0
- teradataml/data/test_river2.csv +109 -0
- teradataml/data/text_inputs.csv +6 -0
- teradataml/data/textchunker_example.json +8 -0
- teradataml/data/textclassifier_example.json +7 -0
- teradataml/data/textclassifier_input.csv +7 -0
- teradataml/data/textclassifiertrainer_example.json +7 -0
- teradataml/data/textmorph_example.json +11 -0
- teradataml/data/textparser_example.json +15 -0
- teradataml/data/texttagger_example.json +12 -0
- teradataml/data/texttokenizer_example.json +7 -0
- teradataml/data/texttrainer_input.csv +11 -0
- teradataml/data/tf_example.json +7 -0
- teradataml/data/tfidf_example.json +14 -0
- teradataml/data/tfidf_input1.csv +201 -0
- teradataml/data/tfidf_train.csv +6 -0
- teradataml/data/time_table1.csv +535 -0
- teradataml/data/time_table2.csv +14 -0
- teradataml/data/timeseriesdata.csv +1601 -0
- teradataml/data/timeseriesdatasetsd4.csv +105 -0
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic.csv +892 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/token_table.csv +696 -0
- teradataml/data/train_multiclass.csv +101 -0
- teradataml/data/train_regression.csv +101 -0
- teradataml/data/train_regression_multiple_labels.csv +101 -0
- teradataml/data/train_tracking.csv +28 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/transformation_table.csv +6 -0
- teradataml/data/transformation_table_new.csv +2 -0
- teradataml/data/tv_spots.csv +16 -0
- teradataml/data/twod_climate_data.csv +117 -0
- teradataml/data/uaf_example.json +529 -0
- teradataml/data/univariatestatistics_example.json +9 -0
- teradataml/data/unpack_example.json +10 -0
- teradataml/data/unpivot_example.json +25 -0
- teradataml/data/unpivot_input.csv +8 -0
- teradataml/data/url_data.csv +10 -0
- teradataml/data/us_air_pass.csv +37 -0
- teradataml/data/us_population.csv +624 -0
- teradataml/data/us_states_shapes.csv +52 -0
- teradataml/data/varmax_example.json +18 -0
- teradataml/data/vectordistance_example.json +30 -0
- teradataml/data/ville_climatedata.csv +121 -0
- teradataml/data/ville_tempdata.csv +12 -0
- teradataml/data/ville_tempdata1.csv +12 -0
- teradataml/data/ville_temperature.csv +11 -0
- teradataml/data/waveletTable.csv +1605 -0
- teradataml/data/waveletTable2.csv +1605 -0
- teradataml/data/weightedmovavg_example.json +9 -0
- teradataml/data/wft_testing.csv +5 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/wine_data.csv +1600 -0
- teradataml/data/word_embed_input_table1.csv +6 -0
- teradataml/data/word_embed_input_table2.csv +5 -0
- teradataml/data/word_embed_model.csv +23 -0
- teradataml/data/words_input.csv +13 -0
- teradataml/data/xconvolve_complex_left.csv +6 -0
- teradataml/data/xconvolve_complex_leftmulti.csv +6 -0
- teradataml/data/xgboost_example.json +36 -0
- teradataml/data/xgboostpredict_example.json +32 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/__init__.py +0 -0
- teradataml/dataframe/copy_to.py +2446 -0
- teradataml/dataframe/data_transfer.py +2840 -0
- teradataml/dataframe/dataframe.py +20908 -0
- teradataml/dataframe/dataframe_utils.py +2114 -0
- teradataml/dataframe/fastload.py +794 -0
- teradataml/dataframe/functions.py +2110 -0
- teradataml/dataframe/indexer.py +424 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +1171 -0
- teradataml/dataframe/sql.py +10904 -0
- teradataml/dataframe/sql_function_parameters.py +440 -0
- teradataml/dataframe/sql_functions.py +652 -0
- teradataml/dataframe/sql_interfaces.py +220 -0
- teradataml/dataframe/vantage_function_types.py +675 -0
- teradataml/dataframe/window.py +694 -0
- teradataml/dbutils/__init__.py +3 -0
- teradataml/dbutils/dbutils.py +2871 -0
- teradataml/dbutils/filemgr.py +318 -0
- teradataml/gen_ai/__init__.py +2 -0
- teradataml/gen_ai/convAI.py +473 -0
- teradataml/geospatial/__init__.py +4 -0
- teradataml/geospatial/geodataframe.py +1105 -0
- teradataml/geospatial/geodataframecolumn.py +392 -0
- teradataml/geospatial/geometry_types.py +926 -0
- teradataml/hyperparameter_tuner/__init__.py +1 -0
- teradataml/hyperparameter_tuner/optimizer.py +4115 -0
- teradataml/hyperparameter_tuner/utils.py +303 -0
- teradataml/lib/__init__.py +0 -0
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/lib/libaed_0_1_ppc64le.so +0 -0
- teradataml/opensource/__init__.py +1 -0
- teradataml/opensource/_base.py +1321 -0
- teradataml/opensource/_class.py +464 -0
- teradataml/opensource/_constants.py +61 -0
- teradataml/opensource/_lightgbm.py +949 -0
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +267 -0
- teradataml/options/__init__.py +148 -0
- teradataml/options/configure.py +489 -0
- teradataml/options/display.py +187 -0
- teradataml/plot/__init__.py +3 -0
- teradataml/plot/axis.py +1427 -0
- teradataml/plot/constants.py +15 -0
- teradataml/plot/figure.py +431 -0
- teradataml/plot/plot.py +810 -0
- teradataml/plot/query_generator.py +83 -0
- teradataml/plot/subplot.py +216 -0
- teradataml/scriptmgmt/UserEnv.py +4273 -0
- teradataml/scriptmgmt/__init__.py +3 -0
- teradataml/scriptmgmt/lls_utils.py +2157 -0
- teradataml/sdk/README.md +79 -0
- teradataml/sdk/__init__.py +4 -0
- teradataml/sdk/_auth_modes.py +422 -0
- teradataml/sdk/_func_params.py +487 -0
- teradataml/sdk/_json_parser.py +453 -0
- teradataml/sdk/_openapi_spec_constants.py +249 -0
- teradataml/sdk/_utils.py +236 -0
- teradataml/sdk/api_client.py +900 -0
- teradataml/sdk/constants.py +62 -0
- teradataml/sdk/modelops/__init__.py +98 -0
- teradataml/sdk/modelops/_client.py +409 -0
- teradataml/sdk/modelops/_constants.py +304 -0
- teradataml/sdk/modelops/models.py +2308 -0
- teradataml/sdk/spinner.py +107 -0
- teradataml/series/__init__.py +0 -0
- teradataml/series/series.py +537 -0
- teradataml/series/series_utils.py +71 -0
- teradataml/store/__init__.py +12 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +658 -0
- teradataml/store/feature_store/feature_store.py +4814 -0
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +7330 -0
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/Apply.py +979 -0
- teradataml/table_operators/Script.py +1739 -0
- teradataml/table_operators/TableOperator.py +1343 -0
- teradataml/table_operators/__init__.py +2 -0
- teradataml/table_operators/apply_query_generator.py +262 -0
- teradataml/table_operators/query_generator.py +493 -0
- teradataml/table_operators/table_operator_query_generator.py +462 -0
- teradataml/table_operators/table_operator_util.py +726 -0
- teradataml/table_operators/templates/dataframe_apply.template +184 -0
- teradataml/table_operators/templates/dataframe_map.template +176 -0
- teradataml/table_operators/templates/dataframe_register.template +73 -0
- teradataml/table_operators/templates/dataframe_udf.template +67 -0
- teradataml/table_operators/templates/script_executor.template +170 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +53 -0
- teradataml/utils/__init__.py +0 -0
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +943 -0
- teradataml/utils/internal_buffer.py +122 -0
- teradataml/utils/print_versions.py +206 -0
- teradataml/utils/utils.py +451 -0
- teradataml/utils/validators.py +3305 -0
- teradataml-20.0.0.8.dist-info/METADATA +2804 -0
- teradataml-20.0.0.8.dist-info/RECORD +1208 -0
- teradataml-20.0.0.8.dist-info/WHEEL +5 -0
- teradataml-20.0.0.8.dist-info/top_level.txt +1 -0
- teradataml-20.0.0.8.dist-info/zip-safe +1 -0
|
@@ -0,0 +1,1873 @@
|
|
|
1
|
+
# ##################################################################
|
|
2
|
+
#
|
|
3
|
+
# Copyright 2025 Teradata. All rights reserved.
|
|
4
|
+
# TERADATA CONFIDENTIAL AND TRADE SECRET
|
|
5
|
+
#
|
|
6
|
+
# Primary Owner: Sweta Shaw
|
|
7
|
+
# Email Id: Sweta.Shaw@Teradata.com
|
|
8
|
+
#
|
|
9
|
+
# Secondary Owner: Akhil Bisht
|
|
10
|
+
# Email Id: AKHIL.BISHT@Teradata.com
|
|
11
|
+
#
|
|
12
|
+
# Version: 1.1
|
|
13
|
+
# Function Version: 1.0
|
|
14
|
+
# ##################################################################
|
|
15
|
+
|
|
16
|
+
# Python Libraries
|
|
17
|
+
import pandas as pd
|
|
18
|
+
import matplotlib.pyplot as plt
|
|
19
|
+
import seaborn as sns
|
|
20
|
+
import numpy as np
|
|
21
|
+
import math
|
|
22
|
+
|
|
23
|
+
# Teradata libraries
|
|
24
|
+
from teradataml.dataframe.dataframe import DataFrame
|
|
25
|
+
from teradataml.dataframe.copy_to import copy_to_sql
|
|
26
|
+
from teradataml import ColumnSummary, CategoricalSummary, GetFutileColumns
|
|
27
|
+
from teradataml import OutlierFilterFit, OutlierFilterTransform
|
|
28
|
+
from teradataml import OrdinalEncodingFit, OrdinalEncodingTransform
|
|
29
|
+
from teradataml.hyperparameter_tuner.utils import _ProgressBar
|
|
30
|
+
from teradataml.common.messages import Messages, MessageCodes
|
|
31
|
+
from teradataml import display as dp
|
|
32
|
+
from teradataml.utils.validators import _Validators
|
|
33
|
+
from teradataml.common.utils import UtilFuncs
|
|
34
|
+
from teradataml.common.garbagecollector import GarbageCollector
|
|
35
|
+
from teradataml.common.logger import TeradataMlLogger, get_td_logger
|
|
36
|
+
|
|
37
|
+
def _is_terminal():
|
|
38
|
+
"""
|
|
39
|
+
DESCRIPTION:
|
|
40
|
+
Common Function detects whether code is running in
|
|
41
|
+
terminal/console or IPython supported environment.
|
|
42
|
+
|
|
43
|
+
PARAMETERS:
|
|
44
|
+
None
|
|
45
|
+
|
|
46
|
+
RETURNS:
|
|
47
|
+
bool
|
|
48
|
+
|
|
49
|
+
RAISES:
|
|
50
|
+
None
|
|
51
|
+
|
|
52
|
+
EXAMPLES:
|
|
53
|
+
>>> is_terminal = _is_terminal()
|
|
54
|
+
"""
|
|
55
|
+
if not hasattr(_is_terminal, 'ipython_imported'):
|
|
56
|
+
try:
|
|
57
|
+
# Check IPython environment
|
|
58
|
+
__IPYTHON__
|
|
59
|
+
# Check if IPython library is installed
|
|
60
|
+
from IPython.display import display, HTML
|
|
61
|
+
_is_terminal.ipython_imported = True
|
|
62
|
+
except (NameError, ImportError):
|
|
63
|
+
# If error, then terminal
|
|
64
|
+
_is_terminal.ipython_imported = False
|
|
65
|
+
|
|
66
|
+
return not _is_terminal.ipython_imported
|
|
67
|
+
|
|
68
|
+
# Conditional import: the IPython display helpers are only imported when not running in a terminal.
|
|
69
|
+
if not _is_terminal():
|
|
70
|
+
from IPython.display import display, HTML
|
|
71
|
+
|
|
72
|
+
@TeradataMlLogger
|
|
73
|
+
class _FeatureExplore:
|
|
74
|
+
|
|
75
|
+
def __init__(self,
             data=None,
             target_column=None,
             custom_data=None,
             verbose=0,
             task_type='regression',
             fraud=False,
             churn=False,
             cluster=False,
             **kwargs):
    """
    DESCRIPTION:
        Internal function stores the data, target column and run options
        used by feature exploration.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input teradataml DataFrame for feature exploration.
            Types: teradataml DataFrame

        target_column:
            Required Argument.
            Set to None for Clustering.
            Specifies the name of the target column in "data".
            Types: str

        custom_data:
            Optional Argument.
            Specifies json object containing user customized input.
            Types: json object

        verbose:
            Optional Argument.
            Specifies the detailed execution steps based on verbose level.
            Default Value: 0
            Permitted Values:
                * 0: prints the progress bar and leaderboard
                * 1: prints the execution steps of AutoML.
                * 2: prints the intermediate data between the execution of each step of AutoML.
            Types: int

        task_type:
            Optional Argument.
            Specifies the task type of the data.
            Default Value: 'regression'
            Permitted Values:
                * 'regression'
                * 'classification'
            Types: str

        fraud:
            Optional Argument.
            Specifies whether to apply fraud detection techniques.
            Default Value: False
            Types: bool

        churn:
            Optional Argument.
            Specifies whether to apply churn prediction techniques.
            Default Value: False
            Types: bool

        cluster:
            Optional Argument.
            Specifies whether to apply clustering techniques.
            Default Value: False
            Types: bool

        **kwargs:
            Specifies the additional arguments for feature exploration.
            They are accepted but not read by this initializer.
            Types: dict

    RETURNS:
        None

    RAISES:
        AttributeError, when "data" is None, because the column metadata
        of "data" is read during initialization.

    EXAMPLES:
        >>> explorer = _FeatureExplore(data=df, target_column="target", verbose=1)
    """
    # Caller supplied inputs.
    self.data = data
    self.target_column = target_column
    self.verbose = verbose
    self.custom_data = custom_data

    # Container for transformation artefacts recorded by later steps.
    self.data_transform_dict = {}

    # Column name -> type lookup built from the DataFrame metadata pairs.
    self.data_types = dict(self.data._column_names_and_types)

    # Output environment and the display style derived for it.
    self.terminal_print = _is_terminal()
    self.style = self._common_style()

    # Task configuration and use-case switches.
    self.task_type = task_type
    self.fraud = fraud
    self.churn = churn
    self.cluster = cluster
|
|
169
|
+
|
|
170
|
+
def _exploration(self,
                 **kwargs):
    """
    DESCRIPTION:
        Internal function performs following operations:
            1. Column summary of columns of the dataset
            2. Statistics of numeric columns of the dataset
            3. Categorical column summary
            4. Futile columns in the dataset
            5. Target column distribution, not applicable for Clustering task_type
            6. Outlier Percentage in numeric columns of the dataset
            7. Heatmap of Numerical Features
            8. Boxplots of Feature Distribution
            9. Countplot of Categorical features
            10. Scatterplot for selected features for Clustering task_type
        Steps 7 to 10 run only when fraud, churn or cluster is set, the
        visualization libraries check passes and the code is not running
        in a terminal.

    PARAMETERS:
        **kwargs:
            Specifies the additional arguments for exploration.
            Only the key "automl_phases" is read; it is forwarded to the
            heading display.
            Types: dict

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._exploration()
    """
    numerical_columns = []
    categorical_columns = []
    date_column_list = []

    # Routes each column type name to the list that collects it; types
    # that are not listed here are left unclassified.
    type_buckets = {
        'int': numerical_columns,
        'float': numerical_columns,
        'str': categorical_columns,
        'datetime.date': date_column_list,
        'datetime.datetime': date_column_list,
    }

    self._display_heading(phase=0,
                          automl_phases=kwargs.get('automl_phases', None))
    self._display_msg(msg="Feature Exploration started")

    # Detecting numerical, categorical and date columns
    for col, d_type in self.data._column_names_and_types:
        bucket = type_buckets.get(d_type)
        if bucket is not None:
            bucket.append(col)

    # Display initial count of data
    self._display_msg(msg='Data Overview:', show_data=True)
    self._logger.info(f"Total Rows in the data: {self.data.shape[0]}")
    self._logger.info(f"Total Columns in the data: {self.data.shape[1]}")

    # Displaying date columns
    if date_column_list:
        self._display_msg(msg='Identified Date Columns:',
                          data=date_column_list)

    # Null count, datatype and non null count of every feature
    self._column_summary()

    # Statistics such as mean/median/mode
    self._statistics()

    # Categorical summary feeds the futile column detection
    if categorical_columns:
        self._futile_column(self._categorical_summary(categorical_columns))

    # The target distribution plot has no meaning without a target
    if not self.cluster:
        self._target_column_details()

    # Outlier percentage: percentile bounds for fraud/churn, Tukey otherwise
    if self.fraud or self.churn:
        outlier_method = "percentile"
        outlier_options = {'lower_percentile': 0.01, 'upper_percentile': 0.99}
    else:
        outlier_method = "Tukey"
        outlier_options = {}
    # The returned object is not used further in this method.
    df = self._outlier_detection(outlier_method, numerical_columns, **outlier_options)

    # Plots are limited to the fraud/churn/cluster use cases and need both
    # the plotting libraries and a non-terminal environment.
    plots_requested = self.fraud or self.churn or self.cluster
    if not (plots_requested and self._check_visualization_libraries() and not _is_terminal()):
        return

    # Convert data to pandas once for all visualization functions
    pandas_data = self.data.to_pandas().reset_index()

    # Boxplots and Heatmap for feature distribution by target column
    self._boxplot_heatmap(plot_data=pandas_data)

    # Countplots for feature distribution by target column
    self._countplot_categorical_distribution(plot_data=pandas_data)

    if self.cluster:
        # Use same pandas data for scatter plot
        self._scatter_plot(plot_data=pandas_data)
|
|
267
|
+
|
|
268
|
+
def _statistics(self):
|
|
269
|
+
"""
|
|
270
|
+
DESCRIPTION:
|
|
271
|
+
Internal function displays the statistics of numeric columns such mean, mode, median.
|
|
272
|
+
|
|
273
|
+
PARAMETERS:
|
|
274
|
+
None
|
|
275
|
+
|
|
276
|
+
RETURNS:
|
|
277
|
+
None
|
|
278
|
+
|
|
279
|
+
RAISES:
|
|
280
|
+
None
|
|
281
|
+
|
|
282
|
+
EXAMPLES:
|
|
283
|
+
>>> self._statistics()
|
|
284
|
+
"""
|
|
285
|
+
# Statistics of numerical columns
|
|
286
|
+
self._display_msg(msg='Statistics of Data:',
|
|
287
|
+
data=self.data.describe(),
|
|
288
|
+
show_data=True)
|
|
289
|
+
|
|
290
|
+
def _column_summary(self):
    """
    DESCRIPTION:
        Internal function displays the ColumnSummary output for every
        column of the input data.
        The display option "max_rows" is raised to the number of columns
        while the summary is shown and is restored to its previous value
        afterwards, even when an error occurs.

    PARAMETERS:
        None

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._column_summary()
    """
    # Remember the current display setting so that a value chosen by the
    # user is not silently replaced by a hard-coded default.
    previous_max_rows = dp.max_rows

    # One summary row is expected per column, so allow that many rows.
    dp.max_rows = self.data.shape[1]
    try:
        # Column Summary of all columns of dataset
        obj = ColumnSummary(data=self.data,
                            target_columns=self.data.columns)
        self._display_msg(msg='Column Summary:',
                          data=obj.result,
                          show_data=True)
    finally:
        # Restore the display option regardless of the outcome.
        dp.max_rows = previous_max_rows
|
|
316
|
+
|
|
317
|
+
def _categorical_summary(self,
                         categorical_columns=None):
    """
    DESCRIPTION:
        Internal function runs CategoricalSummary on the categorical columns
        and prints, for each column, the number of distinct values found.

    PARAMETERS:
        categorical_columns:
            Required Argument.
            Specifies the categorical columns.
            Types: str or list of strings (str)

    RETURNS:
        Instance of CategoricalSummary.

    RAISES:
        None

    EXAMPLES:
        >>> obj = self._categorical_summary(categorical_columns=["category1", "category2"])
    """
    self._display_msg(msg='Categorical Columns with their Distinct values:',
                      show_data=True)

    # Categorical Summary of categorical columns
    summary = CategoricalSummary(data=self.data,
                                 target_columns=categorical_columns)
    summary_result = summary.result

    # NOTE(review): "!= None" is kept on purpose. The comparison is applied
    # to a DataFrame column, presumably producing a not-null filter, so it
    # must not be rewritten as "is not None" - confirm against teradataml.
    non_null_rows = summary_result[summary_result['DistinctValue'] != None]

    row_layout = "{:<25} {:<10}"
    print(row_layout.format("ColumnName", "DistinctValueCount"))
    for col in categorical_columns:
        rows_for_col = non_null_rows[non_null_rows['ColumnName'] == col]
        # size counts every cell; dividing by 3 assumes three result
        # columns per row - TODO confirm against CategoricalSummary output.
        print(row_layout.format(col, rows_for_col.size // 3))

    return summary
|
|
352
|
+
|
|
353
|
+
def _futile_column(self,
                   categorical_obj):
    """
    DESCRIPTION:
        Internal function runs GetFutileColumns with a threshold of 0.7 and
        displays either the detected futile columns or a message stating
        that none were found.

    PARAMETERS:
        categorical_obj:
            Required Argument.
            Specifies the instance of CategoricalSummary for futile column detection.
            Types: Instance of CategoricalSummary

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._futile_column(categorical_obj=cat_summary_obj)
    """
    # Futile columns detection using categorical column object
    futile_result = GetFutileColumns(data=self.data,
                                     object=categorical_obj,
                                     category_summary_column="ColumnName",
                                     threshold_value=0.7).result

    # The first field of every result row holds the futile column name.
    futile_names = [record[0] for record in futile_result.itertuples()]

    if futile_names:
        self._display_msg(msg='Futile columns in dataset:',
                          data=futile_result,
                          show_data=True)
        return

    self._display_msg(inline_msg='No Futile columns found.',
                      show_data=True)
|
|
390
|
+
|
|
391
|
+
def _target_column_details(self,
                           plot_data=None):
    """
    DESCRIPTION:
        Internal function plots a density histogram of the target column /
        response column. Nothing is plotted when the visualization
        libraries check fails or when running in a terminal.

    PARAMETERS:
        plot_data:
            Optional Argument.
            Specifies the data holding the target column. When omitted, the
            target column is selected from the input teradataml DataFrame
            and converted to pandas.
            NOTE(review): the value is indexed with [[target_column]] and
            passed to matplotlib directly, which suggests a pandas
            DataFrame is expected - confirm with callers.
            Types: DataFrame

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._target_column_details(plot_data=df)
    """
    # Plotting needs the visualization libraries and a non-terminal session.
    if not self._check_visualization_libraries() or _is_terminal():
        return

    import matplotlib.pyplot as plt
    import seaborn as sns

    target_data = (self.data.select([self.target_column]).to_pandas()
                   if plot_data is None
                   else plot_data[[self.target_column]])

    self._display_msg(msg='Target Column Distribution:',
                      show_data=True)

    # Plotting a histogram for target column
    plt.figure(figsize=(8, 6))
    plt.hist(target_data, bins=10, density=True, edgecolor='black')
    plt.xlabel(self.target_column)
    plt.ylabel('Density')
    plt.show()
|
|
427
|
+
|
|
428
|
+
def _countplot_categorical_distribution(self, plot_data, top_n=20, max_unique_threshold=50):
    """
    DESCRIPTION:
        Function to plot count plots for categorical features, split by the
        target column unless clustering is enabled.
        Limits the number of unique categories to avoid messy visuals.

    PARAMETERS:
        plot_data:
            Required Argument.
            Specifies the pre-converted pandas DataFrame for plotting distribution.
            A copy is modified, the passed DataFrame is left untouched.
            Types: pandas DataFrame

        top_n:
            Optional Argument.
            Maximum number of categories to display per feature.
            Default Value: 20
            Types: int

        max_unique_threshold:
            Optional Argument.
            Only plot features whose number of unique values does not
            exceed this threshold.
            Default Value: 50
            Types: int

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._countplot_categorical_distribution(plot_data=df, top_n=15)
    """
    # Work on a copy because rare categories are relabelled below.
    data = plot_data.copy()
    target_column = self.target_column
    supervised = not self.cluster

    # Categorical candidates: the target is excluded when one exists and
    # high-cardinality columns are dropped.
    candidate_columns = data.select_dtypes(include=['object', 'category']).columns
    categorical_features = [
        col for col in candidate_columns
        if (not supervised or col != target_column)
        and data[col].nunique() <= max_unique_threshold
    ]

    if not categorical_features:
        self._display_msg(msg="No categorical columns found with unique values within the threshold.")
        return

    self._display_msg(msg='Categorical Feature Distributions by Target Column (Count Plots):',
                      show_data=False)

    for feature in categorical_features:
        plt.figure(figsize=(10, 6))

        # Most frequent categories, de-duplicated with order preserved
        frequency = data[feature].value_counts()
        top_categories = list(dict.fromkeys(frequency.nlargest(top_n).index.tolist()))

        # Replace less frequent categories with "Other".
        # NOTE(review): "Other" is not part of the plotting order below, so
        # these rows do not get a bar - confirm this is intended.
        data[feature] = data[feature].apply(
            lambda value: value if value in top_categories else "Other")

        # Generate count plot; hue is only meaningful with a target column
        plot_kwargs = {'data': data, 'x': feature, 'order': top_categories}
        if supervised:
            plot_kwargs['hue'] = target_column
        cntplot = sns.countplot(**plot_kwargs)

        # Annotate every non-empty bar with its count
        for bar in cntplot.patches:
            bar_height = bar.get_height()
            if bar_height > 0:
                cntplot.annotate(f'{int(bar_height)}',
                                 (bar.get_x() + bar.get_width() / 2, bar_height),
                                 ha='center', va='bottom', fontsize=10, fontweight='bold')

        plt.title(f"Distribution of {feature} by {target_column}"
                  if supervised
                  else f"Distribution of {feature}")
        plt.xlabel(feature)
        plt.ylabel("Count")
        plt.xticks(rotation=45, ha='right')  # Improve label visibility
        if supervised:
            plt.legend(title=target_column)
        plt.tight_layout()
        plt.show()
|
|
522
|
+
|
|
523
|
+
def _correlation(self, data, threshold=0.1, max_features=10, min_features=2):
|
|
524
|
+
"""
|
|
525
|
+
DESCRIPTION:
|
|
526
|
+
Function to calculate the correlation values between features.
|
|
527
|
+
|
|
528
|
+
PARAMETERS:
|
|
529
|
+
data:
|
|
530
|
+
Required Argument.
|
|
531
|
+
Specifies the input pandas DataFrame for correlation analysis.
|
|
532
|
+
Types: pandas DataFrame
|
|
533
|
+
|
|
534
|
+
threshold:
|
|
535
|
+
Optional Argument.
|
|
536
|
+
Specifies the minimum correlation threshold for feature selection.
|
|
537
|
+
Default Value: 0.1
|
|
538
|
+
Types: float
|
|
539
|
+
|
|
540
|
+
max_features:
|
|
541
|
+
Optional Argument.
|
|
542
|
+
Specifies the maximum number of features to select.
|
|
543
|
+
Default Value: 10
|
|
544
|
+
Types: int
|
|
545
|
+
|
|
546
|
+
min_features:
|
|
547
|
+
Optional Argument.
|
|
548
|
+
Specifies the minimum number of features to select as fallback.
|
|
549
|
+
Default Value: 2
|
|
550
|
+
Types: int
|
|
551
|
+
|
|
552
|
+
RETURNS:
|
|
553
|
+
tuple containing filtered correlations, selected features, correlation matrix, and selection criteria.
|
|
554
|
+
|
|
555
|
+
RAISES:
|
|
556
|
+
None
|
|
557
|
+
|
|
558
|
+
EXAMPLES:
|
|
559
|
+
>>> corr_result = self._correlation(data=df, threshold=0.2, max_features=8)
|
|
560
|
+
"""
|
|
561
|
+
import numpy as np
|
|
562
|
+
|
|
563
|
+
numerical_features = data.select_dtypes(include=['float64', 'int64']).columns
|
|
564
|
+
|
|
565
|
+
# For AutoML, exclude target_column from numerical features
|
|
566
|
+
if not self.cluster and self.target_column in numerical_features:
|
|
567
|
+
numerical_features = [col for col in numerical_features if col != self.target_column]
|
|
568
|
+
|
|
569
|
+
total_numerical_features = len(numerical_features)
|
|
570
|
+
|
|
571
|
+
if self.cluster:
|
|
572
|
+
# Clustering: feature vs feature correlation
|
|
573
|
+
corr_matrix = data[numerical_features].corr()
|
|
574
|
+
# Extract upper triangle without diagonal
|
|
575
|
+
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
|
|
576
|
+
corr_vals = corr_matrix.where(mask).stack().reset_index()
|
|
577
|
+
corr_vals.columns = ['Feature1', 'Feature2', 'Correlation']
|
|
578
|
+
corr_vals['Abs_Correlation'] = corr_vals['Correlation'].abs()
|
|
579
|
+
corr_vals = corr_vals.sort_values(by='Abs_Correlation', ascending=False)
|
|
580
|
+
|
|
581
|
+
filtered = corr_vals[corr_vals['Abs_Correlation'] > threshold].head(max_features)
|
|
582
|
+
selection_criteria = "Top Correlated Feature Pairs"
|
|
583
|
+
|
|
584
|
+
if len(filtered) < 2:
|
|
585
|
+
filtered = corr_vals.head(min(2, len(corr_vals)))
|
|
586
|
+
selection_criteria = f"Top {min(2, len(corr_vals))} Correlated Feature Pairs (Fallback)"
|
|
587
|
+
|
|
588
|
+
# Merge unique features from pairs
|
|
589
|
+
selected_features = list(set(filtered['Feature1'].tolist() + filtered['Feature2'].tolist()))
|
|
590
|
+
selected_features = selected_features[:max_features] # restrict total features
|
|
591
|
+
corr_matrix = data[selected_features].corr()
|
|
592
|
+
|
|
593
|
+
return filtered, selected_features, corr_matrix, selection_criteria
|
|
594
|
+
else:
|
|
595
|
+
# AutoML: correlation with target column
|
|
596
|
+
correlation_values = data[numerical_features].corrwith(data[self.target_column])
|
|
597
|
+
correlation_df = correlation_values.reset_index()
|
|
598
|
+
correlation_df.columns = ['Feature', 'Correlation']
|
|
599
|
+
correlation_df['Abs_Correlation'] = correlation_df['Correlation'].abs()
|
|
600
|
+
correlation_df = correlation_df.sort_values(by='Abs_Correlation', ascending=False)
|
|
601
|
+
|
|
602
|
+
filtered = correlation_df[correlation_df['Abs_Correlation'] > threshold].head(max_features)
|
|
603
|
+
selection_criteria = "Features above threshold correlation with target"
|
|
604
|
+
|
|
605
|
+
if len(filtered) < 2:
|
|
606
|
+
filtered = correlation_df.head(min(min_features, total_numerical_features))
|
|
607
|
+
selection_criteria = f"Top {min(min_features, total_numerical_features)} Correlated Features (Fallback)"
|
|
608
|
+
|
|
609
|
+
selected_features = filtered['Feature'].tolist() + [self.target_column]
|
|
610
|
+
selected_features = list(dict.fromkeys(selected_features)) # preserve order, remove dup
|
|
611
|
+
corr_matrix = data[selected_features].corr()
|
|
612
|
+
|
|
613
|
+
return selected_features, corr_matrix, selection_criteria
|
|
614
|
+
|
|
615
|
+
def _boxplot_heatmap(self, plot_data):
    """
    DESCRIPTION:
        Internal function to display heatmap and boxplots of selected numerical features.
        Handles both AutoML (feature vs target) and Clustering (feature vs feature).
        When no features can be selected, a message is displayed and nothing
        is plotted.

    PARAMETERS:
        plot_data:
            Required Argument.
            Specifies the pre-converted pandas DataFrame for plotting.
            A copy is used, the passed DataFrame is left untouched.
            Types: pandas DataFrame

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._boxplot_heatmap(plot_data=df)
    """
    # Use the pre-converted pandas data
    data = plot_data.copy()

    # A string target is converted to numeric category codes so that it
    # can take part in the correlation computation.
    if not self.cluster and self.data_types.get(self.target_column) in ['str']:
        if data[self.target_column].dtype == 'object':
            data[self.target_column] = data[self.target_column].astype('category').cat.codes

    # _correlation returns a 4-tuple for clustering and a 3-tuple otherwise
    filtered = None
    if not self.cluster:
        selected_features, corr_matrix, selection_criteria = self._correlation(data=data)
    else:
        filtered, selected_features, corr_matrix, selection_criteria = self._correlation(data=data)

    # Without any selected feature neither the heatmap nor the subplot
    # grid can be built, so stop with a message instead of failing.
    if corr_matrix.empty:
        self._display_msg(msg="Not enough numerical features for heatmap and boxplots.")
        return

    # Display heatmap; the diagonal and upper triangle are masked because
    # they only repeat information.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=0)
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, mask=mask, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
    plt.title("Heatmap of Selected Features")
    plt.show()

    num_features = len(selected_features)
    self._display_msg(msg=f'Number of features selected for Boxplots: {num_features}', show_data=False)
    self._display_msg(msg=f'Selection Criteria: {selection_criteria}', show_data=False)
    self._display_msg(msg=f'Selected Features: {", ".join(selected_features)}', show_data=False)
    self._display_msg(msg='Boxplots:', show_data=False)

    # One boxplot per feature pair (clustering) or per non-target feature.
    if self.cluster:
        plot_items = [(row["Feature1"], row["Feature2"]) for _, row in filtered.iterrows()]
    else:
        plot_items = [feature for feature in selected_features if feature != self.target_column]

    num_plots = len(plot_items)
    if num_plots == 0:
        # Nothing to draw; avoids showing an empty figure.
        return

    # Grid of at most two columns. squeeze=False always yields a 2-D array
    # of axes, so a single flatten covers every layout.
    cols = 2 if num_plots > 1 else 1
    rows = (num_plots + cols - 1) // cols
    fig, axes = plt.subplots(rows, cols, figsize=(12, rows * 4), squeeze=False)
    axes = axes.flatten()

    if self.cluster:
        for ax, (feature_x, feature_y) in zip(axes, plot_items):
            x = data[feature_x]
            # Bin features with many distinct values into deciles so the
            # x axis stays readable.
            if x.nunique() > 20:
                x = pd.qcut(x, q=10, duplicates='drop')

            sns.boxplot(x=x, y=data[feature_y], ax=ax)
            ax.set_title(f"{feature_y} vs {feature_x}")
            ax.set_xlabel(feature_x)
            ax.set_ylabel(feature_y)
            ax.tick_params(axis='x', rotation=45)
    else:
        # AutoML: Plot boxplot of feature vs target column
        for ax, feature in zip(axes, plot_items):
            sns.boxplot(x=data[self.target_column], y=data[feature], ax=ax)
            ax.set_title(f"{feature}")
            ax.set_xlabel(self.target_column)
            ax.set_ylabel(feature)

    # Hide grid cells that did not receive a plot.
    for spare_ax in axes[num_plots:]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()
|
|
708
|
+
|
|
709
|
+
def _scatter_plot(self, plot_data, max_selected_pairs=10, threshold=0.1):
    """
    DESCRIPTION:
        Internal function to display scatterplots of the most correlated
        pairs of numerical features.
        Handles Clustering (feature vs feature).

    PARAMETERS:
        plot_data:
            Required Argument.
            Specifies the pre-converted pandas DataFrame for plotting scatter plots.
            Types: pandas DataFrame

        max_selected_pairs:
            Optional Argument.
            Specifies the maximum number of feature pairs to select for scatter plots.
            Default Value: 10
            Types: int

        threshold:
            Optional Argument.
            Specifies the minimum absolute correlation for feature pair selection.
            Default Value: 0.1
            Types: float

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._scatter_plot(plot_data=df, max_selected_pairs=8, threshold=0.15)
    """
    # Use the pre-converted pandas data
    data = plot_data.copy()

    # At least two numerical ('float64'/'int64') features are needed
    numerical_features = data.select_dtypes(include=['float64', 'int64']).columns
    if len(numerical_features) < 2:
        self._display_msg(msg="Not enough numerical features for scatter plots.")
        return

    # Pairwise correlations; keeping only the upper triangle (diagonal
    # excluded) lists every feature pair exactly once.
    corr_matrix = data[numerical_features].corr()
    upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
    ranked_pairs = corr_matrix.where(upper_triangle).stack().reset_index()
    ranked_pairs.columns = ['Feature1', 'Feature2', 'Correlation']
    ranked_pairs['Abs_Correlation'] = ranked_pairs['Correlation'].abs()
    ranked_pairs = ranked_pairs.sort_values(by='Abs_Correlation', ascending=False)

    # Strongest pairs above the threshold, capped at max_selected_pairs
    above_threshold = ranked_pairs['Abs_Correlation'] > threshold
    filtered = ranked_pairs[above_threshold].head(max_selected_pairs)

    # Fallback: with fewer than two qualifying pairs use the top two overall
    if len(filtered) < 2:
        filtered = ranked_pairs.head(2)

    if filtered.empty:
        self._display_msg(msg="No correlated pairs found above threshold.")
        return

    self._display_msg(msg="Scatter Plots for Top Correlated Feature Pairs:", show_data=False)

    # One figure per selected pair
    selected_pairs = zip(filtered['Feature1'], filtered['Feature2'], filtered['Correlation'])
    for feature_x, feature_y, pair_corr in selected_pairs:
        plt.figure(figsize=(6, 4))
        sns.scatterplot(x=data[feature_x], y=data[feature_y], alpha=0.3)
        plt.xlabel(feature_x)
        plt.ylabel(feature_y)
        plt.title(f"Scatter Plot: {feature_x} vs {feature_y} (Corr: {pair_corr:.2f})")
        plt.tight_layout()
        plt.show()
|
|
785
|
+
|
|
786
|
+
def _ordinal_encoding(self,
                      ordinal_columns):
    """
    DESCRIPTION:
        Function performs the ordinal encoding to categorical columns or features
        in the dataset. It fits OrdinalEncodingFit on "self.data", records the fit
        result in "self.data_transform_dict" and replaces "self.data" with the
        output of OrdinalEncodingTransform.

    PARAMETERS:
        ordinal_columns:
            Required Argument.
            Specifies the categorical columns for which ordinal encoding will be performed.
            A single column name is treated as a one-element list.
            Types: str or list of strings (str)

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._ordinal_encoding(ordinal_columns=["category1", "category2"])
    """
    # A bare string would otherwise be indexed/iterated character by character
    # below (ordinal_columns[0], set(ordinal_columns) inside _extract_list).
    if isinstance(ordinal_columns, str):
        ordinal_columns = [ordinal_columns]

    # Setting volatile and persist parameters for performing encoding
    volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
                                                     param_name="CategoricalEncodingParam")

    # Adding fit parameters for performing encoding
    fit_params = {
        "data": self.data,
        "target_column": ordinal_columns,
        "volatile": volatile,
        "persist": persist
    }
    # Performing ordinal encoding fit on target columns
    ord_fit_obj = OrdinalEncodingFit(**fit_params)

    # Storing fit object and column list for ordinal encoding in data transform dictionary.
    # The first column decides whether this call is a feature encoding or a
    # target-column encoding.
    if ordinal_columns[0] != self.target_column:
        self.data_transform_dict["custom_ord_encoding_fit_obj"] = ord_fit_obj.result
        self.data_transform_dict['custom_ord_encoding_col'] = ordinal_columns
    else:
        self.data_transform_dict['target_col_encode_ind'] = True
        self.data_transform_dict['target_col_ord_encoding_fit_obj'] = ord_fit_obj.result

    # Extracting accumulate columns: every column of the data that is not being encoded
    accumulate_columns = self._extract_list(self.data.columns, ordinal_columns)

    # Adding transform parameters for performing encoding; persist is True by default
    transform_params = {
        "data": self.data,
        "object": ord_fit_obj.result,
        "accumulate": accumulate_columns,
        "persist": True
    }
    # The table is persisted only as an internal default here, so its name is
    # not displayed when the caller asked for neither volatile nor persist.
    if not volatile and not persist:
        transform_params["display_table_name"] = False

    # Setting persist to False if volatile is True
    if volatile:
        transform_params["volatile"] = True
        transform_params["persist"] = False

    # Performing ordinal encoding transformation
    self.data = OrdinalEncodingTransform(**transform_params).result

    if not volatile and not persist:
        # Adding transformed data containing table to garbage collector
        GarbageCollector._add_to_garbagecollector(self.data._table_name)

    # Keep the fit object as the target label mapping when only the target was encoded
    if len(ordinal_columns) == 1 and ordinal_columns[0] == self.target_column:
        self.target_label = ord_fit_obj
|
|
854
|
+
|
|
855
|
+
def _extract_list(self,
                  list1,
                  list2):
    """
    DESCRIPTION:
        Function to extract elements from list1 which are not present in list2.
        Duplicates in list1 are removed and the remaining elements keep the
        order in which they first appear in list1.

    PARAMETERS:
        list1:
            Required Argument.
            Specifies the first list for extracting elements from.
            Types: list

        list2:
            Required Argument.
            Specifies the second list to get elements for avoiding in first list while extracting.
            Types: list

    RETURNS:
        list containing extracted elements.

    RAISES:
        None

    EXAMPLES:
        >>> result = self._extract_list(list1=["a", "b", "c"], list2=["b"])
        >>> result
        ['a', 'c']
    """
    excluded = set(list2)
    # dict.fromkeys() de-duplicates while preserving first-seen order, so the
    # result is deterministic (a plain set difference has arbitrary order).
    return [item for item in dict.fromkeys(list1) if item not in excluded]
|
|
884
|
+
|
|
885
|
+
def _get_generic_parameters(self,
                            func_indicator=None,
                            param_name=None):
    """
    DESCRIPTION:
        Function to get generic parameters (volatile and persist).
        The values default to "self.volatile" and "self.persist". When
        "self.custom_data" is set and its "func_indicator" entry is truthy, both
        values are instead read from the "param_name" section of
        "self.custom_data", each falling back to False when absent.

    PARAMETERS:
        func_indicator:
            Optional Argument.
            Specifies the name of function indicator.
            Types: str

        param_name:
            Optional Argument.
            Specifies the name of the param which contains generic parameters.
            Types: str

    RETURNS:
        Tuple containing volatile and persist parameters.

    RAISES:
        None

    EXAMPLES:
        >>> volatile, persist = self._get_generic_parameters(func_indicator="CategoricalEncodingIndicator",
                                                             param_name="CategoricalEncodingParam")
    """
    volatile = self.volatile
    persist = self.persist

    custom_data = self.custom_data
    if custom_data is not None and custom_data.get(func_indicator, False):
        # An enabled indicator whose parameter section is missing (or None) is
        # handled like an empty section instead of raising KeyError.
        custom_params = custom_data.get(param_name) or {}
        volatile = custom_params.get("volatile", False)
        persist = custom_params.get("persist", False)

    return (volatile, persist)
|
|
920
|
+
|
|
921
|
+
def _check_visualization_libraries(self):
    """
    DESCRIPTION:
        Internal function that checks whether the data visualization libraries
        (matplotlib and seaborn) can be imported. When either import fails, a
        message asking to install them is printed.

    PARAMETERS:
        None

    RETURNS:
        bool, True when both libraries are importable, False otherwise.

    RAISES:
        None

    EXAMPLES:
        >>> has_libs = self._check_visualization_libraries()
    """
    try:
        # The imports act purely as an availability probe; the names bound
        # here are local to this function and are not used afterwards.
        import matplotlib.pyplot as plt
        import seaborn as sns
    except ImportError:
        print("Install seaborn and matplotlib libraries to visualize the data.")
        libraries_available = False
    else:
        libraries_available = True

    return libraries_available
|
|
948
|
+
|
|
949
|
+
def _outlier_detection(self,
                       outlier_method,
                       column_list,
                       lower_percentile=None,
                       upper_percentile=None):
    """
    DESCRIPTION:
        Function detects the outliers in numerical columns and displays their
        percentage. Outliers are replaced with NULL through OutlierFilterFit and
        OutlierFilterTransform, and the NULL share reported by ColumnSummary is
        then used as the outlier percentage of each column.

    PARAMETERS:
        outlier_method:
            Required Argument.
            Specifies the outlier method required for outlier detection.
            Types: str

        column_list:
            Required Argument.
            Specifies the numeric columns for outlier percentage calculation.
            A single column name is treated as a one-element list. The target
            column is always excluded.
            Types: str or list of strings (str)

        lower_percentile:
            Optional Argument.
            Specifies the lower percentile value for outlier detection in case of percentile method.
            Types: float

        upper_percentile:
            Optional Argument.
            Specifies the upper percentile value for outlier detection in case of percentile method.
            Types: float

    RETURNS:
        DataFrame containing column name with outlier percentage, restricted to
        columns whose percentage is greater than zero.
        NOTE(review): the value is built from ColumnSummary(...).result.assign(...),
        so it looks like a teradataml DataFrame rather than a pandas one - confirm.

    RAISES:
        None

    EXAMPLES:
        >>> outlier_df = self._outlier_detection(outlier_method="Tukey", column_list=["num1", "num2"])
    """
    # A bare string would otherwise be iterated character by character below
    if isinstance(column_list, str):
        column_list = [column_list]

    # Removing target column from the list of columns
    column_list = [col for col in column_list if col != self.target_column]

    # Performing outlier fit on the data for replacing outliers with NULL value
    fit_params = {
        "data": self.data,
        "target_columns": column_list,
        "outlier_method": outlier_method,
        "lower_percentile": lower_percentile,
        "upper_percentile": upper_percentile,
        "replacement_value": 'NULL'
    }
    OutlierFilterFit_out = OutlierFilterFit(**fit_params)

    # Performing outlier transformation on each column
    transform_params = {
        "data": self.data,
        "object": OutlierFilterFit_out.result
    }
    OutlierTransform_obj = OutlierFilterTransform(**transform_params)

    # Column summary of each column of the transformed data
    summary_params = {
        "data": OutlierTransform_obj.result,
        "target_columns": column_list
    }
    colSummary = ColumnSummary(**summary_params)

    null_count_expr = colSummary.result.NullCount
    non_null_count_expr = colSummary.result.NonNullCount

    # Calculating outlier percentage as NULL count over total row count.
    # NOTE(review): NULLs already present before the transform are counted too.
    df = colSummary.result.assign(True,
                                  ColumnName=colSummary.result.ColumnName,
                                  OutlierPercentage=(null_count_expr/(non_null_count_expr+null_count_expr))*100)

    # Keeping only the columns with non-zero outlier percentage
    df = df[df['OutlierPercentage']>0]
    if self.verbose > 0:
        # Blank out the current console line and return the cursor to its start
        # (presumably to clear a progress line - confirm).
        print(" "*500, end='\r')
        if df.shape[0] > 0:
            self._display_msg(msg='Columns with outlier percentage :-',
                              show_data=True)
            print(df)
        else:
            self._display_msg(msg="No outlier found!")

    return df
|
|
1035
|
+
|
|
1036
|
+
def _common_style(self):
    """
    DESCRIPTION:
        Internal function that returns the HTML style tag defining the
        "custom-div" CSS class.

    PARAMETERS:
        None

    RETURNS:
        str containing style tag.

    RAISES:
        None

    EXAMPLES:
        >>> style_str = self._common_style()
    """
    # Returned as a literal; the CSS rules are kept exactly as they are emitted.
    return '''
<style>
.custom-div {
background-color: lightgray;
color: #000000;
padding: 10px;
border-radius: 8px;
box-shadow: 0 3px 4px rgba(0, 0, 0, 0.2);
margin-bottom: 10px;
text-align: center;
}
</style>
'''
|
|
1067
|
+
|
|
1068
|
+
def _display_heading(self,
                     phase=0,
                     progress_bar=None,
                     **kwargs):
    """
    DESCRIPTION:
        Internal function to print the phases of AutoML, with the first
        "phase" entries highlighted in green. Nothing is shown unless
        "self.verbose" is greater than 0. When "self.terminal_print" is False
        the heading is emitted as HTML prefixed with "self.style"; otherwise it
        is emitted as text, coloured through colorama when that package can be
        imported.

    PARAMETERS:
        phase:
            Optional Argument.
            Specifies the phase of automl that completed.
            Default Value: 0
            Types: int

        progress_bar:
            Optional Argument.
            Specifies the _ProgressBar object. When given, the heading is sent
            through its update() method instead of being displayed directly.
            Types: object (_ProgressBar)

        **kwargs:
            Specifies the additional arguments for display heading.
            The key 'automl_phases', when present and not None, replaces the
            default list of phase labels.
            Types: dict

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._display_heading(phase=1)
    """
    default_phases = ["1. Feature Exploration ->", " 2. Feature Engineering ->",
                      " 3. Data Preparation ->", " 4. Model Training & Evaluation"]

    # Phases of automl: caller-supplied labels win over the defaults
    steps = kwargs.get('automl_phases')
    if steps is None:
        steps = default_phases

    # Nothing to display unless verbose > 0
    if self.verbose <= 0:
        return

    # Completed and remaining phases
    completed_steps = steps[:phase]
    unhighlighted_steps = "".join(steps[phase:])

    if not self.terminal_print:
        # HTML output. The heading tags are properly nested; centring is
        # already provided by the "custom-div" class of the wrapping div.
        highlighted_steps = "".join(completed_steps)
        msg = (self.style
               + '<br><div class="custom-div"><h3>'
               + f'<span style="color: green;">{highlighted_steps}</span>'
               + f'{unhighlighted_steps}</h3></div>')
        if progress_bar is not None:
            progress_bar.update(msg=msg,
                                progress=False,
                                ipython=True)
        else:
            display(HTML(msg))
        return

    # Terminal output
    try:
        # colorama is optional; fall back to plain text when it is missing
        from colorama import Fore, Style, init
    except ImportError:
        msg = "".join(steps)
    else:
        # initialize the color package
        init()
        highlighted_steps = "".join(Fore.GREEN + Style.BRIGHT + step + Style.RESET_ALL
                                    for step in completed_steps)
        msg = f'{highlighted_steps}{unhighlighted_steps}'

    if progress_bar is not None:
        progress_bar.update(msg=msg,
                            progress=False)
    else:
        print(msg)
|
|
1154
|
+
|
|
1155
|
+
def _display_msg(self,
                 msg=None,
                 progress_bar=None,
                 inline_msg=None,
                 data=None,
                 col_lst=None,
                 show_data=False):
    """
    DESCRIPTION:
        Internal function to emit a message and optional data according to the
        verbosity level and environment.
            * verbose == 2: "msg" (or, when "msg" is empty, "inline_msg") is
              logged through "self._logger"; "col_lst" or "data" is shown as
              well, via the progress bar when one is given.
            * verbose > 0 and "show_data" is True: "msg" is logged and "data"
              is shown.
            * otherwise nothing is emitted.
        Data shown without a progress bar is printed when
        "self.terminal_print" is True and passed to display() otherwise.

    PARAMETERS:
        msg:
            Optional Argument.
            Specifies the message to print.
            Types: str

        progress_bar:
            Optional Argument.
            Specifies the _ProgressBar object.
            Types: object (_ProgressBar)

        inline_msg:
            Optional Argument.
            Specifies the additional information to print.
            Only used when verbose is 2 and "msg" is empty.
            Types: str

        data:
            Optional Argument.
            Specifies the teradataml dataframe to print.
            Types: teradataml DataFrame

        col_lst:
            Optional Argument.
            Specifies the list of columns.
            Takes precedence over "data" when verbose is 2.
            Types: list of str/int/data.time

        show_data:
            Optional Argument.
            Specifies whether to print msg/data when verbose<2.
            Default Value: False
            Types: bool

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> self._display_msg(msg="Processing data", show_data=True)
    """
    # Verbose level 2: everything is reported
    if self.verbose == 2:
        if progress_bar:
            if msg:
                progress_bar.clear_line()
                self._logger.info(f"{msg}")
                # The message was already logged, so an empty message is passed
                # to the progress bar to avoid showing it twice. The column
                # list is preferred over data as payload.
                progress_bar.update(msg="",
                                    data=col_lst if col_lst else data,
                                    progress=False,
                                    ipython=not self.terminal_print)
                # Displaying shape of data
                if data is not None:
                    progress_bar.update(msg=f'{data.shape[0]} rows X {data.shape[1]} columns',
                                        progress=False,
                                        ipython=not self.terminal_print)
            elif inline_msg:
                progress_bar.clear_line()
                self._logger.info(f"{inline_msg}")
        elif msg:
            # No progress bar: log the message, then the column list or data
            self._logger.info(f"{msg}")
            if col_lst:
                self._logger.info(col_lst)
            elif data is not None:
                if self.terminal_print:
                    print(data)
                else:
                    display(data)
        elif inline_msg:
            self._logger.info(f'{inline_msg}')
        # Verbose level 2 is fully handled here
        return

    # Lower verbose levels only report when explicitly requested
    if self.verbose > 0 and show_data:
        if progress_bar and msg:
            progress_bar.clear_line()
            self._logger.info(f"{msg}")
            # Empty message again, to avoid duplicating the logged message
            progress_bar.update(msg="",
                                data=data,
                                progress=False,
                                ipython=not self.terminal_print)
        else:
            if msg:
                self._logger.info(f'{msg}')
            if data is not None:
                if self.terminal_print:
                    print(data)
                else:
                    display(data)
|
|
1271
|
+
|
|
1272
|
+
@staticmethod
def _visualize(data,
               target_column,
               plot_type=["target"],
               length=10,
               breadth=8,
               max_features=10,
               columns=None,
               problem_type=None):
    """
    DESCRIPTION:
        Internal function to visualize the data using various plots such as heatmap,
        pair plot, density, count plot, box plot, and target distribution.
        The requested plots are drawn in a fixed order: target, density, count,
        box, pair, heatmap. Messages returned by plots that are not applicable
        are printed after all plots have been processed.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input teradataml DataFrame for plotting.
            Types: teradataml Dataframe

        target_column:
            Required Argument.
            Specifies the name of the target column in "data".
            Note:
                * A target column that is not numeric after conversion to
                  pandas is replaced by its category codes; a numeric target
                  is plotted with its actual values.
            Types: str

        plot_type:
            Optional Argument.
            Specifies the type of plot to be displayed.
            Default Value: "target"
            Permitted Values:
                * "heatmap": Displays a heatmap of feature correlations.
                * "pair": Displays a pair plot of features.
                * "density": Displays a density plot of features.
                * "count": Displays a count plot of categorical features.
                * "box": Displays a box plot of numerical features.
                * "target": Displays the distribution of the target variable.
                * "all": Displays all the plots.
            Note:
                * Values are matched case-insensitively and "all" may also be
                  given inside a list.
            Types: str, list of str

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

        columns:
            Optional Argument.
            Specifies the column names to be used for plotting.
            When omitted, numerical features are ranked by absolute correlation
            with the target and categorical features are taken in column order.
            Types: str or list of string

        max_features:
            Optional Argument.
            Specifies the maximum number of features to be used for plotting.
            Default Value: 10
            Note:
                * It applies separately to categorical and numerical features.
            Types: int

        problem_type:
            Optional Argument.
            Specifies the type of problem.
            Permitted Values:
                * 'regression'
                * 'classification'
            Types: str

    RETURNS:
        None

    RAISES:
        TeradataMlException, ValueError, TypeError

    EXAMPLES:
        >>> _FeatureExplore._visualize(data=data,
                                       target_column="target",
                                       plot_type="heatmap",
                                       length=10,
                                       breadth=8,
                                       max_features=10,
                                       columns=["feature1", "feature2"],
                                       problem_type="regression")
    """
    # NOTE: the list default of "plot_type" is never mutated below; the name is
    # only rebound, so the shared default object stays intact.

    # Appending arguments to list for validation
    arg_info_matrix = [
        ["data", data, False, (DataFrame)],
        ["target_column", target_column, False, (str)],
        ["plot_type", plot_type, True, (str, list), True, ["heatmap", "pair", "all",
                                                           "density", "count", "box", "target"]],
        ["length", length, True, (int)],
        ["breadth", breadth, True, (int)],
        ["max_features", max_features, True, (int)],
        ["problem_type", problem_type, True, (str), True, ["regression", "classification"]],
        ["columns", columns, True, (str, list)],
    ]

    # Validate argument types
    _Validators._validate_function_arguments(arg_info_matrix)

    # Validate that data has the required columns
    _Validators._validate_dataframe_has_argument_columns(target_column, "target_column", data, "data")
    _Validators._validate_dataframe_has_argument_columns(columns, "columns", data, "data")

    # Convert the teradataml DataFrame to pandas; reset_index() turns the index
    # into a regular column and the original column list is then re-selected,
    # avoiding any extra index column.
    cols = data.columns
    data = data.to_pandas().reset_index()
    data = data[cols]

    available_plots = ["target", "density", "count", "box", "pair", "heatmap"]

    # Encode the target as category codes only when it is not already numeric.
    # The code below needs a numeric target (it is dropped from the numeric
    # column index and used in correlations), while an already numeric target
    # must keep its real values so that its distribution is plotted correctly.
    if target_column not in data.select_dtypes(include=['number']).columns:
        data[target_column] = data[target_column].astype("category").cat.codes

    # Normalise the requested plot types: accept str or list, any letter case,
    # and expand "all" wherever it appears.
    requested_plots = [plot.lower() for plot in UtilFuncs._as_list(plot_type)]
    if "all" in requested_plots:
        plot_type = available_plots
    else:
        plot_type = requested_plots

    # Identify numerical and categorical columns
    numerical_features = data.select_dtypes(include=['number']).columns.drop(target_column).tolist()
    categorical_features = data.select_dtypes(include=['object', 'category']).columns.tolist()

    # Handle selected_columns input
    if columns:
        selected_columns = UtilFuncs._as_list(columns)
        selected_num_features = [col for col in selected_columns if col in numerical_features][:max_features]
        selected_cat_features = [col for col in selected_columns if col in categorical_features][:max_features]
    else:
        # Compute correlation with target and select top correlated numerical features
        if target_column in data.columns and pd.api.types.is_numeric_dtype(data[target_column]):
            selected_num_features = (
                data[numerical_features]
                .corrwith(data[target_column])
                .abs()
                .nlargest(max_features)
                .index.tolist()
            )
        else:
            selected_num_features = numerical_features[:max_features]

        # Select top categorical features based on appearance
        selected_cat_features = categorical_features[:max_features]

    irrelevant_plot = []

    # Sort plot_type based on the order in available_plots:
    # display univariate plots first, then bivariate, and finally multivariate
    sorted_plot_type = sorted(plot_type, key=available_plots.index)

    for plot in sorted_plot_type:
        msg = None
        # Target Distribution
        if plot == "target":
            msg = _FeatureExplore._target_distribution(data=data,
                                                       target_column=target_column,
                                                       problem_type=problem_type,
                                                       length=length,
                                                       breadth=breadth)
        # Density Plot (for numerical features) - Grid
        elif plot == "density":
            msg = _FeatureExplore._density_plot(data=data,
                                                length=length,
                                                breadth=breadth,
                                                numerical_features=selected_num_features)
        # Count Plot (for categorical features) - Grid
        elif plot == "count":
            msg = _FeatureExplore._count_plot(data=data,
                                              length=length,
                                              breadth=breadth,
                                              categorical_features=selected_cat_features)
        # Box Plot (for numerical features) - Grid
        elif plot == "box":
            msg = _FeatureExplore._box_plot(data=data,
                                            length=length,
                                            breadth=breadth,
                                            numerical_features=selected_num_features)
        # Scatter Plot / Pair Plot
        elif plot == "pair":
            msg = _FeatureExplore._pair_plot(data=data,
                                             target_column=target_column,
                                             length=length,
                                             breadth=breadth,
                                             numerical_features=selected_num_features,
                                             categorical_features=selected_cat_features)
        # Heatmap
        elif plot == "heatmap":
            msg = _FeatureExplore._heatmap(data=data,
                                           target_column=target_column,
                                           length=length,
                                           breadth=breadth,
                                           numerical_features=selected_num_features)

        # A returned message means the plot was not applicable
        if msg:
            irrelevant_plot.append(msg)

    # Report the plots that could not be drawn, after all plots are done
    for msg in irrelevant_plot:
        print(msg)
|
|
1475
|
+
|
|
1476
|
+
@staticmethod
def _heatmap(data,
             target_column,
             length=10,
             breadth=8,
             numerical_features=[]):
    """
    DESCRIPTION:
        Internal function to visualize the data using heatmap. The heatmap
        shows the correlation matrix of the given numerical features together
        with the target column.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        target_column:
            Required Argument.
            Specifies the name of the target column in "data".
            Types: str

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

        numerical_features:
            Optional Argument.
            Specifies the list of numerical features to be plotted.
            Types: list of str

    RETURNS:
        str, a message when no numerical feature is available.
        None when the heatmap is drawn.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._heatmap(data=data,
                                     target_column="target",
                                     length=10,
                                     breadth=8,
                                     numerical_features=["feature1", "feature2"])

    """
    # Nothing to correlate without at least one numerical feature
    if len(numerical_features) < 1:
        return "Plot type 'heatmap' is not applicable as no numerical features are available."

    plt.figure(figsize=(length, breadth))
    # Correlation of the selected features with each other and with the target
    correlation = data[numerical_features + [target_column]].corr()
    sns.heatmap(correlation, annot=True, cmap="coolwarm")
    plt.title("Feature Correlation Heatmap")
    plt.show()
|
|
1535
|
+
|
|
1536
|
+
@staticmethod
def _pair_plot(data,
               target_column,
               length=10,
               breadth=8,
               numerical_features=[],
               categorical_features=[]):
    """
    DESCRIPTION:
        Internal function to visualize the data using pair plot. The plot
        covers the given numerical features together with the target column.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        target_column:
            Required Argument.
            Specifies the name of the target column in "data".
            Types: str

        length:
            Optional Argument.
            Specifies the length of the plot.
            Note:
                * Accepted for signature parity with the other plot
                  functions; it is not applied to the pair plot here.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Note:
                * Accepted for signature parity with the other plot
                  functions; it is not applied to the pair plot here.
            Default Value: 8
            Types: int

        numerical_features:
            Optional Argument.
            Specifies the list of numerical features to be plotted.
            Types: list of str

        categorical_features:
            Optional Argument.
            Specifies the list of categorical features to be plotted.
            It is only consulted to decide whether the target column is used
            as hue.
            Types: list of str

    RETURNS:
        str, a message when no numerical feature is available.
        None when the pair plot is drawn.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._pair_plot(data=data,
                                       target_column="target",
                                       length=10,
                                       breadth=8,
                                       numerical_features=["feature1", "feature2"])

    """
    # A pair plot needs at least one numerical feature
    if len(numerical_features) < 1:
        return "Plot type 'pair' is not applicable as no numerical features are available."

    plot_frame = data[numerical_features + [target_column]]
    # Colour by target only when the target is listed as a categorical feature
    hue_column = target_column if target_column in categorical_features else None
    pair = sns.pairplot(plot_frame, hue=hue_column)

    # Add a centered title slightly above the grid
    pair.figure.suptitle("pair Plot", fontsize=16, y=1.02)
    plt.show()
|
|
1603
|
+
|
|
1604
|
+
@staticmethod
def _density_plot(data,
                  length=10,
                  breadth=8,
                  numerical_features=[]):
    """
    DESCRIPTION:
        Internal function to visualize the data using density plot. One kernel
        density plot per numerical feature is drawn on a grid that is three
        columns wide; grid cells left over are hidden.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

        numerical_features:
            Optional Argument.
            Specifies the list of numerical features to be plotted.
            Types: list of str

    RETURNS:
        str, a message when no numerical feature is available.
        None when the density plots are drawn.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._density_plot(data=data,
                                          length=10,
                                          breadth=8,
                                          numerical_features=["feature1", "feature2"])

    """
    feature_count = len(numerical_features)
    if feature_count < 1:
        return "Plot type 'density' is not applicable as no numerical features are available."

    # Three plots per row, as many rows as needed
    grid_rows = math.ceil(feature_count / 3)
    fig, axes = plt.subplots(grid_rows, 3, figsize=(length, breadth))
    axes = axes.flatten()
    fig.suptitle("Density plot", fontsize=14)

    # The grid always has at least as many cells as features, so zip() covers
    # every feature.
    for axis, feature in zip(axes, numerical_features):
        sns.kdeplot(data[feature], fill=True, color="green", alpha=0.6, ax=axis)

    # Hide any empty subplots
    for axis in axes[feature_count:]:
        axis.axis('off')

    plt.tight_layout()
    plt.show()
    return None
|
|
1667
|
+
|
|
1668
|
+
@staticmethod
def _target_distribution(data,
                         target_column,
                         problem_type=None,
                         length=10,
                         breadth=8):
    """
    DESCRIPTION:
        Internal function to visualize the distribution of the target column.
        A count plot is drawn when the target is treated as categorical,
        otherwise a histogram with a KDE curve is drawn.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        target_column:
            Required Argument.
            Specifies the name of the target column in "data".
            Types: str

        problem_type:
            Optional Argument.
            Specifies the type of problem.
            When not specified, the target is treated as categorical
            if it has at most 20 unique values. The comparison with
            'classification' is case-insensitive.
            Permitted Values:
                * 'regression'
                * 'classification'
            Types: str

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

    RETURNS:
        None

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._target_distribution(data=data,
                                                 target_column="target",
                                                 problem_type="classification")
    """
    plt.figure(figsize=(length, breadth))

    # Decide how the target should be drawn. Without an explicit problem
    # type, the number of distinct target values is used as the heuristic.
    if problem_type is None:
        treat_as_categorical = data[target_column].nunique() <= 20
    else:
        treat_as_categorical = problem_type and problem_type.lower() == 'classification'

    if not treat_as_categorical:
        # Numerical Target
        sns.histplot(data[target_column], kde=True, color="blue")
    else:
        # Categorical Target
        countplot_options = dict(x=target_column,
                                 data=data,
                                 palette="coolwarm",
                                 hue=target_column,
                                 legend=False)
        sns.countplot(**countplot_options)

    plt.title("Target Distribution")
    plt.tight_layout()
    plt.show()
|
|
1733
|
+
|
|
1734
|
+
|
|
1735
|
+
@staticmethod
def _count_plot(data,
                length=10,
                breadth=8,
                categorical_features=None):
    """
    DESCRIPTION:
        Internal function to visualize the data using count plot.
        One bar chart is drawn per categorical feature on a grid with
        3 columns, showing at most the 25 most frequent categories of
        each feature.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Note:
                This value is currently not used by the function. The
                figure height is derived from the number of subplot
                rows (rows * 5).
            Default Value: 8
            Types: int

        categorical_features:
            Optional Argument.
            Specifies the list of categorical features to be plotted.
            When None or empty, nothing is plotted and a message is
            returned instead.
            Default Value: None
            Types: list of str

    RETURNS:
        None, when the plot is displayed.
        str, message stating the plot is not applicable, when no
        categorical features are available.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._count_plot(data=data,
                                        length=10,
                                        breadth=8,
                                        categorical_features=["feature1", "feature2"])
    """
    # Guard clause: a missing (None) or empty feature list means there is
    # nothing to plot. None is accepted so that the signature does not need
    # a shared mutable default.
    if categorical_features is None or len(categorical_features) == 0:
        return "Plot type 'count' is not applicable as no categorical features are available."

    # Upper bound on the number of categories drawn per feature.
    max_categories = 25

    rows = math.ceil(len(categorical_features) / 3)
    fig, axes = plt.subplots(rows, 3, figsize=(length, rows * 5))
    axes = axes.flatten()
    fig.suptitle("Count plot", fontsize=14)

    for ax, feature in zip(axes, categorical_features):
        # Get the most frequent categories (at most 25).
        top_categories = data[feature].value_counts().nlargest(max_categories)

        # Plot only those top categories.
        sns.barplot(x=top_categories.index,
                    y=top_categories.values,
                    hue=top_categories.index,
                    palette="coolwarm",
                    legend=False,
                    ax=ax)

        # Rotate labels for readability
        ax.tick_params(axis='x', rotation=90)

    # Hide empty subplots
    for ax in axes[len(categorical_features):]:
        ax.axis('off')

    # Adjust layout spacing
    plt.subplots_adjust(hspace=1.5, wspace=0.3)
    plt.show()
    return None
|
|
1809
|
+
|
|
1810
|
+
@staticmethod
def _box_plot(data,
              length=10,
              breadth=8,
              numerical_features=None):
    """
    DESCRIPTION:
        Internal function to visualize the data using box plot.
        One box plot is drawn per numerical feature on a grid with
        3 columns.

    PARAMETERS:
        data:
            Required Argument.
            Specifies the input pandas DataFrame for plotting.
            Types: pandas Dataframe

        length:
            Optional Argument.
            Specifies the length of the plot.
            Default Value: 10
            Types: int

        breadth:
            Optional Argument.
            Specifies the breadth of the plot.
            Default Value: 8
            Types: int

        numerical_features:
            Optional Argument.
            Specifies the list of numerical features to be plotted.
            When None or empty, nothing is plotted and a message is
            returned instead.
            Default Value: None
            Types: list of str

    RETURNS:
        None, when the plot is displayed.
        str, message stating the plot is not applicable, when no
        numerical features are available.

    RAISES:
        None

    EXAMPLES:
        >>> _FeatureExplore._box_plot(data=data,
                                      length=10,
                                      breadth=8,
                                      numerical_features=["feature1", "feature2"])
    """
    # Guard clause: a missing (None) or empty feature list means there is
    # nothing to plot. None is accepted so that the signature does not need
    # a shared mutable default.
    if numerical_features is None or len(numerical_features) == 0:
        return "Plot type 'box' is not applicable as no numerical features are available."

    rows = math.ceil(len(numerical_features) / 3)
    fig, axes = plt.subplots(rows, 3, figsize=(length, breadth))
    axes = axes.flatten()
    fig.suptitle("Box plot", fontsize=14)

    for ax, feature in zip(axes, numerical_features):
        # No hue is used; the feature values are passed on the y axis.
        sns.boxplot(y=data[feature], data=data, ax=ax, legend=False)

    # Adjust layout once all box plots are drawn to prevent label overlap.
    plt.tight_layout()

    # Hide any empty subplots
    for ax in axes[len(numerical_features):]:
        ax.axis('off')

    plt.show()
    return None
|