teradataml 17.20.0.6__py3-none-any.whl → 20.0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +238 -1
- teradataml/__init__.py +13 -3
- teradataml/_version.py +1 -1
- teradataml/analytics/Transformations.py +4 -4
- teradataml/analytics/__init__.py +0 -2
- teradataml/analytics/analytic_function_executor.py +3 -0
- teradataml/analytics/json_parser/utils.py +13 -12
- teradataml/analytics/sqle/DecisionTreePredict.py +15 -30
- teradataml/analytics/sqle/NaiveBayesPredict.py +11 -20
- teradataml/analytics/sqle/__init__.py +0 -13
- teradataml/analytics/utils.py +1 -0
- teradataml/analytics/valib.py +3 -0
- teradataml/automl/__init__.py +1628 -0
- teradataml/automl/custom_json_utils.py +1270 -0
- teradataml/automl/data_preparation.py +993 -0
- teradataml/automl/data_transformation.py +727 -0
- teradataml/automl/feature_engineering.py +1648 -0
- teradataml/automl/feature_exploration.py +547 -0
- teradataml/automl/model_evaluation.py +163 -0
- teradataml/automl/model_training.py +887 -0
- teradataml/catalog/__init__.py +0 -2
- teradataml/catalog/byom.py +49 -6
- teradataml/catalog/function_argument_mapper.py +0 -2
- teradataml/catalog/model_cataloging_utils.py +2 -1021
- teradataml/common/aed_utils.py +6 -2
- teradataml/common/constants.py +50 -58
- teradataml/common/deprecations.py +160 -0
- teradataml/common/garbagecollector.py +61 -104
- teradataml/common/messagecodes.py +27 -36
- teradataml/common/messages.py +11 -15
- teradataml/common/utils.py +205 -287
- teradataml/common/wrapper_utils.py +1 -110
- teradataml/context/context.py +150 -78
- teradataml/data/bank_churn.csv +10001 -0
- teradataml/data/bmi.csv +501 -0
- teradataml/data/docs/sqle/docs_17_10/BincodeFit.py +3 -3
- teradataml/data/docs/sqle/docs_17_10/BincodeTransform.py +6 -5
- teradataml/data/docs/sqle/docs_17_10/Fit.py +1 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +1 -1
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +1 -1
- teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesTransform.py +2 -2
- teradataml/data/docs/sqle/docs_17_10/RowNormalizeTransform.py +2 -1
- teradataml/data/docs/sqle/docs_17_10/ScaleTransform.py +1 -0
- teradataml/data/docs/sqle/docs_17_10/SimpleImputeTransform.py +1 -1
- teradataml/data/docs/sqle/docs_17_10/Transform.py +2 -1
- teradataml/data/docs/sqle/docs_17_20/BincodeFit.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/BincodeTransform.py +6 -5
- teradataml/data/docs/sqle/docs_17_20/Fit.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/GLM.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/GLMPredictPerSegment.py +9 -10
- teradataml/data/docs/sqle/docs_17_20/KMeansPredict.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierPredict.py +16 -15
- teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierTrainer.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineTransform.py +8 -8
- teradataml/data/docs/sqle/docs_17_20/OneClassSVMPredict.py +21 -20
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterTransform.py +8 -3
- teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesTransform.py +6 -5
- teradataml/data/docs/sqle/docs_17_20/RandomProjectionTransform.py +6 -6
- teradataml/data/docs/sqle/docs_17_20/RowNormalizeTransform.py +2 -1
- teradataml/data/docs/sqle/docs_17_20/SVM.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +16 -16
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +1 -0
- teradataml/data/docs/sqle/docs_17_20/SimpleImputeTransform.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/TDDecisionForestPredict.py +4 -4
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +19 -19
- teradataml/data/docs/sqle/docs_17_20/TargetEncodingTransform.py +5 -4
- teradataml/data/docs/sqle/docs_17_20/Transform.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +9 -9
- teradataml/data/fish.csv +160 -0
- teradataml/data/glass_types.csv +215 -0
- teradataml/data/insurance.csv +1 -1
- teradataml/data/iris_data.csv +151 -0
- teradataml/data/jsons/sqle/17.10/TD_FunctionTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_OutlierFilterTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_RowNormalizeTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_ScaleTransform.json +1 -0
- teradataml/data/jsons/sqle/17.10/TD_SimpleImputeTransform.json +1 -0
- teradataml/data/load_example_data.py +3 -0
- teradataml/data/multi_model_classification.csv +401 -0
- teradataml/data/multi_model_regression.csv +401 -0
- teradataml/data/openml_example.json +63 -0
- teradataml/data/scripts/deploy_script.py +65 -0
- teradataml/data/scripts/mapper.R +20 -0
- teradataml/data/scripts/sklearn/__init__.py +0 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +175 -0
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +135 -0
- teradataml/data/scripts/sklearn/sklearn_function.template +113 -0
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +158 -0
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +152 -0
- teradataml/data/scripts/sklearn/sklearn_score.py +128 -0
- teradataml/data/scripts/sklearn/sklearn_transform.py +179 -0
- teradataml/data/templates/open_source_ml.json +9 -0
- teradataml/data/teradataml_example.json +73 -1
- teradataml/data/test_classification.csv +101 -0
- teradataml/data/test_prediction.csv +101 -0
- teradataml/data/test_regression.csv +101 -0
- teradataml/data/train_multiclass.csv +101 -0
- teradataml/data/train_regression.csv +101 -0
- teradataml/data/train_regression_multiple_labels.csv +101 -0
- teradataml/data/wine_data.csv +1600 -0
- teradataml/dataframe/copy_to.py +79 -13
- teradataml/dataframe/data_transfer.py +8 -0
- teradataml/dataframe/dataframe.py +910 -311
- teradataml/dataframe/dataframe_utils.py +102 -5
- teradataml/dataframe/fastload.py +11 -3
- teradataml/dataframe/setop.py +15 -2
- teradataml/dataframe/sql.py +3735 -77
- teradataml/dataframe/sql_function_parameters.py +56 -5
- teradataml/dataframe/vantage_function_types.py +45 -1
- teradataml/dataframe/window.py +30 -29
- teradataml/dbutils/dbutils.py +18 -1
- teradataml/geospatial/geodataframe.py +18 -7
- teradataml/geospatial/geodataframecolumn.py +5 -0
- teradataml/hyperparameter_tuner/optimizer.py +910 -120
- teradataml/hyperparameter_tuner/utils.py +131 -37
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/__init__.py +1 -0
- teradataml/opensource/sklearn/__init__.py +1 -0
- teradataml/opensource/sklearn/_class.py +255 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +1668 -0
- teradataml/opensource/sklearn/_wrapper_utils.py +268 -0
- teradataml/opensource/sklearn/constants.py +54 -0
- teradataml/options/__init__.py +3 -6
- teradataml/options/configure.py +21 -20
- teradataml/scriptmgmt/UserEnv.py +61 -5
- teradataml/scriptmgmt/lls_utils.py +135 -53
- teradataml/table_operators/Apply.py +38 -6
- teradataml/table_operators/Script.py +45 -308
- teradataml/table_operators/TableOperator.py +182 -591
- teradataml/table_operators/__init__.py +0 -1
- teradataml/table_operators/table_operator_util.py +32 -40
- teradataml/utils/validators.py +127 -3
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/METADATA +243 -3
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/RECORD +147 -391
- teradataml/analytics/mle/AdaBoost.py +0 -651
- teradataml/analytics/mle/AdaBoostPredict.py +0 -564
- teradataml/analytics/mle/Antiselect.py +0 -342
- teradataml/analytics/mle/Arima.py +0 -641
- teradataml/analytics/mle/ArimaPredict.py +0 -477
- teradataml/analytics/mle/Attribution.py +0 -1070
- teradataml/analytics/mle/Betweenness.py +0 -658
- teradataml/analytics/mle/Burst.py +0 -711
- teradataml/analytics/mle/CCM.py +0 -600
- teradataml/analytics/mle/CCMPrepare.py +0 -324
- teradataml/analytics/mle/CFilter.py +0 -460
- teradataml/analytics/mle/ChangePointDetection.py +0 -572
- teradataml/analytics/mle/ChangePointDetectionRT.py +0 -477
- teradataml/analytics/mle/Closeness.py +0 -737
- teradataml/analytics/mle/ConfusionMatrix.py +0 -420
- teradataml/analytics/mle/Correlation.py +0 -477
- teradataml/analytics/mle/Correlation2.py +0 -573
- teradataml/analytics/mle/CoxHazardRatio.py +0 -679
- teradataml/analytics/mle/CoxPH.py +0 -556
- teradataml/analytics/mle/CoxSurvival.py +0 -478
- teradataml/analytics/mle/CumulativeMovAvg.py +0 -363
- teradataml/analytics/mle/DTW.py +0 -623
- teradataml/analytics/mle/DWT.py +0 -564
- teradataml/analytics/mle/DWT2D.py +0 -599
- teradataml/analytics/mle/DecisionForest.py +0 -716
- teradataml/analytics/mle/DecisionForestEvaluator.py +0 -363
- teradataml/analytics/mle/DecisionForestPredict.py +0 -561
- teradataml/analytics/mle/DecisionTree.py +0 -830
- teradataml/analytics/mle/DecisionTreePredict.py +0 -528
- teradataml/analytics/mle/ExponentialMovAvg.py +0 -418
- teradataml/analytics/mle/FMeasure.py +0 -402
- teradataml/analytics/mle/FPGrowth.py +0 -734
- teradataml/analytics/mle/FrequentPaths.py +0 -695
- teradataml/analytics/mle/GLM.py +0 -558
- teradataml/analytics/mle/GLML1L2.py +0 -547
- teradataml/analytics/mle/GLML1L2Predict.py +0 -519
- teradataml/analytics/mle/GLMPredict.py +0 -529
- teradataml/analytics/mle/HMMDecoder.py +0 -945
- teradataml/analytics/mle/HMMEvaluator.py +0 -901
- teradataml/analytics/mle/HMMSupervised.py +0 -521
- teradataml/analytics/mle/HMMUnsupervised.py +0 -572
- teradataml/analytics/mle/Histogram.py +0 -561
- teradataml/analytics/mle/IDWT.py +0 -476
- teradataml/analytics/mle/IDWT2D.py +0 -493
- teradataml/analytics/mle/IdentityMatch.py +0 -763
- teradataml/analytics/mle/Interpolator.py +0 -918
- teradataml/analytics/mle/KMeans.py +0 -485
- teradataml/analytics/mle/KNN.py +0 -627
- teradataml/analytics/mle/KNNRecommender.py +0 -488
- teradataml/analytics/mle/KNNRecommenderPredict.py +0 -581
- teradataml/analytics/mle/LAR.py +0 -439
- teradataml/analytics/mle/LARPredict.py +0 -478
- teradataml/analytics/mle/LDA.py +0 -548
- teradataml/analytics/mle/LDAInference.py +0 -492
- teradataml/analytics/mle/LDATopicSummary.py +0 -464
- teradataml/analytics/mle/LevenshteinDistance.py +0 -450
- teradataml/analytics/mle/LinReg.py +0 -433
- teradataml/analytics/mle/LinRegPredict.py +0 -438
- teradataml/analytics/mle/MinHash.py +0 -544
- teradataml/analytics/mle/Modularity.py +0 -587
- teradataml/analytics/mle/NEREvaluator.py +0 -410
- teradataml/analytics/mle/NERExtractor.py +0 -595
- teradataml/analytics/mle/NERTrainer.py +0 -458
- teradataml/analytics/mle/NGrams.py +0 -570
- teradataml/analytics/mle/NPath.py +0 -634
- teradataml/analytics/mle/NTree.py +0 -549
- teradataml/analytics/mle/NaiveBayes.py +0 -462
- teradataml/analytics/mle/NaiveBayesPredict.py +0 -513
- teradataml/analytics/mle/NaiveBayesTextClassifier.py +0 -607
- teradataml/analytics/mle/NaiveBayesTextClassifier2.py +0 -531
- teradataml/analytics/mle/NaiveBayesTextClassifierPredict.py +0 -799
- teradataml/analytics/mle/NamedEntityFinder.py +0 -529
- teradataml/analytics/mle/NamedEntityFinderEvaluator.py +0 -414
- teradataml/analytics/mle/NamedEntityFinderTrainer.py +0 -396
- teradataml/analytics/mle/POSTagger.py +0 -417
- teradataml/analytics/mle/Pack.py +0 -411
- teradataml/analytics/mle/PageRank.py +0 -535
- teradataml/analytics/mle/PathAnalyzer.py +0 -426
- teradataml/analytics/mle/PathGenerator.py +0 -367
- teradataml/analytics/mle/PathStart.py +0 -464
- teradataml/analytics/mle/PathSummarizer.py +0 -470
- teradataml/analytics/mle/Pivot.py +0 -471
- teradataml/analytics/mle/ROC.py +0 -425
- teradataml/analytics/mle/RandomSample.py +0 -637
- teradataml/analytics/mle/RandomWalkSample.py +0 -490
- teradataml/analytics/mle/SAX.py +0 -779
- teradataml/analytics/mle/SVMDense.py +0 -677
- teradataml/analytics/mle/SVMDensePredict.py +0 -536
- teradataml/analytics/mle/SVMDenseSummary.py +0 -437
- teradataml/analytics/mle/SVMSparse.py +0 -557
- teradataml/analytics/mle/SVMSparsePredict.py +0 -553
- teradataml/analytics/mle/SVMSparseSummary.py +0 -435
- teradataml/analytics/mle/Sampling.py +0 -549
- teradataml/analytics/mle/Scale.py +0 -565
- teradataml/analytics/mle/ScaleByPartition.py +0 -496
- teradataml/analytics/mle/ScaleMap.py +0 -378
- teradataml/analytics/mle/ScaleSummary.py +0 -320
- teradataml/analytics/mle/SentenceExtractor.py +0 -363
- teradataml/analytics/mle/SentimentEvaluator.py +0 -432
- teradataml/analytics/mle/SentimentExtractor.py +0 -578
- teradataml/analytics/mle/SentimentTrainer.py +0 -405
- teradataml/analytics/mle/SeriesSplitter.py +0 -641
- teradataml/analytics/mle/Sessionize.py +0 -475
- teradataml/analytics/mle/SimpleMovAvg.py +0 -397
- teradataml/analytics/mle/StringSimilarity.py +0 -425
- teradataml/analytics/mle/TF.py +0 -389
- teradataml/analytics/mle/TFIDF.py +0 -504
- teradataml/analytics/mle/TextChunker.py +0 -414
- teradataml/analytics/mle/TextClassifier.py +0 -399
- teradataml/analytics/mle/TextClassifierEvaluator.py +0 -413
- teradataml/analytics/mle/TextClassifierTrainer.py +0 -565
- teradataml/analytics/mle/TextMorph.py +0 -494
- teradataml/analytics/mle/TextParser.py +0 -623
- teradataml/analytics/mle/TextTagger.py +0 -530
- teradataml/analytics/mle/TextTokenizer.py +0 -502
- teradataml/analytics/mle/UnivariateStatistics.py +0 -488
- teradataml/analytics/mle/Unpack.py +0 -526
- teradataml/analytics/mle/Unpivot.py +0 -438
- teradataml/analytics/mle/VarMax.py +0 -776
- teradataml/analytics/mle/VectorDistance.py +0 -762
- teradataml/analytics/mle/WeightedMovAvg.py +0 -400
- teradataml/analytics/mle/XGBoost.py +0 -842
- teradataml/analytics/mle/XGBoostPredict.py +0 -627
- teradataml/analytics/mle/__init__.py +0 -123
- teradataml/analytics/mle/json/adaboost_mle.json +0 -135
- teradataml/analytics/mle/json/adaboostpredict_mle.json +0 -85
- teradataml/analytics/mle/json/antiselect_mle.json +0 -34
- teradataml/analytics/mle/json/antiselect_mle_mle.json +0 -34
- teradataml/analytics/mle/json/arima_mle.json +0 -172
- teradataml/analytics/mle/json/arimapredict_mle.json +0 -52
- teradataml/analytics/mle/json/attribution_mle_mle.json +0 -143
- teradataml/analytics/mle/json/betweenness_mle.json +0 -97
- teradataml/analytics/mle/json/burst_mle.json +0 -140
- teradataml/analytics/mle/json/ccm_mle.json +0 -124
- teradataml/analytics/mle/json/ccmprepare_mle.json +0 -14
- teradataml/analytics/mle/json/cfilter_mle.json +0 -93
- teradataml/analytics/mle/json/changepointdetection_mle.json +0 -92
- teradataml/analytics/mle/json/changepointdetectionrt_mle.json +0 -78
- teradataml/analytics/mle/json/closeness_mle.json +0 -104
- teradataml/analytics/mle/json/confusionmatrix_mle.json +0 -79
- teradataml/analytics/mle/json/correlation_mle.json +0 -86
- teradataml/analytics/mle/json/correlationreduce_mle.json +0 -49
- teradataml/analytics/mle/json/coxhazardratio_mle.json +0 -89
- teradataml/analytics/mle/json/coxph_mle.json +0 -98
- teradataml/analytics/mle/json/coxsurvival_mle.json +0 -79
- teradataml/analytics/mle/json/cumulativemovavg_mle.json +0 -34
- teradataml/analytics/mle/json/decisionforest_mle.json +0 -167
- teradataml/analytics/mle/json/decisionforestevaluator_mle.json +0 -33
- teradataml/analytics/mle/json/decisionforestpredict_mle_mle.json +0 -74
- teradataml/analytics/mle/json/decisiontree_mle.json +0 -194
- teradataml/analytics/mle/json/decisiontreepredict_mle_mle.json +0 -86
- teradataml/analytics/mle/json/dtw_mle.json +0 -97
- teradataml/analytics/mle/json/dwt2d_mle.json +0 -116
- teradataml/analytics/mle/json/dwt_mle.json +0 -101
- teradataml/analytics/mle/json/exponentialmovavg_mle.json +0 -55
- teradataml/analytics/mle/json/fmeasure_mle.json +0 -58
- teradataml/analytics/mle/json/fpgrowth_mle.json +0 -159
- teradataml/analytics/mle/json/frequentpaths_mle.json +0 -129
- teradataml/analytics/mle/json/glm_mle.json +0 -111
- teradataml/analytics/mle/json/glml1l2_mle.json +0 -106
- teradataml/analytics/mle/json/glml1l2predict_mle.json +0 -57
- teradataml/analytics/mle/json/glmpredict_mle_mle.json +0 -74
- teradataml/analytics/mle/json/histogram_mle.json +0 -100
- teradataml/analytics/mle/json/hmmdecoder_mle.json +0 -192
- teradataml/analytics/mle/json/hmmevaluator_mle.json +0 -206
- teradataml/analytics/mle/json/hmmsupervised_mle.json +0 -91
- teradataml/analytics/mle/json/hmmunsupervised_mle.json +0 -114
- teradataml/analytics/mle/json/identitymatch_mle.json +0 -88
- teradataml/analytics/mle/json/idwt2d_mle.json +0 -73
- teradataml/analytics/mle/json/idwt_mle.json +0 -66
- teradataml/analytics/mle/json/interpolator_mle.json +0 -151
- teradataml/analytics/mle/json/kmeans_mle.json +0 -97
- teradataml/analytics/mle/json/knn_mle.json +0 -141
- teradataml/analytics/mle/json/knnrecommender_mle.json +0 -111
- teradataml/analytics/mle/json/knnrecommenderpredict_mle.json +0 -75
- teradataml/analytics/mle/json/lar_mle.json +0 -78
- teradataml/analytics/mle/json/larpredict_mle.json +0 -69
- teradataml/analytics/mle/json/lda_mle.json +0 -130
- teradataml/analytics/mle/json/ldainference_mle.json +0 -78
- teradataml/analytics/mle/json/ldatopicsummary_mle.json +0 -64
- teradataml/analytics/mle/json/levenshteindistance_mle.json +0 -92
- teradataml/analytics/mle/json/linreg_mle.json +0 -42
- teradataml/analytics/mle/json/linregpredict_mle.json +0 -56
- teradataml/analytics/mle/json/minhash_mle.json +0 -113
- teradataml/analytics/mle/json/modularity_mle.json +0 -91
- teradataml/analytics/mle/json/naivebayespredict_mle_mle.json +0 -85
- teradataml/analytics/mle/json/naivebayesreduce_mle.json +0 -52
- teradataml/analytics/mle/json/naivebayestextclassifierpredict_mle_mle.json +0 -147
- teradataml/analytics/mle/json/naivebayestextclassifiertrainer2_mle.json +0 -108
- teradataml/analytics/mle/json/naivebayestextclassifiertrainer_mle.json +0 -102
- teradataml/analytics/mle/json/namedentityfinder_mle.json +0 -84
- teradataml/analytics/mle/json/namedentityfinderevaluatorreduce_mle.json +0 -43
- teradataml/analytics/mle/json/namedentityfindertrainer_mle.json +0 -64
- teradataml/analytics/mle/json/nerevaluator_mle.json +0 -54
- teradataml/analytics/mle/json/nerextractor_mle.json +0 -87
- teradataml/analytics/mle/json/nertrainer_mle.json +0 -89
- teradataml/analytics/mle/json/ngrams_mle.json +0 -137
- teradataml/analytics/mle/json/ngramsplitter_mle_mle.json +0 -137
- teradataml/analytics/mle/json/npath@coprocessor_mle.json +0 -73
- teradataml/analytics/mle/json/ntree@coprocessor_mle.json +0 -123
- teradataml/analytics/mle/json/pack_mle.json +0 -58
- teradataml/analytics/mle/json/pack_mle_mle.json +0 -58
- teradataml/analytics/mle/json/pagerank_mle.json +0 -81
- teradataml/analytics/mle/json/pathanalyzer_mle.json +0 -63
- teradataml/analytics/mle/json/pathgenerator_mle.json +0 -40
- teradataml/analytics/mle/json/pathstart_mle.json +0 -62
- teradataml/analytics/mle/json/pathsummarizer_mle.json +0 -72
- teradataml/analytics/mle/json/pivoting_mle.json +0 -71
- teradataml/analytics/mle/json/postagger_mle.json +0 -51
- teradataml/analytics/mle/json/randomsample_mle.json +0 -131
- teradataml/analytics/mle/json/randomwalksample_mle.json +0 -85
- teradataml/analytics/mle/json/roc_mle.json +0 -73
- teradataml/analytics/mle/json/sampling_mle.json +0 -75
- teradataml/analytics/mle/json/sax_mle.json +0 -154
- teradataml/analytics/mle/json/scale_mle.json +0 -93
- teradataml/analytics/mle/json/scalebypartition_mle.json +0 -89
- teradataml/analytics/mle/json/scalemap_mle.json +0 -44
- teradataml/analytics/mle/json/scalesummary_mle.json +0 -14
- teradataml/analytics/mle/json/sentenceextractor_mle.json +0 -41
- teradataml/analytics/mle/json/sentimentevaluator_mle.json +0 -43
- teradataml/analytics/mle/json/sentimentextractor_mle.json +0 -100
- teradataml/analytics/mle/json/sentimenttrainer_mle.json +0 -68
- teradataml/analytics/mle/json/seriessplitter_mle.json +0 -133
- teradataml/analytics/mle/json/sessionize_mle_mle.json +0 -62
- teradataml/analytics/mle/json/simplemovavg_mle.json +0 -48
- teradataml/analytics/mle/json/stringsimilarity_mle.json +0 -50
- teradataml/analytics/mle/json/stringsimilarity_mle_mle.json +0 -50
- teradataml/analytics/mle/json/svmdense_mle.json +0 -165
- teradataml/analytics/mle/json/svmdensepredict_mle.json +0 -95
- teradataml/analytics/mle/json/svmdensesummary_mle.json +0 -58
- teradataml/analytics/mle/json/svmsparse_mle.json +0 -148
- teradataml/analytics/mle/json/svmsparsepredict_mle_mle.json +0 -103
- teradataml/analytics/mle/json/svmsparsesummary_mle.json +0 -57
- teradataml/analytics/mle/json/textchunker_mle.json +0 -40
- teradataml/analytics/mle/json/textclassifier_mle.json +0 -51
- teradataml/analytics/mle/json/textclassifierevaluator_mle.json +0 -43
- teradataml/analytics/mle/json/textclassifiertrainer_mle.json +0 -103
- teradataml/analytics/mle/json/textmorph_mle.json +0 -63
- teradataml/analytics/mle/json/textparser_mle.json +0 -166
- teradataml/analytics/mle/json/texttagger_mle.json +0 -81
- teradataml/analytics/mle/json/texttokenizer_mle.json +0 -91
- teradataml/analytics/mle/json/tf_mle.json +0 -33
- teradataml/analytics/mle/json/tfidf_mle.json +0 -34
- teradataml/analytics/mle/json/univariatestatistics_mle.json +0 -81
- teradataml/analytics/mle/json/unpack_mle.json +0 -91
- teradataml/analytics/mle/json/unpack_mle_mle.json +0 -91
- teradataml/analytics/mle/json/unpivoting_mle.json +0 -63
- teradataml/analytics/mle/json/varmax_mle.json +0 -176
- teradataml/analytics/mle/json/vectordistance_mle.json +0 -179
- teradataml/analytics/mle/json/weightedmovavg_mle.json +0 -48
- teradataml/analytics/mle/json/xgboost_mle.json +0 -178
- teradataml/analytics/mle/json/xgboostpredict_mle.json +0 -104
- teradataml/analytics/sqle/Antiselect.py +0 -321
- teradataml/analytics/sqle/Attribution.py +0 -603
- teradataml/analytics/sqle/DecisionForestPredict.py +0 -408
- teradataml/analytics/sqle/GLMPredict.py +0 -430
- teradataml/analytics/sqle/MovingAverage.py +0 -543
- teradataml/analytics/sqle/NGramSplitter.py +0 -548
- teradataml/analytics/sqle/NPath.py +0 -632
- teradataml/analytics/sqle/NaiveBayesTextClassifierPredict.py +0 -515
- teradataml/analytics/sqle/Pack.py +0 -388
- teradataml/analytics/sqle/SVMSparsePredict.py +0 -464
- teradataml/analytics/sqle/Sessionize.py +0 -390
- teradataml/analytics/sqle/StringSimilarity.py +0 -400
- teradataml/analytics/sqle/Unpack.py +0 -503
- teradataml/analytics/sqle/json/antiselect_sqle.json +0 -21
- teradataml/analytics/sqle/json/attribution_sqle.json +0 -92
- teradataml/analytics/sqle/json/decisionforestpredict_sqle.json +0 -48
- teradataml/analytics/sqle/json/glmpredict_sqle.json +0 -48
- teradataml/analytics/sqle/json/h2opredict_sqle.json +0 -63
- teradataml/analytics/sqle/json/movingaverage_sqle.json +0 -58
- teradataml/analytics/sqle/json/naivebayestextclassifierpredict_sqle.json +0 -76
- teradataml/analytics/sqle/json/ngramsplitter_sqle.json +0 -126
- teradataml/analytics/sqle/json/npath_sqle.json +0 -67
- teradataml/analytics/sqle/json/pack_sqle.json +0 -47
- teradataml/analytics/sqle/json/pmmlpredict_sqle.json +0 -55
- teradataml/analytics/sqle/json/sessionize_sqle.json +0 -43
- teradataml/analytics/sqle/json/stringsimilarity_sqle.json +0 -39
- teradataml/analytics/sqle/json/svmsparsepredict_sqle.json +0 -74
- teradataml/analytics/sqle/json/unpack_sqle.json +0 -80
- teradataml/catalog/model_cataloging.py +0 -980
- teradataml/config/mlengine_alias_definitions_v1.0 +0 -118
- teradataml/config/mlengine_alias_definitions_v1.1 +0 -127
- teradataml/config/mlengine_alias_definitions_v1.3 +0 -129
- teradataml/table_operators/sandbox_container_util.py +0 -643
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/WHEEL +0 -0
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/top_level.txt +0 -0
- {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/zip-safe +0 -0
|
@@ -1,637 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/python
|
|
2
|
-
# ##################################################################
|
|
3
|
-
#
|
|
4
|
-
# Copyright 2018 Teradata. All rights reserved.
|
|
5
|
-
# TERADATA CONFIDENTIAL AND TRADE SECRET
|
|
6
|
-
#
|
|
7
|
-
# Primary Owner: Mounika Kotha (mounika.kotha@teradata.com)
|
|
8
|
-
# Secondary Owner: Pankaj Purandare (pankajvinod.purandare@teradata.com)
|
|
9
|
-
#
|
|
10
|
-
# Version: 1.2
|
|
11
|
-
# Function Version: 1.5
|
|
12
|
-
#
|
|
13
|
-
# ##################################################################
|
|
14
|
-
|
|
15
|
-
import inspect
|
|
16
|
-
import time
|
|
17
|
-
from teradataml.common.wrapper_utils import AnalyticsWrapperUtils
|
|
18
|
-
from teradataml.common.utils import UtilFuncs
|
|
19
|
-
from teradataml.context.context import *
|
|
20
|
-
from teradataml.dataframe.dataframe import DataFrame
|
|
21
|
-
from teradataml.common.aed_utils import AedUtils
|
|
22
|
-
from teradataml.analytics.analytic_query_generator import AnalyticQueryGenerator
|
|
23
|
-
from teradataml.common.exceptions import TeradataMlException
|
|
24
|
-
from teradataml.common.messages import Messages
|
|
25
|
-
from teradataml.common.messagecodes import MessageCodes
|
|
26
|
-
from teradataml.common.constants import TeradataConstants
|
|
27
|
-
from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils
|
|
28
|
-
from teradataml.options.display import display
|
|
29
|
-
|
|
30
|
-
class RandomSample:
|
|
31
|
-
|
|
32
|
-
def __init__(self,
|
|
33
|
-
data = None,
|
|
34
|
-
num_sample = None,
|
|
35
|
-
weight_column = None,
|
|
36
|
-
sampling_mode = "Basic",
|
|
37
|
-
distance = "EUCLIDEAN",
|
|
38
|
-
input_columns = None,
|
|
39
|
-
as_categories = None,
|
|
40
|
-
category_weights = None,
|
|
41
|
-
categorical_distance = "OVERLAP",
|
|
42
|
-
seed = None,
|
|
43
|
-
seed_column = None,
|
|
44
|
-
over_sampling_rate = 1.0,
|
|
45
|
-
iteration_num = 5,
|
|
46
|
-
setid_as_first_column = True,
|
|
47
|
-
data_sequence_column = None):
|
|
48
|
-
"""
|
|
49
|
-
DESCRIPTION:
|
|
50
|
-
The RandomSample function takes a data set and uses a specified
|
|
51
|
-
sampling method to output one or more random samples. Each sample has
|
|
52
|
-
exactly the number of rows specified.
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
PARAMETERS:
|
|
56
|
-
data:
|
|
57
|
-
Required Argument.
|
|
58
|
-
Specifies the name of the teradataml DataFrame that contains the data
|
|
59
|
-
set from which to take samples.
|
|
60
|
-
|
|
61
|
-
num_sample:
|
|
62
|
-
Required Argument.
|
|
63
|
-
Specifies both the number of samples and their sizes. For each
|
|
64
|
-
sample_size (an int value), the function selects a sample that has
|
|
65
|
-
sample_size rows.
|
|
66
|
-
Types: int OR list of Integers (int)
|
|
67
|
-
|
|
68
|
-
weight_column:
|
|
69
|
-
Optional Argument.
|
|
70
|
-
Specifies the name of the teradataml DataFrame column that
|
|
71
|
-
contains weights for weighted sampling. The weight_column must
|
|
72
|
-
have a numeric SQL data type. By default, rows have equal weight.
|
|
73
|
-
Types: str
|
|
74
|
-
|
|
75
|
-
sampling_mode:
|
|
76
|
-
Optional Argument.
|
|
77
|
-
Specifies the sampling mode and can be one of the following:
|
|
78
|
-
• "Basic": Each input_table row has a probability of being
|
|
79
|
-
selected that is proportional to its weight. The weight
|
|
80
|
-
of each row is in weight_column.
|
|
81
|
-
• "KMeans++": One row is selected in each of k iterations,
|
|
82
|
-
where k is the number of desired output rows. The first
|
|
83
|
-
row is selected randomly. In subsequent iterations, the
|
|
84
|
-
probability of a row being selected is proportional to the
|
|
85
|
-
value in the weight_column multiplied by the distance from
|
|
86
|
-
the nearest row in the set of selected rows. The distance
|
|
87
|
-
is calculated using the methods specified by the distance
|
|
88
|
-
and categorical_distance arguments.
|
|
89
|
-
• "KMeans||": Enhanced version of KMeans++ that exploits
|
|
90
|
-
parallel architecture to accelerate the sampling process.
|
|
91
|
-
The algorithm is described in the paper Scalable KMeans++
|
|
92
|
-
by Bahmani et al (http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf).
|
|
93
|
-
Briefly, at each iteration, the probability that a row is
|
|
94
|
-
selected is proportional to the value in the weight_column
|
|
95
|
-
multiplied by the distance from the nearest row in the set of
|
|
96
|
-
selected rows (as in KMeans++). However, the KMeans|| algorithm
|
|
97
|
-
oversamples at each iteration, significantly reducing the
|
|
98
|
-
required number of iterations; therefore, the resulting set of
|
|
99
|
-
rows might have more than k data points. Each row in the
|
|
100
|
-
resulting set is then weighted by the number of rows in the
|
|
101
|
-
teradataml DataFrame that are closer to that row than to any
|
|
102
|
-
other selected row, and the rows are clustered to produce
|
|
103
|
-
exactly k rows.
|
|
104
|
-
Tip: For optimal performance, use "KMeans++" when the
|
|
105
|
-
desired sample size is less than 15 and "KMeans||" otherwise.
|
|
106
|
-
Default Value: "Basic"
|
|
107
|
-
Permitted Values: Basic, KMeans++, KMeans||
|
|
108
|
-
Types: str
|
|
109
|
-
|
|
110
|
-
distance:
|
|
111
|
-
Optional Argument.
|
|
112
|
-
For KMeans++ and KMeans|| sampling, specifies the function for
|
|
113
|
-
computing the distance between numerical variables:
|
|
114
|
-
• 'EUCLIDEAN' : The distance between two variables is defined
|
|
115
|
-
using Euclidean Distance.
|
|
116
|
-
• 'MANHATTAN': The distance between two variables is defined
|
|
117
|
-
using Manhattan Distance.
|
|
118
|
-
Default Value: "EUCLIDEAN"
|
|
119
|
-
Permitted Values: MANHATTAN, EUCLIDEAN
|
|
120
|
-
Types: str
|
|
121
|
-
|
|
122
|
-
input_columns:
|
|
123
|
-
Optional Argument.
|
|
124
|
-
For KMeans++ and KMeans|| sampling, specifies the names of the
|
|
125
|
-
teradataml DataFrame columns to calculate the distance between
|
|
126
|
-
numerical variables.
|
|
127
|
-
Types: str OR list of Strings (str)
|
|
128
|
-
|
|
129
|
-
as_categories:
|
|
130
|
-
Optional Argument.
|
|
131
|
-
For KMeans++ and KMeans|| sampling, specifies the names of the
|
|
132
|
-
teradataml DataFrame columns that contain numerical variables
|
|
133
|
-
to treat as categorical variables.
|
|
134
|
-
Types: str OR list of Strings (str)
|
|
135
|
-
|
|
136
|
-
category_weights:
|
|
137
|
-
Optional Argument.
|
|
138
|
-
For KMeans++ and KMeans|| sampling, specifies the weights
|
|
139
|
-
(float values) of the categorical variables, including those
|
|
140
|
-
that 'as_categories' argument specifies. Specify the weights in
|
|
141
|
-
the order (from left to right) that the variables appear in the
|
|
142
|
-
input teradataml DataFrame. When calculating the distance between
|
|
143
|
-
two rows, distances between categorical values are scaled by
|
|
144
|
-
these weights.
|
|
145
|
-
Types: float or list of Floats (float).
|
|
146
|
-
|
|
147
|
-
categorical_distance:
|
|
148
|
-
Optional Argument.
|
|
149
|
-
For KMeans++ and KMeans|| sampling, specifies the function for
|
|
150
|
-
computing the distance between categorical variables:
|
|
151
|
-
• "OVERLAP" : The distance between two variables is 0 if
|
|
152
|
-
they are the same and 1 if they are different.
|
|
153
|
-
• "HAMMING": The distance between two variables is the Hamming
|
|
154
|
-
distance between the strings that represent them. The
|
|
155
|
-
strings must have equal length.
|
|
156
|
-
Default Value: "OVERLAP"
|
|
157
|
-
Permitted Values: OVERLAP, HAMMING
|
|
158
|
-
Types: str
|
|
159
|
-
|
|
160
|
-
seed:
|
|
161
|
-
Optional Argument.
|
|
162
|
-
Specifies the random seed used to initialize the algorithm.
|
|
163
|
-
Types: int
|
|
164
|
-
|
|
165
|
-
seed_column:
|
|
166
|
-
Optional Argument.
|
|
167
|
-
Specifies the names of the teradataml DataFrame columns by
|
|
168
|
-
which to partition the input. Function calls that use the same
|
|
169
|
-
input data, seed, and seed_column output the same result. If
|
|
170
|
-
you specify seed_column, you must also specify seed.
|
|
171
|
-
Note: Ideally, the number of distinct values in the seed_column
|
|
172
|
-
is the same as the number of workers in the cluster. A very
|
|
173
|
-
large number of distinct values in the seed_column degrades
|
|
174
|
-
function performance.
|
|
175
|
-
Types: str OR list of Strings (str)
|
|
176
|
-
|
|
177
|
-
over_sampling_rate:
|
|
178
|
-
Optional Argument.
|
|
179
|
-
For KMeans|| sampling, specifies the oversampling rate (a float
|
|
180
|
-
value greater than 0.0). The function multiplies rate by
|
|
181
|
-
sample size (for each sample size).
|
|
182
|
-
Default Value: 1.0
|
|
183
|
-
Types: float
|
|
184
|
-
|
|
185
|
-
iteration_num:
|
|
186
|
-
Optional Argument.
|
|
187
|
-
For KMeans|| sampling, specifies the number of iterations (an
|
|
188
|
-
int value greater than 0).
|
|
189
|
-
Default Value: 5
|
|
190
|
-
Types: int
|
|
191
|
-
|
|
192
|
-
setid_as_first_column:
|
|
193
|
-
Optional Argument.
|
|
194
|
-
Specifies whether the generated set_id values are to be included as the first
|
|
195
|
-
column in output.
|
|
196
|
-
Note: "setid_as_first_column" argument support is only available
|
|
197
|
-
when teradataml is connected to Vantage 1.1 or later.
|
|
198
|
-
Default Value: True
|
|
199
|
-
Types: bool
|
|
200
|
-
|
|
201
|
-
data_sequence_column:
|
|
202
|
-
Optional Argument.
|
|
203
|
-
Specifies the list of column(s) that uniquely identifies each
|
|
204
|
-
row of the input argument "data". The argument is used to ensure
|
|
205
|
-
deterministic results for functions which produce results that
|
|
206
|
-
vary from run to run.
|
|
207
|
-
Types: str OR list of Strings (str)
|
|
208
|
-
|
|
209
|
-
RETURNS:
|
|
210
|
-
Instance of RandomSample.
|
|
211
|
-
Output teradataml DataFrames can be accessed using attribute
|
|
212
|
-
references, such as RandomSampleObj.<attribute_name>.
|
|
213
|
-
Output teradataml DataFrame attribute name is:
|
|
214
|
-
result
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
RAISES:
|
|
218
|
-
TeradataMlException
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
EXAMPLES:
|
|
222
|
-
# Load example data.
|
|
223
|
-
load_example_data("randomsample", ["fs_input", "fs_input1"])
|
|
224
|
-
|
|
225
|
-
# Create teradataml DataFrame objects. The input tables have
|
|
226
|
-
# observations of 11 variables for different models of cars.
|
|
227
|
-
fs_input = DataFrame.from_table("fs_input")
|
|
228
|
-
fs_input1 = DataFrame.from_table("fs_input1")
|
|
229
|
-
|
|
230
|
-
# Example 1 - Basic Sampling (Weighted).
|
|
231
|
-
# This example uses basic sampling to select one sample of 10 rows,
|
|
232
|
-
# which are weighted by car weight.
|
|
233
|
-
RandomSample_out1 = RandomSample(data = fs_input,
|
|
234
|
-
num_sample = 10,
|
|
235
|
-
weight_column = "wt",
|
|
236
|
-
sampling_mode = "basic",
|
|
237
|
-
seed = 1,
|
|
238
|
-
seed_column = ["model"])
|
|
239
|
-
|
|
240
|
-
# Print the result DataFrame
|
|
241
|
-
print(RandomSample_out1)
|
|
242
|
-
|
|
243
|
-
# Example 2 - KMeans++ Sampling.
|
|
244
|
-
# This example uses KMeans++ sampling with the Manhattan
|
|
245
|
-
# distance metric, and treats the numeric variables cyl,
|
|
246
|
-
# gear, and carb as categorical variables.
|
|
247
|
-
RandomSample_out2 = RandomSample(data = fs_input,
|
|
248
|
-
num_sample = 10,
|
|
249
|
-
sampling_mode = "KMeans++",
|
|
250
|
-
distance = "manhattan",
|
|
251
|
-
input_columns = ['mpg','cyl','disp','hp','drat','wt','qsec','vs','am','gear','carb'],
|
|
252
|
-
as_categories = ["cyl","gear","carb"],
|
|
253
|
-
category_weights = [1000.0,10.0,100.0,100.0,100.0],
|
|
254
|
-
seed = 1,
|
|
255
|
-
seed_column = ["model"]
|
|
256
|
-
)
|
|
257
|
-
|
|
258
|
-
# Print the result DataFrame
|
|
259
|
-
print(RandomSample_out2.result)
|
|
260
|
-
|
|
261
|
-
# Example 3 - KMeans|| Sampling.
|
|
262
|
-
# This example uses KMeans|| sampling with the Manhattan
|
|
263
|
-
# distance metric for the numerical variables and the Hamming
|
|
264
|
-
# distance metric for the categorical variables.
|
|
265
|
-
RandomSample_out3 = RandomSample(data = fs_input1,
|
|
266
|
-
num_sample = 20,
|
|
267
|
-
sampling_mode = "KMeans||",
|
|
268
|
-
distance = "MANHATTAN",
|
|
269
|
-
input_columns = ['mpg','cyl','disp','hp','drat','wt','qsec','vs','am','gear','carb'],
|
|
270
|
-
as_categories = ["cyl","gear","carb"],
|
|
271
|
-
category_weights = [1000.0,10.0,100.0,100.0,100.0],
|
|
272
|
-
categorical_distance = "HAMMING",
|
|
273
|
-
seed = 1,
|
|
274
|
-
seed_column = ["model"],
|
|
275
|
-
iteration_num = 2
|
|
276
|
-
)
|
|
277
|
-
|
|
278
|
-
# Print the result DataFrame
|
|
279
|
-
print(RandomSample_out3.result)
|
|
280
|
-
|
|
281
|
-
# Example 4 - This example uses basic sampling to select 3 sample
|
|
282
|
-
# sets of sizes 2, 3 and 1 rows, weighted by car weight.
|
|
283
|
-
RandomSample_out4 = RandomSample(data = fs_input,
|
|
284
|
-
num_sample = [2,3,1],
|
|
285
|
-
weight_column = "wt"
|
|
286
|
-
)
|
|
287
|
-
|
|
288
|
-
# Print the result DataFrame
|
|
289
|
-
print(RandomSample_out4)
|
|
290
|
-
|
|
291
|
-
"""
|
|
292
|
-
|
|
293
|
-
# Start the timer to get the build time
|
|
294
|
-
_start_time = time.time()
|
|
295
|
-
|
|
296
|
-
self.data = data
|
|
297
|
-
self.num_sample = num_sample
|
|
298
|
-
self.weight_column = weight_column
|
|
299
|
-
self.sampling_mode = sampling_mode
|
|
300
|
-
self.distance = distance
|
|
301
|
-
self.input_columns = input_columns
|
|
302
|
-
self.as_categories = as_categories
|
|
303
|
-
self.category_weights = category_weights
|
|
304
|
-
self.categorical_distance = categorical_distance
|
|
305
|
-
self.seed = seed
|
|
306
|
-
self.seed_column = seed_column
|
|
307
|
-
self.over_sampling_rate = over_sampling_rate
|
|
308
|
-
self.iteration_num = iteration_num
|
|
309
|
-
self.setid_as_first_column = setid_as_first_column
|
|
310
|
-
self.data_sequence_column = data_sequence_column
|
|
311
|
-
|
|
312
|
-
# Create AnalyticsWrapperUtils instance which contains validation functions.
|
|
313
|
-
self.__awu = AnalyticsWrapperUtils()
|
|
314
|
-
self.__aed_utils = AedUtils()
|
|
315
|
-
|
|
316
|
-
# Create argument information matrix to do parameter checking
|
|
317
|
-
self.__arg_info_matrix = []
|
|
318
|
-
self.__arg_info_matrix.append(["data", self.data, False, (DataFrame)])
|
|
319
|
-
self.__arg_info_matrix.append(["num_sample", self.num_sample, False, (int,list)])
|
|
320
|
-
self.__arg_info_matrix.append(["weight_column", self.weight_column, True, (str)])
|
|
321
|
-
self.__arg_info_matrix.append(["sampling_mode", self.sampling_mode, True, (str)])
|
|
322
|
-
self.__arg_info_matrix.append(["distance", self.distance, True, (str)])
|
|
323
|
-
self.__arg_info_matrix.append(["input_columns", self.input_columns, True, (str,list)])
|
|
324
|
-
self.__arg_info_matrix.append(["as_categories", self.as_categories, True, (str,list)])
|
|
325
|
-
self.__arg_info_matrix.append(["category_weights", self.category_weights, True, (float,list)])
|
|
326
|
-
self.__arg_info_matrix.append(["categorical_distance", self.categorical_distance, True, (str)])
|
|
327
|
-
self.__arg_info_matrix.append(["seed", self.seed, True, (int)])
|
|
328
|
-
self.__arg_info_matrix.append(["seed_column", self.seed_column, True, (str,list)])
|
|
329
|
-
self.__arg_info_matrix.append(["over_sampling_rate", self.over_sampling_rate, True, (float)])
|
|
330
|
-
self.__arg_info_matrix.append(["iteration_num", self.iteration_num, True, (int)])
|
|
331
|
-
self.__arg_info_matrix.append(["setid_as_first_column", self.setid_as_first_column, True, (bool)])
|
|
332
|
-
self.__arg_info_matrix.append(["data_sequence_column", self.data_sequence_column, True, (str,list)])
|
|
333
|
-
|
|
334
|
-
if inspect.stack()[1][3] != '_from_model_catalog':
|
|
335
|
-
# Perform the function validations
|
|
336
|
-
self.__validate()
|
|
337
|
-
# Generate the ML query
|
|
338
|
-
self.__form_tdml_query()
|
|
339
|
-
# Execute ML query
|
|
340
|
-
self.__execute()
|
|
341
|
-
# Get the prediction type
|
|
342
|
-
self._prediction_type = self.__awu._get_function_prediction_type(self)
|
|
343
|
-
|
|
344
|
-
# End the timer to get the build time
|
|
345
|
-
_end_time = time.time()
|
|
346
|
-
|
|
347
|
-
# Calculate the build time
|
|
348
|
-
self._build_time = (int)(_end_time - _start_time)
|
|
349
|
-
|
|
350
|
-
def __validate(self):
|
|
351
|
-
"""
|
|
352
|
-
Function to validate sqlmr function arguments, which verifies missing
|
|
353
|
-
arguments, input argument and table types. Also processes the
|
|
354
|
-
argument values.
|
|
355
|
-
"""
|
|
356
|
-
|
|
357
|
-
# Make sure that a non-NULL value has been supplied for all mandatory arguments
|
|
358
|
-
self.__awu._validate_missing_required_arguments(self.__arg_info_matrix)
|
|
359
|
-
|
|
360
|
-
# Make sure that each supplied argument value is of the correct type
|
|
361
|
-
self.__awu._validate_argument_types(self.__arg_info_matrix)
|
|
362
|
-
|
|
363
|
-
# Check to make sure input table types are strings or data frame objects or of valid type.
|
|
364
|
-
self.__awu._validate_input_table_datatype(self.data, "data", None)
|
|
365
|
-
|
|
366
|
-
# Check for permitted values
|
|
367
|
-
sampling_mode_permitted_values = ["BASIC", "KMEANS++", "KMEANS||"]
|
|
368
|
-
self.__awu._validate_permitted_values(self.sampling_mode, sampling_mode_permitted_values, "sampling_mode")
|
|
369
|
-
|
|
370
|
-
distance_permitted_values = ["MANHATTAN", "EUCLIDEAN"]
|
|
371
|
-
self.__awu._validate_permitted_values(self.distance, distance_permitted_values, "distance")
|
|
372
|
-
|
|
373
|
-
categorical_distance_permitted_values = ["OVERLAP", "HAMMING"]
|
|
374
|
-
self.__awu._validate_permitted_values(self.categorical_distance, categorical_distance_permitted_values, "categorical_distance")
|
|
375
|
-
|
|
376
|
-
# Check whether the input columns passed to the argument are not empty.
|
|
377
|
-
# Also check whether the input columns passed to the argument are valid or not.
|
|
378
|
-
self.__awu._validate_input_columns_not_empty(self.weight_column, "weight_column")
|
|
379
|
-
self.__awu._validate_dataframe_has_argument_columns(self.weight_column, "weight_column", self.data, "data", False)
|
|
380
|
-
|
|
381
|
-
self.__awu._validate_input_columns_not_empty(self.input_columns, "input_columns")
|
|
382
|
-
self.__awu._validate_dataframe_has_argument_columns(self.input_columns, "input_columns", self.data, "data", False)
|
|
383
|
-
|
|
384
|
-
self.__awu._validate_input_columns_not_empty(self.as_categories, "as_categories")
|
|
385
|
-
self.__awu._validate_dataframe_has_argument_columns(self.as_categories, "as_categories", self.data, "data", False)
|
|
386
|
-
|
|
387
|
-
self.__awu._validate_input_columns_not_empty(self.seed_column, "seed_column")
|
|
388
|
-
self.__awu._validate_dataframe_has_argument_columns(self.seed_column, "seed_column", self.data, "data", False)
|
|
389
|
-
|
|
390
|
-
self.__awu._validate_input_columns_not_empty(self.data_sequence_column, "data_sequence_column")
|
|
391
|
-
self.__awu._validate_dataframe_has_argument_columns(self.data_sequence_column, "data_sequence_column", self.data, "data", False)
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
def __form_tdml_query(self):
|
|
395
|
-
"""
|
|
396
|
-
Function to generate the analytical function queries. The function defines
|
|
397
|
-
variables and list of arguments required to form the query.
|
|
398
|
-
"""
|
|
399
|
-
|
|
400
|
-
# Output table arguments list
|
|
401
|
-
self.__func_output_args_sql_names = []
|
|
402
|
-
self.__func_output_args = []
|
|
403
|
-
|
|
404
|
-
# Model Cataloging related attributes.
|
|
405
|
-
self._sql_specific_attributes = {}
|
|
406
|
-
self._sql_formula_attribute_mapper = {}
|
|
407
|
-
self._target_column = None
|
|
408
|
-
self._algorithm_name = None
|
|
409
|
-
|
|
410
|
-
# Generate lists for rest of the function arguments
|
|
411
|
-
self.__func_other_arg_sql_names = []
|
|
412
|
-
self.__func_other_args = []
|
|
413
|
-
self.__func_other_arg_json_datatypes = []
|
|
414
|
-
|
|
415
|
-
if self.weight_column is not None:
|
|
416
|
-
self.__func_other_arg_sql_names.append("WeightColumn")
|
|
417
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(UtilFuncs._teradata_quote_arg(self.weight_column, "\""), "'"))
|
|
418
|
-
self.__func_other_arg_json_datatypes.append("COLUMN_NAMES")
|
|
419
|
-
|
|
420
|
-
if self.input_columns is not None:
|
|
421
|
-
self.__func_other_arg_sql_names.append("InputColumns")
|
|
422
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(UtilFuncs._teradata_quote_arg(self.input_columns, "\""), "'"))
|
|
423
|
-
self.__func_other_arg_json_datatypes.append("COLUMN_NAMES")
|
|
424
|
-
|
|
425
|
-
if self.as_categories is not None:
|
|
426
|
-
self.__func_other_arg_sql_names.append("AsCategories")
|
|
427
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(UtilFuncs._teradata_quote_arg(self.as_categories, "\""), "'"))
|
|
428
|
-
self.__func_other_arg_json_datatypes.append("COLUMN_NAMES")
|
|
429
|
-
|
|
430
|
-
if self.seed_column is not None:
|
|
431
|
-
self.__func_other_arg_sql_names.append("SeedColumn")
|
|
432
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(UtilFuncs._teradata_quote_arg(self.seed_column, "\""), "'"))
|
|
433
|
-
self.__func_other_arg_json_datatypes.append("COLUMN_NAMES")
|
|
434
|
-
|
|
435
|
-
self.__func_other_arg_sql_names.append("NumSample")
|
|
436
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(self.num_sample, "'"))
|
|
437
|
-
self.__func_other_arg_json_datatypes.append("INTEGER")
|
|
438
|
-
|
|
439
|
-
if self.sampling_mode is not None and self.sampling_mode != "Basic":
|
|
440
|
-
self.__func_other_arg_sql_names.append("SamplingMode")
|
|
441
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(self.sampling_mode, "'"))
|
|
442
|
-
self.__func_other_arg_json_datatypes.append("STRING")
|
|
443
|
-
|
|
444
|
-
if self.distance is not None and self.distance != "EUCLIDEAN":
|
|
445
|
-
self.__func_other_arg_sql_names.append("Distance")
|
|
446
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(self.distance, "'"))
|
|
447
|
-
self.__func_other_arg_json_datatypes.append("STRING")
|
|
448
|
-
|
|
449
|
-
if self.categorical_distance is not None and self.categorical_distance != "OVERLAP":
|
|
450
|
-
self.__func_other_arg_sql_names.append("CategoricalDistance")
|
|
451
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(self.categorical_distance, "'"))
|
|
452
|
-
self.__func_other_arg_json_datatypes.append("STRING")
|
|
453
|
-
|
|
454
|
-
if self.category_weights is not None:
|
|
455
|
-
self.__func_other_arg_sql_names.append("CategoryWeights")
|
|
456
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(self.category_weights, "'"))
|
|
457
|
-
self.__func_other_arg_json_datatypes.append("DOUBLE")
|
|
458
|
-
|
|
459
|
-
if self.seed is not None:
|
|
460
|
-
self.__func_other_arg_sql_names.append("Seed")
|
|
461
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(self.seed, "'"))
|
|
462
|
-
self.__func_other_arg_json_datatypes.append("LONG")
|
|
463
|
-
|
|
464
|
-
if self.over_sampling_rate is not None and self.over_sampling_rate != 1.0:
|
|
465
|
-
self.__func_other_arg_sql_names.append("OverSamplingRate")
|
|
466
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(self.over_sampling_rate, "'"))
|
|
467
|
-
self.__func_other_arg_json_datatypes.append("DOUBLE")
|
|
468
|
-
|
|
469
|
-
if self.iteration_num is not None and self.iteration_num != 5:
|
|
470
|
-
self.__func_other_arg_sql_names.append("IterationNum")
|
|
471
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(self.iteration_num, "'"))
|
|
472
|
-
self.__func_other_arg_json_datatypes.append("INTEGER")
|
|
473
|
-
|
|
474
|
-
if self.setid_as_first_column is not None and self.setid_as_first_column != True:
|
|
475
|
-
self.__func_other_arg_sql_names.append("SetIdAsFirstColumn")
|
|
476
|
-
self.__func_other_args.append(UtilFuncs._teradata_collapse_arglist(self.setid_as_first_column, "'"))
|
|
477
|
-
self.__func_other_arg_json_datatypes.append("BOOLEAN")
|
|
478
|
-
|
|
479
|
-
# Generate the SequenceInputBy argument from the data sequence column(s)
|
|
480
|
-
sequence_input_by_list = []
|
|
481
|
-
if self.data_sequence_column is not None:
|
|
482
|
-
sequence_input_by_list.append("InputTable:" + UtilFuncs._teradata_collapse_arglist(self.data_sequence_column, ""))
|
|
483
|
-
|
|
484
|
-
if len(sequence_input_by_list) > 0:
|
|
485
|
-
self.__func_other_arg_sql_names.append("SequenceInputBy")
|
|
486
|
-
sequence_input_by_arg_value = UtilFuncs._teradata_collapse_arglist(sequence_input_by_list, "'")
|
|
487
|
-
self.__func_other_args.append(sequence_input_by_arg_value)
|
|
488
|
-
self.__func_other_arg_json_datatypes.append("STRING")
|
|
489
|
-
self._sql_specific_attributes["SequenceInputBy"] = sequence_input_by_arg_value
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
# Declare empty lists to hold input table information.
|
|
493
|
-
self.__func_input_arg_sql_names = []
|
|
494
|
-
self.__func_input_table_view_query = []
|
|
495
|
-
self.__func_input_dataframe_type = []
|
|
496
|
-
self.__func_input_distribution = []
|
|
497
|
-
self.__func_input_partition_by_cols = []
|
|
498
|
-
self.__func_input_order_by_cols = []
|
|
499
|
-
|
|
500
|
-
# Process data
|
|
501
|
-
self.__table_ref = self.__awu._teradata_on_clause_from_dataframe(self.data, False)
|
|
502
|
-
self.__func_input_distribution.append("NONE")
|
|
503
|
-
self.__func_input_arg_sql_names.append("InputTable")
|
|
504
|
-
self.__func_input_table_view_query.append(self.__table_ref["ref"])
|
|
505
|
-
self.__func_input_dataframe_type.append(self.__table_ref["ref_type"])
|
|
506
|
-
self.__func_input_partition_by_cols.append("NA_character_")
|
|
507
|
-
self.__func_input_order_by_cols.append("NA_character_")
|
|
508
|
-
|
|
509
|
-
function_name = "RandomSample"
|
|
510
|
-
# Create instance to generate SQLMR.
|
|
511
|
-
self.__aqg_obj = AnalyticQueryGenerator(function_name,
|
|
512
|
-
self.__func_input_arg_sql_names,
|
|
513
|
-
self.__func_input_table_view_query,
|
|
514
|
-
self.__func_input_dataframe_type,
|
|
515
|
-
self.__func_input_distribution,
|
|
516
|
-
self.__func_input_partition_by_cols,
|
|
517
|
-
self.__func_input_order_by_cols,
|
|
518
|
-
self.__func_other_arg_sql_names,
|
|
519
|
-
self.__func_other_args,
|
|
520
|
-
self.__func_other_arg_json_datatypes,
|
|
521
|
-
self.__func_output_args_sql_names,
|
|
522
|
-
self.__func_output_args,
|
|
523
|
-
engine="ENGINE_ML")
|
|
524
|
-
# Invoke call to SQL-MR generation.
|
|
525
|
-
self.sqlmr_query = self.__aqg_obj._gen_sqlmr_select_stmt_sql()
|
|
526
|
-
|
|
527
|
-
# Print SQL-MR query if requested to do so.
|
|
528
|
-
if display.print_sqlmr_query:
|
|
529
|
-
print(self.sqlmr_query)
|
|
530
|
-
|
|
531
|
-
# Set the algorithm name for Model Cataloging.
|
|
532
|
-
self._algorithm_name = self.__aqg_obj._get_alias_name_for_function(function_name)
|
|
533
|
-
|
|
534
|
-
def __execute(self):
|
|
535
|
-
"""
|
|
536
|
-
Function to execute SQL-MR queries.
|
|
537
|
-
Create DataFrames for the required SQL-MR outputs.
|
|
538
|
-
"""
|
|
539
|
-
# Generate STDOUT table name and add it to the output table list.
|
|
540
|
-
sqlmr_stdout_temp_tablename = UtilFuncs._generate_temp_table_name(prefix="td_sqlmr_out_", use_default_database=True, gc_on_quit=True, quote=False, table_type = TeradataConstants.TERADATA_TABLE)
|
|
541
|
-
try:
|
|
542
|
-
# Generate the output.
|
|
543
|
-
UtilFuncs._create_table(sqlmr_stdout_temp_tablename, self.sqlmr_query)
|
|
544
|
-
except Exception as emsg:
|
|
545
|
-
raise TeradataMlException(Messages.get_message(MessageCodes.TDMLDF_EXEC_SQL_FAILED, str(emsg)), MessageCodes.TDMLDF_EXEC_SQL_FAILED)
|
|
546
|
-
|
|
547
|
-
# Update output table data frames.
|
|
548
|
-
self._mlresults = []
|
|
549
|
-
self.result = self.__awu._create_data_set_object(df_input=UtilFuncs._extract_table_name(sqlmr_stdout_temp_tablename), source_type="table", database_name=UtilFuncs._extract_db_name(sqlmr_stdout_temp_tablename))
|
|
550
|
-
self._mlresults.append(self.result)
|
|
551
|
-
|
|
552
|
-
def show_query(self):
|
|
553
|
-
"""
|
|
554
|
-
Function to return the underlying SQL query.
|
|
555
|
-
When model object is created using retrieve_model(), then None is returned.
|
|
556
|
-
"""
|
|
557
|
-
return self.sqlmr_query
|
|
558
|
-
|
|
559
|
-
def get_prediction_type(self):
|
|
560
|
-
"""
|
|
561
|
-
Function to return the Prediction type of the algorithm.
|
|
562
|
-
When model object is created using retrieve_model(), then the value returned is
|
|
563
|
-
as saved in the Model Catalog.
|
|
564
|
-
"""
|
|
565
|
-
return self._prediction_type
|
|
566
|
-
|
|
567
|
-
def get_target_column(self):
|
|
568
|
-
"""
|
|
569
|
-
Function to return the Target Column of the algorithm.
|
|
570
|
-
When model object is created using retrieve_model(), then the value returned is
|
|
571
|
-
as saved in the Model Catalog.
|
|
572
|
-
"""
|
|
573
|
-
return self._target_column
|
|
574
|
-
|
|
575
|
-
def get_build_time(self):
|
|
576
|
-
"""
|
|
577
|
-
Function to return the build time of the algorithm in seconds.
|
|
578
|
-
When model object is created using retrieve_model(), then the value returned is
|
|
579
|
-
as saved in the Model Catalog.
|
|
580
|
-
"""
|
|
581
|
-
return self._build_time
|
|
582
|
-
|
|
583
|
-
def _get_algorithm_name(self):
|
|
584
|
-
"""
|
|
585
|
-
Function to return the name of the algorithm.
|
|
586
|
-
"""
|
|
587
|
-
return self._algorithm_name
|
|
588
|
-
|
|
589
|
-
def _get_sql_specific_attributes(self):
|
|
590
|
-
"""
|
|
591
|
-
Function to return the dictionary containing the SQL specific attributes of the algorithm.
|
|
592
|
-
"""
|
|
593
|
-
return self._sql_specific_attributes
|
|
594
|
-
|
|
595
|
-
@classmethod
|
|
596
|
-
def _from_model_catalog(cls,
|
|
597
|
-
result = None,
|
|
598
|
-
**kwargs):
|
|
599
|
-
"""
|
|
600
|
-
Classmethod is used by Model Cataloging, to instantiate this wrapper class.
|
|
601
|
-
"""
|
|
602
|
-
kwargs.pop("result", None)
|
|
603
|
-
|
|
604
|
-
# Model Cataloging related attributes.
|
|
605
|
-
target_column = kwargs.pop("__target_column", None)
|
|
606
|
-
prediction_type = kwargs.pop("__prediction_type", None)
|
|
607
|
-
algorithm_name = kwargs.pop("__algorithm_name", None)
|
|
608
|
-
build_time = kwargs.pop("__build_time", None)
|
|
609
|
-
|
|
610
|
-
# Let's create an object of this class.
|
|
611
|
-
obj = cls(**kwargs)
|
|
612
|
-
obj.result = result
|
|
613
|
-
|
|
614
|
-
# Initialize the sqlmr_query class attribute.
|
|
615
|
-
obj.sqlmr_query = None
|
|
616
|
-
|
|
617
|
-
# Initialize the SQL specific Model Cataloging attributes.
|
|
618
|
-
obj._sql_specific_attributes = None
|
|
619
|
-
obj._target_column = target_column
|
|
620
|
-
obj._prediction_type = prediction_type
|
|
621
|
-
obj._algorithm_name = algorithm_name
|
|
622
|
-
obj._build_time = build_time
|
|
623
|
-
|
|
624
|
-
# Update output table data frames.
|
|
625
|
-
obj._mlresults = []
|
|
626
|
-
obj.result = obj.__awu._create_data_set_object(df_input=UtilFuncs._extract_table_name(obj.result), source_type="table", database_name=UtilFuncs._extract_db_name(obj.result))
|
|
627
|
-
obj._mlresults.append(obj.result)
|
|
628
|
-
return obj
|
|
629
|
-
|
|
630
|
-
def __repr__(self):
|
|
631
|
-
"""
|
|
632
|
-
Returns the string representation for a RandomSample class instance.
|
|
633
|
-
"""
|
|
634
|
-
repr_string="############ STDOUT Output ############"
|
|
635
|
-
repr_string = "{}\n\n{}".format(repr_string,self.result)
|
|
636
|
-
return repr_string
|
|
637
|
-
|