teradataml 17.20.0.6__py3-none-any.whl → 20.0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (432)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +238 -1
  4. teradataml/__init__.py +13 -3
  5. teradataml/_version.py +1 -1
  6. teradataml/analytics/Transformations.py +4 -4
  7. teradataml/analytics/__init__.py +0 -2
  8. teradataml/analytics/analytic_function_executor.py +3 -0
  9. teradataml/analytics/json_parser/utils.py +13 -12
  10. teradataml/analytics/sqle/DecisionTreePredict.py +15 -30
  11. teradataml/analytics/sqle/NaiveBayesPredict.py +11 -20
  12. teradataml/analytics/sqle/__init__.py +0 -13
  13. teradataml/analytics/utils.py +1 -0
  14. teradataml/analytics/valib.py +3 -0
  15. teradataml/automl/__init__.py +1628 -0
  16. teradataml/automl/custom_json_utils.py +1270 -0
  17. teradataml/automl/data_preparation.py +993 -0
  18. teradataml/automl/data_transformation.py +727 -0
  19. teradataml/automl/feature_engineering.py +1648 -0
  20. teradataml/automl/feature_exploration.py +547 -0
  21. teradataml/automl/model_evaluation.py +163 -0
  22. teradataml/automl/model_training.py +887 -0
  23. teradataml/catalog/__init__.py +0 -2
  24. teradataml/catalog/byom.py +49 -6
  25. teradataml/catalog/function_argument_mapper.py +0 -2
  26. teradataml/catalog/model_cataloging_utils.py +2 -1021
  27. teradataml/common/aed_utils.py +6 -2
  28. teradataml/common/constants.py +50 -58
  29. teradataml/common/deprecations.py +160 -0
  30. teradataml/common/garbagecollector.py +61 -104
  31. teradataml/common/messagecodes.py +27 -36
  32. teradataml/common/messages.py +11 -15
  33. teradataml/common/utils.py +205 -287
  34. teradataml/common/wrapper_utils.py +1 -110
  35. teradataml/context/context.py +150 -78
  36. teradataml/data/bank_churn.csv +10001 -0
  37. teradataml/data/bmi.csv +501 -0
  38. teradataml/data/docs/sqle/docs_17_10/BincodeFit.py +3 -3
  39. teradataml/data/docs/sqle/docs_17_10/BincodeTransform.py +6 -5
  40. teradataml/data/docs/sqle/docs_17_10/Fit.py +1 -1
  41. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +1 -1
  42. teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +1 -1
  43. teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesTransform.py +2 -2
  44. teradataml/data/docs/sqle/docs_17_10/RowNormalizeTransform.py +2 -1
  45. teradataml/data/docs/sqle/docs_17_10/ScaleTransform.py +1 -0
  46. teradataml/data/docs/sqle/docs_17_10/SimpleImputeTransform.py +1 -1
  47. teradataml/data/docs/sqle/docs_17_10/Transform.py +2 -1
  48. teradataml/data/docs/sqle/docs_17_20/BincodeFit.py +3 -3
  49. teradataml/data/docs/sqle/docs_17_20/BincodeTransform.py +6 -5
  50. teradataml/data/docs/sqle/docs_17_20/Fit.py +1 -1
  51. teradataml/data/docs/sqle/docs_17_20/GLM.py +1 -1
  52. teradataml/data/docs/sqle/docs_17_20/GLMPredictPerSegment.py +9 -10
  53. teradataml/data/docs/sqle/docs_17_20/KMeansPredict.py +3 -2
  54. teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierPredict.py +16 -15
  55. teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierTrainer.py +2 -2
  56. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +2 -2
  57. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineTransform.py +8 -8
  58. teradataml/data/docs/sqle/docs_17_20/OneClassSVMPredict.py +21 -20
  59. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +1 -1
  60. teradataml/data/docs/sqle/docs_17_20/OutlierFilterTransform.py +8 -3
  61. teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesTransform.py +6 -5
  62. teradataml/data/docs/sqle/docs_17_20/RandomProjectionTransform.py +6 -6
  63. teradataml/data/docs/sqle/docs_17_20/RowNormalizeTransform.py +2 -1
  64. teradataml/data/docs/sqle/docs_17_20/SVM.py +1 -1
  65. teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +16 -16
  66. teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +1 -0
  67. teradataml/data/docs/sqle/docs_17_20/SimpleImputeTransform.py +3 -2
  68. teradataml/data/docs/sqle/docs_17_20/TDDecisionForestPredict.py +4 -4
  69. teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +19 -19
  70. teradataml/data/docs/sqle/docs_17_20/TargetEncodingTransform.py +5 -4
  71. teradataml/data/docs/sqle/docs_17_20/Transform.py +2 -2
  72. teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +9 -9
  73. teradataml/data/fish.csv +160 -0
  74. teradataml/data/glass_types.csv +215 -0
  75. teradataml/data/insurance.csv +1 -1
  76. teradataml/data/iris_data.csv +151 -0
  77. teradataml/data/jsons/sqle/17.10/TD_FunctionTransform.json +1 -0
  78. teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingTransform.json +1 -0
  79. teradataml/data/jsons/sqle/17.10/TD_OutlierFilterTransform.json +1 -0
  80. teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesTransform.json +1 -0
  81. teradataml/data/jsons/sqle/17.10/TD_RowNormalizeTransform.json +1 -0
  82. teradataml/data/jsons/sqle/17.10/TD_ScaleTransform.json +1 -0
  83. teradataml/data/jsons/sqle/17.10/TD_SimpleImputeTransform.json +1 -0
  84. teradataml/data/load_example_data.py +3 -0
  85. teradataml/data/multi_model_classification.csv +401 -0
  86. teradataml/data/multi_model_regression.csv +401 -0
  87. teradataml/data/openml_example.json +63 -0
  88. teradataml/data/scripts/deploy_script.py +65 -0
  89. teradataml/data/scripts/mapper.R +20 -0
  90. teradataml/data/scripts/sklearn/__init__.py +0 -0
  91. teradataml/data/scripts/sklearn/sklearn_fit.py +175 -0
  92. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +135 -0
  93. teradataml/data/scripts/sklearn/sklearn_function.template +113 -0
  94. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +158 -0
  95. teradataml/data/scripts/sklearn/sklearn_neighbors.py +152 -0
  96. teradataml/data/scripts/sklearn/sklearn_score.py +128 -0
  97. teradataml/data/scripts/sklearn/sklearn_transform.py +179 -0
  98. teradataml/data/templates/open_source_ml.json +9 -0
  99. teradataml/data/teradataml_example.json +73 -1
  100. teradataml/data/test_classification.csv +101 -0
  101. teradataml/data/test_prediction.csv +101 -0
  102. teradataml/data/test_regression.csv +101 -0
  103. teradataml/data/train_multiclass.csv +101 -0
  104. teradataml/data/train_regression.csv +101 -0
  105. teradataml/data/train_regression_multiple_labels.csv +101 -0
  106. teradataml/data/wine_data.csv +1600 -0
  107. teradataml/dataframe/copy_to.py +79 -13
  108. teradataml/dataframe/data_transfer.py +8 -0
  109. teradataml/dataframe/dataframe.py +910 -311
  110. teradataml/dataframe/dataframe_utils.py +102 -5
  111. teradataml/dataframe/fastload.py +11 -3
  112. teradataml/dataframe/setop.py +15 -2
  113. teradataml/dataframe/sql.py +3735 -77
  114. teradataml/dataframe/sql_function_parameters.py +56 -5
  115. teradataml/dataframe/vantage_function_types.py +45 -1
  116. teradataml/dataframe/window.py +30 -29
  117. teradataml/dbutils/dbutils.py +18 -1
  118. teradataml/geospatial/geodataframe.py +18 -7
  119. teradataml/geospatial/geodataframecolumn.py +5 -0
  120. teradataml/hyperparameter_tuner/optimizer.py +910 -120
  121. teradataml/hyperparameter_tuner/utils.py +131 -37
  122. teradataml/lib/aed_0_1.dll +0 -0
  123. teradataml/lib/libaed_0_1.dylib +0 -0
  124. teradataml/lib/libaed_0_1.so +0 -0
  125. teradataml/libaed_0_1.dylib +0 -0
  126. teradataml/libaed_0_1.so +0 -0
  127. teradataml/opensource/__init__.py +1 -0
  128. teradataml/opensource/sklearn/__init__.py +1 -0
  129. teradataml/opensource/sklearn/_class.py +255 -0
  130. teradataml/opensource/sklearn/_sklearn_wrapper.py +1668 -0
  131. teradataml/opensource/sklearn/_wrapper_utils.py +268 -0
  132. teradataml/opensource/sklearn/constants.py +54 -0
  133. teradataml/options/__init__.py +3 -6
  134. teradataml/options/configure.py +21 -20
  135. teradataml/scriptmgmt/UserEnv.py +61 -5
  136. teradataml/scriptmgmt/lls_utils.py +135 -53
  137. teradataml/table_operators/Apply.py +38 -6
  138. teradataml/table_operators/Script.py +45 -308
  139. teradataml/table_operators/TableOperator.py +182 -591
  140. teradataml/table_operators/__init__.py +0 -1
  141. teradataml/table_operators/table_operator_util.py +32 -40
  142. teradataml/utils/validators.py +127 -3
  143. {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/METADATA +243 -3
  144. {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/RECORD +147 -391
  145. teradataml/analytics/mle/AdaBoost.py +0 -651
  146. teradataml/analytics/mle/AdaBoostPredict.py +0 -564
  147. teradataml/analytics/mle/Antiselect.py +0 -342
  148. teradataml/analytics/mle/Arima.py +0 -641
  149. teradataml/analytics/mle/ArimaPredict.py +0 -477
  150. teradataml/analytics/mle/Attribution.py +0 -1070
  151. teradataml/analytics/mle/Betweenness.py +0 -658
  152. teradataml/analytics/mle/Burst.py +0 -711
  153. teradataml/analytics/mle/CCM.py +0 -600
  154. teradataml/analytics/mle/CCMPrepare.py +0 -324
  155. teradataml/analytics/mle/CFilter.py +0 -460
  156. teradataml/analytics/mle/ChangePointDetection.py +0 -572
  157. teradataml/analytics/mle/ChangePointDetectionRT.py +0 -477
  158. teradataml/analytics/mle/Closeness.py +0 -737
  159. teradataml/analytics/mle/ConfusionMatrix.py +0 -420
  160. teradataml/analytics/mle/Correlation.py +0 -477
  161. teradataml/analytics/mle/Correlation2.py +0 -573
  162. teradataml/analytics/mle/CoxHazardRatio.py +0 -679
  163. teradataml/analytics/mle/CoxPH.py +0 -556
  164. teradataml/analytics/mle/CoxSurvival.py +0 -478
  165. teradataml/analytics/mle/CumulativeMovAvg.py +0 -363
  166. teradataml/analytics/mle/DTW.py +0 -623
  167. teradataml/analytics/mle/DWT.py +0 -564
  168. teradataml/analytics/mle/DWT2D.py +0 -599
  169. teradataml/analytics/mle/DecisionForest.py +0 -716
  170. teradataml/analytics/mle/DecisionForestEvaluator.py +0 -363
  171. teradataml/analytics/mle/DecisionForestPredict.py +0 -561
  172. teradataml/analytics/mle/DecisionTree.py +0 -830
  173. teradataml/analytics/mle/DecisionTreePredict.py +0 -528
  174. teradataml/analytics/mle/ExponentialMovAvg.py +0 -418
  175. teradataml/analytics/mle/FMeasure.py +0 -402
  176. teradataml/analytics/mle/FPGrowth.py +0 -734
  177. teradataml/analytics/mle/FrequentPaths.py +0 -695
  178. teradataml/analytics/mle/GLM.py +0 -558
  179. teradataml/analytics/mle/GLML1L2.py +0 -547
  180. teradataml/analytics/mle/GLML1L2Predict.py +0 -519
  181. teradataml/analytics/mle/GLMPredict.py +0 -529
  182. teradataml/analytics/mle/HMMDecoder.py +0 -945
  183. teradataml/analytics/mle/HMMEvaluator.py +0 -901
  184. teradataml/analytics/mle/HMMSupervised.py +0 -521
  185. teradataml/analytics/mle/HMMUnsupervised.py +0 -572
  186. teradataml/analytics/mle/Histogram.py +0 -561
  187. teradataml/analytics/mle/IDWT.py +0 -476
  188. teradataml/analytics/mle/IDWT2D.py +0 -493
  189. teradataml/analytics/mle/IdentityMatch.py +0 -763
  190. teradataml/analytics/mle/Interpolator.py +0 -918
  191. teradataml/analytics/mle/KMeans.py +0 -485
  192. teradataml/analytics/mle/KNN.py +0 -627
  193. teradataml/analytics/mle/KNNRecommender.py +0 -488
  194. teradataml/analytics/mle/KNNRecommenderPredict.py +0 -581
  195. teradataml/analytics/mle/LAR.py +0 -439
  196. teradataml/analytics/mle/LARPredict.py +0 -478
  197. teradataml/analytics/mle/LDA.py +0 -548
  198. teradataml/analytics/mle/LDAInference.py +0 -492
  199. teradataml/analytics/mle/LDATopicSummary.py +0 -464
  200. teradataml/analytics/mle/LevenshteinDistance.py +0 -450
  201. teradataml/analytics/mle/LinReg.py +0 -433
  202. teradataml/analytics/mle/LinRegPredict.py +0 -438
  203. teradataml/analytics/mle/MinHash.py +0 -544
  204. teradataml/analytics/mle/Modularity.py +0 -587
  205. teradataml/analytics/mle/NEREvaluator.py +0 -410
  206. teradataml/analytics/mle/NERExtractor.py +0 -595
  207. teradataml/analytics/mle/NERTrainer.py +0 -458
  208. teradataml/analytics/mle/NGrams.py +0 -570
  209. teradataml/analytics/mle/NPath.py +0 -634
  210. teradataml/analytics/mle/NTree.py +0 -549
  211. teradataml/analytics/mle/NaiveBayes.py +0 -462
  212. teradataml/analytics/mle/NaiveBayesPredict.py +0 -513
  213. teradataml/analytics/mle/NaiveBayesTextClassifier.py +0 -607
  214. teradataml/analytics/mle/NaiveBayesTextClassifier2.py +0 -531
  215. teradataml/analytics/mle/NaiveBayesTextClassifierPredict.py +0 -799
  216. teradataml/analytics/mle/NamedEntityFinder.py +0 -529
  217. teradataml/analytics/mle/NamedEntityFinderEvaluator.py +0 -414
  218. teradataml/analytics/mle/NamedEntityFinderTrainer.py +0 -396
  219. teradataml/analytics/mle/POSTagger.py +0 -417
  220. teradataml/analytics/mle/Pack.py +0 -411
  221. teradataml/analytics/mle/PageRank.py +0 -535
  222. teradataml/analytics/mle/PathAnalyzer.py +0 -426
  223. teradataml/analytics/mle/PathGenerator.py +0 -367
  224. teradataml/analytics/mle/PathStart.py +0 -464
  225. teradataml/analytics/mle/PathSummarizer.py +0 -470
  226. teradataml/analytics/mle/Pivot.py +0 -471
  227. teradataml/analytics/mle/ROC.py +0 -425
  228. teradataml/analytics/mle/RandomSample.py +0 -637
  229. teradataml/analytics/mle/RandomWalkSample.py +0 -490
  230. teradataml/analytics/mle/SAX.py +0 -779
  231. teradataml/analytics/mle/SVMDense.py +0 -677
  232. teradataml/analytics/mle/SVMDensePredict.py +0 -536
  233. teradataml/analytics/mle/SVMDenseSummary.py +0 -437
  234. teradataml/analytics/mle/SVMSparse.py +0 -557
  235. teradataml/analytics/mle/SVMSparsePredict.py +0 -553
  236. teradataml/analytics/mle/SVMSparseSummary.py +0 -435
  237. teradataml/analytics/mle/Sampling.py +0 -549
  238. teradataml/analytics/mle/Scale.py +0 -565
  239. teradataml/analytics/mle/ScaleByPartition.py +0 -496
  240. teradataml/analytics/mle/ScaleMap.py +0 -378
  241. teradataml/analytics/mle/ScaleSummary.py +0 -320
  242. teradataml/analytics/mle/SentenceExtractor.py +0 -363
  243. teradataml/analytics/mle/SentimentEvaluator.py +0 -432
  244. teradataml/analytics/mle/SentimentExtractor.py +0 -578
  245. teradataml/analytics/mle/SentimentTrainer.py +0 -405
  246. teradataml/analytics/mle/SeriesSplitter.py +0 -641
  247. teradataml/analytics/mle/Sessionize.py +0 -475
  248. teradataml/analytics/mle/SimpleMovAvg.py +0 -397
  249. teradataml/analytics/mle/StringSimilarity.py +0 -425
  250. teradataml/analytics/mle/TF.py +0 -389
  251. teradataml/analytics/mle/TFIDF.py +0 -504
  252. teradataml/analytics/mle/TextChunker.py +0 -414
  253. teradataml/analytics/mle/TextClassifier.py +0 -399
  254. teradataml/analytics/mle/TextClassifierEvaluator.py +0 -413
  255. teradataml/analytics/mle/TextClassifierTrainer.py +0 -565
  256. teradataml/analytics/mle/TextMorph.py +0 -494
  257. teradataml/analytics/mle/TextParser.py +0 -623
  258. teradataml/analytics/mle/TextTagger.py +0 -530
  259. teradataml/analytics/mle/TextTokenizer.py +0 -502
  260. teradataml/analytics/mle/UnivariateStatistics.py +0 -488
  261. teradataml/analytics/mle/Unpack.py +0 -526
  262. teradataml/analytics/mle/Unpivot.py +0 -438
  263. teradataml/analytics/mle/VarMax.py +0 -776
  264. teradataml/analytics/mle/VectorDistance.py +0 -762
  265. teradataml/analytics/mle/WeightedMovAvg.py +0 -400
  266. teradataml/analytics/mle/XGBoost.py +0 -842
  267. teradataml/analytics/mle/XGBoostPredict.py +0 -627
  268. teradataml/analytics/mle/__init__.py +0 -123
  269. teradataml/analytics/mle/json/adaboost_mle.json +0 -135
  270. teradataml/analytics/mle/json/adaboostpredict_mle.json +0 -85
  271. teradataml/analytics/mle/json/antiselect_mle.json +0 -34
  272. teradataml/analytics/mle/json/antiselect_mle_mle.json +0 -34
  273. teradataml/analytics/mle/json/arima_mle.json +0 -172
  274. teradataml/analytics/mle/json/arimapredict_mle.json +0 -52
  275. teradataml/analytics/mle/json/attribution_mle_mle.json +0 -143
  276. teradataml/analytics/mle/json/betweenness_mle.json +0 -97
  277. teradataml/analytics/mle/json/burst_mle.json +0 -140
  278. teradataml/analytics/mle/json/ccm_mle.json +0 -124
  279. teradataml/analytics/mle/json/ccmprepare_mle.json +0 -14
  280. teradataml/analytics/mle/json/cfilter_mle.json +0 -93
  281. teradataml/analytics/mle/json/changepointdetection_mle.json +0 -92
  282. teradataml/analytics/mle/json/changepointdetectionrt_mle.json +0 -78
  283. teradataml/analytics/mle/json/closeness_mle.json +0 -104
  284. teradataml/analytics/mle/json/confusionmatrix_mle.json +0 -79
  285. teradataml/analytics/mle/json/correlation_mle.json +0 -86
  286. teradataml/analytics/mle/json/correlationreduce_mle.json +0 -49
  287. teradataml/analytics/mle/json/coxhazardratio_mle.json +0 -89
  288. teradataml/analytics/mle/json/coxph_mle.json +0 -98
  289. teradataml/analytics/mle/json/coxsurvival_mle.json +0 -79
  290. teradataml/analytics/mle/json/cumulativemovavg_mle.json +0 -34
  291. teradataml/analytics/mle/json/decisionforest_mle.json +0 -167
  292. teradataml/analytics/mle/json/decisionforestevaluator_mle.json +0 -33
  293. teradataml/analytics/mle/json/decisionforestpredict_mle_mle.json +0 -74
  294. teradataml/analytics/mle/json/decisiontree_mle.json +0 -194
  295. teradataml/analytics/mle/json/decisiontreepredict_mle_mle.json +0 -86
  296. teradataml/analytics/mle/json/dtw_mle.json +0 -97
  297. teradataml/analytics/mle/json/dwt2d_mle.json +0 -116
  298. teradataml/analytics/mle/json/dwt_mle.json +0 -101
  299. teradataml/analytics/mle/json/exponentialmovavg_mle.json +0 -55
  300. teradataml/analytics/mle/json/fmeasure_mle.json +0 -58
  301. teradataml/analytics/mle/json/fpgrowth_mle.json +0 -159
  302. teradataml/analytics/mle/json/frequentpaths_mle.json +0 -129
  303. teradataml/analytics/mle/json/glm_mle.json +0 -111
  304. teradataml/analytics/mle/json/glml1l2_mle.json +0 -106
  305. teradataml/analytics/mle/json/glml1l2predict_mle.json +0 -57
  306. teradataml/analytics/mle/json/glmpredict_mle_mle.json +0 -74
  307. teradataml/analytics/mle/json/histogram_mle.json +0 -100
  308. teradataml/analytics/mle/json/hmmdecoder_mle.json +0 -192
  309. teradataml/analytics/mle/json/hmmevaluator_mle.json +0 -206
  310. teradataml/analytics/mle/json/hmmsupervised_mle.json +0 -91
  311. teradataml/analytics/mle/json/hmmunsupervised_mle.json +0 -114
  312. teradataml/analytics/mle/json/identitymatch_mle.json +0 -88
  313. teradataml/analytics/mle/json/idwt2d_mle.json +0 -73
  314. teradataml/analytics/mle/json/idwt_mle.json +0 -66
  315. teradataml/analytics/mle/json/interpolator_mle.json +0 -151
  316. teradataml/analytics/mle/json/kmeans_mle.json +0 -97
  317. teradataml/analytics/mle/json/knn_mle.json +0 -141
  318. teradataml/analytics/mle/json/knnrecommender_mle.json +0 -111
  319. teradataml/analytics/mle/json/knnrecommenderpredict_mle.json +0 -75
  320. teradataml/analytics/mle/json/lar_mle.json +0 -78
  321. teradataml/analytics/mle/json/larpredict_mle.json +0 -69
  322. teradataml/analytics/mle/json/lda_mle.json +0 -130
  323. teradataml/analytics/mle/json/ldainference_mle.json +0 -78
  324. teradataml/analytics/mle/json/ldatopicsummary_mle.json +0 -64
  325. teradataml/analytics/mle/json/levenshteindistance_mle.json +0 -92
  326. teradataml/analytics/mle/json/linreg_mle.json +0 -42
  327. teradataml/analytics/mle/json/linregpredict_mle.json +0 -56
  328. teradataml/analytics/mle/json/minhash_mle.json +0 -113
  329. teradataml/analytics/mle/json/modularity_mle.json +0 -91
  330. teradataml/analytics/mle/json/naivebayespredict_mle_mle.json +0 -85
  331. teradataml/analytics/mle/json/naivebayesreduce_mle.json +0 -52
  332. teradataml/analytics/mle/json/naivebayestextclassifierpredict_mle_mle.json +0 -147
  333. teradataml/analytics/mle/json/naivebayestextclassifiertrainer2_mle.json +0 -108
  334. teradataml/analytics/mle/json/naivebayestextclassifiertrainer_mle.json +0 -102
  335. teradataml/analytics/mle/json/namedentityfinder_mle.json +0 -84
  336. teradataml/analytics/mle/json/namedentityfinderevaluatorreduce_mle.json +0 -43
  337. teradataml/analytics/mle/json/namedentityfindertrainer_mle.json +0 -64
  338. teradataml/analytics/mle/json/nerevaluator_mle.json +0 -54
  339. teradataml/analytics/mle/json/nerextractor_mle.json +0 -87
  340. teradataml/analytics/mle/json/nertrainer_mle.json +0 -89
  341. teradataml/analytics/mle/json/ngrams_mle.json +0 -137
  342. teradataml/analytics/mle/json/ngramsplitter_mle_mle.json +0 -137
  343. teradataml/analytics/mle/json/npath@coprocessor_mle.json +0 -73
  344. teradataml/analytics/mle/json/ntree@coprocessor_mle.json +0 -123
  345. teradataml/analytics/mle/json/pack_mle.json +0 -58
  346. teradataml/analytics/mle/json/pack_mle_mle.json +0 -58
  347. teradataml/analytics/mle/json/pagerank_mle.json +0 -81
  348. teradataml/analytics/mle/json/pathanalyzer_mle.json +0 -63
  349. teradataml/analytics/mle/json/pathgenerator_mle.json +0 -40
  350. teradataml/analytics/mle/json/pathstart_mle.json +0 -62
  351. teradataml/analytics/mle/json/pathsummarizer_mle.json +0 -72
  352. teradataml/analytics/mle/json/pivoting_mle.json +0 -71
  353. teradataml/analytics/mle/json/postagger_mle.json +0 -51
  354. teradataml/analytics/mle/json/randomsample_mle.json +0 -131
  355. teradataml/analytics/mle/json/randomwalksample_mle.json +0 -85
  356. teradataml/analytics/mle/json/roc_mle.json +0 -73
  357. teradataml/analytics/mle/json/sampling_mle.json +0 -75
  358. teradataml/analytics/mle/json/sax_mle.json +0 -154
  359. teradataml/analytics/mle/json/scale_mle.json +0 -93
  360. teradataml/analytics/mle/json/scalebypartition_mle.json +0 -89
  361. teradataml/analytics/mle/json/scalemap_mle.json +0 -44
  362. teradataml/analytics/mle/json/scalesummary_mle.json +0 -14
  363. teradataml/analytics/mle/json/sentenceextractor_mle.json +0 -41
  364. teradataml/analytics/mle/json/sentimentevaluator_mle.json +0 -43
  365. teradataml/analytics/mle/json/sentimentextractor_mle.json +0 -100
  366. teradataml/analytics/mle/json/sentimenttrainer_mle.json +0 -68
  367. teradataml/analytics/mle/json/seriessplitter_mle.json +0 -133
  368. teradataml/analytics/mle/json/sessionize_mle_mle.json +0 -62
  369. teradataml/analytics/mle/json/simplemovavg_mle.json +0 -48
  370. teradataml/analytics/mle/json/stringsimilarity_mle.json +0 -50
  371. teradataml/analytics/mle/json/stringsimilarity_mle_mle.json +0 -50
  372. teradataml/analytics/mle/json/svmdense_mle.json +0 -165
  373. teradataml/analytics/mle/json/svmdensepredict_mle.json +0 -95
  374. teradataml/analytics/mle/json/svmdensesummary_mle.json +0 -58
  375. teradataml/analytics/mle/json/svmsparse_mle.json +0 -148
  376. teradataml/analytics/mle/json/svmsparsepredict_mle_mle.json +0 -103
  377. teradataml/analytics/mle/json/svmsparsesummary_mle.json +0 -57
  378. teradataml/analytics/mle/json/textchunker_mle.json +0 -40
  379. teradataml/analytics/mle/json/textclassifier_mle.json +0 -51
  380. teradataml/analytics/mle/json/textclassifierevaluator_mle.json +0 -43
  381. teradataml/analytics/mle/json/textclassifiertrainer_mle.json +0 -103
  382. teradataml/analytics/mle/json/textmorph_mle.json +0 -63
  383. teradataml/analytics/mle/json/textparser_mle.json +0 -166
  384. teradataml/analytics/mle/json/texttagger_mle.json +0 -81
  385. teradataml/analytics/mle/json/texttokenizer_mle.json +0 -91
  386. teradataml/analytics/mle/json/tf_mle.json +0 -33
  387. teradataml/analytics/mle/json/tfidf_mle.json +0 -34
  388. teradataml/analytics/mle/json/univariatestatistics_mle.json +0 -81
  389. teradataml/analytics/mle/json/unpack_mle.json +0 -91
  390. teradataml/analytics/mle/json/unpack_mle_mle.json +0 -91
  391. teradataml/analytics/mle/json/unpivoting_mle.json +0 -63
  392. teradataml/analytics/mle/json/varmax_mle.json +0 -176
  393. teradataml/analytics/mle/json/vectordistance_mle.json +0 -179
  394. teradataml/analytics/mle/json/weightedmovavg_mle.json +0 -48
  395. teradataml/analytics/mle/json/xgboost_mle.json +0 -178
  396. teradataml/analytics/mle/json/xgboostpredict_mle.json +0 -104
  397. teradataml/analytics/sqle/Antiselect.py +0 -321
  398. teradataml/analytics/sqle/Attribution.py +0 -603
  399. teradataml/analytics/sqle/DecisionForestPredict.py +0 -408
  400. teradataml/analytics/sqle/GLMPredict.py +0 -430
  401. teradataml/analytics/sqle/MovingAverage.py +0 -543
  402. teradataml/analytics/sqle/NGramSplitter.py +0 -548
  403. teradataml/analytics/sqle/NPath.py +0 -632
  404. teradataml/analytics/sqle/NaiveBayesTextClassifierPredict.py +0 -515
  405. teradataml/analytics/sqle/Pack.py +0 -388
  406. teradataml/analytics/sqle/SVMSparsePredict.py +0 -464
  407. teradataml/analytics/sqle/Sessionize.py +0 -390
  408. teradataml/analytics/sqle/StringSimilarity.py +0 -400
  409. teradataml/analytics/sqle/Unpack.py +0 -503
  410. teradataml/analytics/sqle/json/antiselect_sqle.json +0 -21
  411. teradataml/analytics/sqle/json/attribution_sqle.json +0 -92
  412. teradataml/analytics/sqle/json/decisionforestpredict_sqle.json +0 -48
  413. teradataml/analytics/sqle/json/glmpredict_sqle.json +0 -48
  414. teradataml/analytics/sqle/json/h2opredict_sqle.json +0 -63
  415. teradataml/analytics/sqle/json/movingaverage_sqle.json +0 -58
  416. teradataml/analytics/sqle/json/naivebayestextclassifierpredict_sqle.json +0 -76
  417. teradataml/analytics/sqle/json/ngramsplitter_sqle.json +0 -126
  418. teradataml/analytics/sqle/json/npath_sqle.json +0 -67
  419. teradataml/analytics/sqle/json/pack_sqle.json +0 -47
  420. teradataml/analytics/sqle/json/pmmlpredict_sqle.json +0 -55
  421. teradataml/analytics/sqle/json/sessionize_sqle.json +0 -43
  422. teradataml/analytics/sqle/json/stringsimilarity_sqle.json +0 -39
  423. teradataml/analytics/sqle/json/svmsparsepredict_sqle.json +0 -74
  424. teradataml/analytics/sqle/json/unpack_sqle.json +0 -80
  425. teradataml/catalog/model_cataloging.py +0 -980
  426. teradataml/config/mlengine_alias_definitions_v1.0 +0 -118
  427. teradataml/config/mlengine_alias_definitions_v1.1 +0 -127
  428. teradataml/config/mlengine_alias_definitions_v1.3 +0 -129
  429. teradataml/table_operators/sandbox_container_util.py +0 -643
  430. {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/WHEEL +0 -0
  431. {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/top_level.txt +0 -0
  432. {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/zip-safe +0 -0
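The hunk below is shown in full and corresponds to the new file teradataml/automl/data_preparation.py (entry 17 in the list above, +993 lines), which implements the data preparation phase of the new AutoML feature.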
@@ -0,0 +1,993 @@
+ # ##################################################################
+ #
+ # Copyright 2024 Teradata. All rights reserved.
+ # TERADATA CONFIDENTIAL AND TRADE SECRET
+ #
+ # Primary Owner: Sweta Shaw
+ # Email Id: Sweta.Shaw@Teradata.com
+ #
+ # Secondary Owner: Akhil Bisht
+ # Email Id: AKHIL.BISHT@Teradata.com
+ #
+ # Version: 1.1
+ # Function Version: 1.0
+ # ##################################################################
+
+ # Python libraries
+ import numpy as np
+ import pandas as pd
+ import random
+ import time
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ # Teradata libraries
+ from teradataml.dataframe.dataframe import DataFrame
+ from teradataml.dataframe.copy_to import copy_to_sql
+ from teradataml import OutlierFilterFit, OutlierFilterTransform
+ from teradataml import RoundColumns, TeradataMlException
+ from teradataml import ScaleFit, ScaleTransform
+ from teradataml import TrainTestSplit, UtilFuncs, TeradataConstants
+ from teradataml.common.messages import Messages, MessageCodes
+ from teradataml.utils.validators import _Validators
+
+
+ class _DataPreparation:
+
+     def __init__(self,
+                  data=None,
+                  target_column=None,
+                  verbose=0,
+                  excluded_columns=None,
+                  custom_data=None,
+                  data_transform_dict=None,
+                  task_type="Regression"):
+         """
+         DESCRIPTION:
+             Function initializes the data, target column, and column datatypes
+             for data preparation.
+
+         PARAMETERS:
+             data:
+                 Required Argument.
+                 Specifies the input teradataml DataFrame for the data preparation phase.
+                 Types: teradataml DataFrame
+
+             target_column:
+                 Required Argument.
+                 Specifies the name of the target column in "data".
+                 Types: str
+
+             verbose:
+                 Optional Argument.
+                 Specifies the level of detail printed during execution.
+                 Default Value: 0
+                 Permitted Values:
+                     * 0: prints the progress bar and leaderboard.
+                     * 1: prints the execution steps of AutoML.
+                     * 2: prints the intermediate data between the execution of each step of AutoML.
+                 Types: int
+
+             excluded_columns:
+                 Required Argument.
+                 Specifies the columns to be excluded from any processing.
+                 Types: str or list of strings (str)
+
+             custom_data:
+                 Optional Argument.
+                 Specifies the JSON object containing user-customized input.
+                 Types: json object
+
+             data_transform_dict:
+                 Optional Argument.
+                 Specifies the parameters for data transformation.
+                 Types: dict
+
+             task_type:
+                 Required Argument.
+                 Specifies the task type for AutoML, whether to apply regression or
+                 classification on the provided dataset.
+                 Default Value: "Regression"
+                 Permitted Values: "Regression", "Classification"
+                 Types: str
+         """
+         self.data = data
+         self.target_column = target_column
+         self.verbose = verbose
+         self.excluded_columns = excluded_columns
+         self.data_transform_dict = data_transform_dict
+         self.custom_data = custom_data
+         self.task_type = task_type
+
+         # Setting default values for auto run mode
+         self._train_size = 0.80
+         self._data_sampling_method = "SMOTE"
+         self._scale_method_reg = "STD"
+         self._scale_method_cls = "RANGE"
+         self.table_name_mapping = {}
+
+         random.seed(42)
+         np.random.seed(42)
+         self.data_types = {key: value for key, value in self.data._column_names_and_types}
+
+
+     def data_preparation(self,
+                          auto=True):
+         """
+         DESCRIPTION:
+             Function to perform the following tasks:
+                 1. Splits the given data into training and testing datasets.
+                 2. Performs outlier processing on the training dataset and transformation on the testing dataset.
+                 3. Performs feature selection using RFE, PCA, and Lasso.
+                 4. Performs feature scaling.
+
+         PARAMETERS:
+             auto:
+                 Optional Argument.
+                 Specifies whether to run AutoML in custom mode or auto mode.
+                 When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
+                 Default Value: True
+                 Types: bool
+
+         RETURNS:
+             Tuple containing a list of the feature lists selected by RFE, Lasso, and PCA,
+             and the data transformation dictionary.
+         """
+         self._display_heading(phase=2,
+                               progress_bar=self.progress_bar)
+         self._display_msg(msg='Data preparation started ...',
+                           progress_bar=self.progress_bar)
+         # Setting user values in case of custom running mode
+         if not auto:
+             self._set_custom_train_test_split()
+             self._set_custom_scaling_method()
+             self._set_custom_sampling()
+
+         # Performing train test split
+         self._train_test_split()
+         self.progress_bar.update()
+
+         # Handling outliers in dataset
+         self._handle_outliers(auto)
+         self.progress_bar.update()
+
+         # Handling float type features before proceeding with feature selection and scaling
+         train = self._handle_generated_features('train')
+         test = self._handle_generated_features('test')
+         self.progress_bar.update()
+
+         # Temporarily pulling data for feature selection
+         # Will change after sto
+
+         # Checking for data imbalance
+         if self._check_data_imbalance(train):
+             train = self._data_sampling(train)
+         self.progress_bar.update()
+
+         # Performing feature selection using lasso followed by scaling
+         self._feature_selection_Lasso(train, test)
+         self._scaling_features(feature_selection_mtd="lasso")
+         self.progress_bar.update()
+
+         # Performing feature selection using rfe followed by scaling
+         self._feature_selection_RFE(train, test)
+         self._scaling_features(feature_selection_mtd="rfe")
+         self.progress_bar.update()
+
+         # Performing scaling followed by feature selection using pca
+         self._scaling_features(feature_selection_mtd="pca")
+         self._feature_selection_PCA()
+         self.progress_bar.update()
+
+         return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict
+
+
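# --- Illustrative sketch (not part of the diff above): how a caller such as the
# AutoML driver might consume data_preparation()'s return value. The setup below
# is an assumption (the class is internal, and attributes like progress_bar are
# configured elsewhere), so it is left commented out.
#
#   dp = _DataPreparation(data=df, target_column="price", excluded_columns=["id"])
#   feature_lists, transform_dict = dp.data_preparation(auto=True)
#   rfe_features, lasso_features, pca_features = feature_lists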
+     # Splits data into train and test
+     def _train_test_split(self):
+         """
+         DESCRIPTION:
+             Function splits the data into training and testing datasets based on
+             the configured training size (0.8 by default).
+         """
+         self._display_msg(msg="\nSplitting of dataset into training and testing ...",
+                           progress_bar=self.progress_bar,
+                           show_data=True)
+         self._display_msg(inline_msg="Training size : {}".format(self._train_size),
+                           progress_bar=self.progress_bar)
+         self._display_msg(inline_msg="Testing size : {}".format(round((1 - self._train_size), 2)),
+                           progress_bar=self.progress_bar)
+         start_time = time.time()
+         # Applying TrainTestSplit function on data
+         train_test_func_params = {
+             "data": self.data,
+             "id_column": "id",
+             "train_size": self._train_size,
+             "seed": 42
+         }
+         if self.is_classification_type():
+             train_test_func_params["stratify_column"] = self.target_column
+         train_test_split_out = TrainTestSplit(**train_test_func_params)
+         train_test_split_out = train_test_split_out.result
+
+         # Splitting the data into training and testing data
+         self.train_df = train_test_split_out[train_test_split_out['TD_IsTrainRow'] == 1].drop('TD_IsTrainRow', axis=1)
+         self.test_df = train_test_split_out[train_test_split_out['TD_IsTrainRow'] == 0].drop('TD_IsTrainRow', axis=1)
+
+         self._display_msg(msg="Training data sample",
+                           data=self.train_df,
+                           progress_bar=self.progress_bar)
+
+         self._display_msg(msg="Testing data sample",
+                           data=self.test_df,
+                           progress_bar=self.progress_bar)
+
+         end_time = time.time()
+         self._display_msg(msg="Time taken for splitting of data: {:.2f} sec".format(end_time - start_time),
+                           progress_bar=self.progress_bar,
+                           show_data=True)
+
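# --- Hedged sketch of the split idiom used above (commented out because it
# needs a live Vantage connection; "df" is an assumed teradataml DataFrame with
# an "id" column). TrainTestSplit marks each row with TD_IsTrainRow, which is
# then filtered on and dropped:
#
#   out = TrainTestSplit(data=df, id_column="id", train_size=0.80, seed=42).result
#   train = out[out['TD_IsTrainRow'] == 1].drop('TD_IsTrainRow', axis=1)
#   test = out[out['TD_IsTrainRow'] == 0].drop('TD_IsTrainRow', axis=1)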
+     def _set_custom_train_test_split(self):
+         """
+         DESCRIPTION:
+             Function to split the dataset into training and testing based on user input.
+         """
+         # Fetching user input for train test split
+         train_test_split_input = self.custom_data.get("TrainTestSplitIndicator", False)
+         if train_test_split_input:
+             # Extracting training size
+             custom_train_size = self.custom_data.get("TrainingSize", None)
+             if custom_train_size is None:
+                 self._display_msg(inline_msg="No information provided for training size. Proceeding with default option.",
+                                   progress_bar=self.progress_bar)
+             else:
+                 if not isinstance(custom_train_size, float):
+                     err = Messages.get_message(MessageCodes.INVALID_COLUMN_TYPE,
+                                                'custom_train', type(custom_train_size).__name__,
+                                                'float')
+                     raise TeradataMlException(err, MessageCodes.INVALID_COLUMN_TYPE)
+                 self._train_size = custom_train_size
+         else:
+             self._display_msg(inline_msg="No information provided for performing customized train test split. Proceeding with default option.",
+                               progress_bar=self.progress_bar)
+
+     def _handle_outliers(self,
+                          auto):
+         """
+         DESCRIPTION:
+             Function to handle existing outliers in the dataset based on the running mode.
+         """
+         if auto:
+             self._outlier_processing()
+         else:
+             self._custom_outlier_processing()
+
+     def _check_data_imbalance(self,
+                               data):
+         """
+         DESCRIPTION:
+             Internal function that calculates and checks for imbalance in the dataset
+             in case of classification.
+
+         PARAMETERS:
+             data:
+                 Required Argument.
+                 Specifies the input teradataml DataFrame.
+                 Types: teradataml DataFrame
+         """
+         pass
+
+     def _data_sampling(self,
+                        data):
+         """
+         DESCRIPTION:
+             Function to handle data imbalance in the dataset using sampling techniques
+             in case of classification.
+         """
+         pass
+
+     def _set_custom_sampling(self):
+         """
+         DESCRIPTION:
+             Internal function to handle customized data sampling for an imbalanced dataset.
+         """
+         pass
+
+     def _outlier_handling_techniques(self):
+         """
+         DESCRIPTION:
+             Function determines the handling technique [drop rows/impute values]
+             for outlier columns in the dataset.
+         """
+         columns_to_drop_rows = []
+         columns_to_impute = []
+         # Keeping default method for outlier detection "Tukey"
+         outlier_method = "Tukey"
+
+         # List of columns for outlier processing.
+         outlier_columns = [col for col in self.train_df.columns if col not in self.excluded_columns]
+
+         # Detecting outlier percentage in each column
+         outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
+
+         # Outlier handling techniques
+         for i in outlier_percentage_df.itertuples():
+             # Column Name
+             col = i[0]
+             # Outlier value
+             value = i[1]
+
+             if col == self.target_column:
+                 if value < 5.0 and value > 0.0:
+                     columns_to_drop_rows.append(col)
+             elif value > 0.0 and value <= 8.0:
+                 columns_to_drop_rows.append(col)
+             elif value > 8.0 and value <= 25.0:
+                 columns_to_impute.append(col)
+
+         return columns_to_drop_rows, columns_to_impute
+
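# --- Standalone sketch (not from the package) of the bucketing rule implemented
# above, extracted as a plain function: a non-target column with an outlier
# percentage in (0, 8] has its rows dropped and in (8, 25] is imputed, while the
# target column only qualifies for row dropping when its percentage is below 5.
# The function and column names below are local to this sketch.
def _bucket_outlier_column(col, value, target_column):
    """Return 'drop', 'impute', or None for a column's outlier percentage."""
    if col == target_column:
        return 'drop' if 0.0 < value < 5.0 else None
    if 0.0 < value <= 8.0:
        return 'drop'
    if 8.0 < value <= 25.0:
        return 'impute'
    return None

assert _bucket_outlier_column('price', 3.0, 'price') == 'drop'
assert _bucket_outlier_column('age', 12.0, 'price') == 'impute'
assert _bucket_outlier_column('age', 30.0, 'price') is None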
+     def _outlier_handling(self,
+                           target_columns,
+                           outlier_method,
+                           replacement_value):
+         """
+         DESCRIPTION:
+             Function to handle outliers for the target columns based on the outlier
+             method and replacement value. Updates self.train_df in place.
+
+         PARAMETERS:
+             target_columns:
+                 Required Argument.
+                 Specifies the target columns required for outlier handling.
+                 Types: str or list of strings (str)
+
+             outlier_method:
+                 Required Argument.
+                 Specifies the outlier method required for outlier handling.
+                 Types: str
+
+             replacement_value:
+                 Optional Argument.
+                 Specifies the value required in case of outlier replacement.
+                 Types: str, float
+         """
+         # Performing fit on train dataset for outlier handling
+         fit_params = {
+             "data": self.train_df,
+             "target_columns": target_columns,
+             "outlier_method": outlier_method,
+             "replacement_value": replacement_value
+         }
+         outlier_fit_out = OutlierFilterFit(**fit_params)
+         # Performing transform on train dataset for outlier handling
+         transform_params = {
+             "data": self.train_df,
+             "object": outlier_fit_out.result,
+             "persist": True
+         }
+         self.train_df = OutlierFilterTransform(**transform_params).result
+
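# --- Hedged sketch of the fit/transform pattern used above (commented out
# because it needs a live Vantage connection; the column name is an assumption).
# OutlierFilterFit learns per-column outlier bounds; OutlierFilterTransform
# applies the chosen replacement ("DELETE" drops rows, "MEDIAN" imputes) and
# persists the result:
#
#   fit = OutlierFilterFit(data=train_df, target_columns=["age"],
#                          outlier_method="Tukey", replacement_value="MEDIAN")
#   train_df = OutlierFilterTransform(data=train_df, object=fit.result,
#                                     persist=True).result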
+     def _outlier_processing(self):
+         """
+         DESCRIPTION:
+             Function performs outlier processing on the training dataset. It identifies
+             and handles outliers in the dataset.
+         """
+         self._display_msg(msg="\nOutlier preprocessing ...",
+                           progress_bar=self.progress_bar,
+                           show_data=True)
+         start_time = time.time()
+
+         # Lists of columns for dropping rows or imputing
+         columns_to_drop_rows, columns_to_impute = self._outlier_handling_techniques()
+         # Keeping default method for outlier handling "Tukey"
+         outlier_handling_method = "Tukey"
+
+         # Dropping rows
+         if len(columns_to_drop_rows) != 0:
+             self._display_msg(msg="Deleting rows of these columns:",
+                               col_lst=columns_to_drop_rows,
+                               progress_bar=self.progress_bar)
+             target_columns = columns_to_drop_rows
+             replacement_strategy = "DELETE"
+             self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+
+         # Imputing median value in place of outliers
+         if len(columns_to_impute) != 0:
+             self._display_msg(msg="Imputing median in place of outliers for these columns:",
+                               col_lst=columns_to_impute,
+                               progress_bar=self.progress_bar)
+             target_columns = columns_to_impute
+             replacement_strategy = "MEDIAN"
+             self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+
+         end_time = time.time()
+         self._display_msg(msg="Time taken by outlier processing: {:.2f} sec".format(end_time - start_time),
+                           progress_bar=self.progress_bar,
+                           show_data=True)
+
+     def _custom_outlier_processing(self):
+         """
+         DESCRIPTION:
+             Function to perform outlier processing on the training dataset based on user input.
+         """
+         self._display_msg(msg="\nStarting customized outlier processing ...",
+                           progress_bar=self.progress_bar,
+                           show_data=True)
+         outlier_filter_input = self.custom_data.get("OutlierFilterIndicator", False)
+         # Checking user input for outlier filtering
+         if outlier_filter_input:
+             # List of columns for outlier processing.
+             target_columns = [col for col in self.train_df.columns if col not in self.excluded_columns]
+             # Checking user input for outlier detection method
+             outlier_method = self.custom_data.get("OutlierDetectionMethod", None)
+             if outlier_method == 'PERCENTILE':
+                 lower_percentile = self.custom_data.get("OutlierLowerPercentile", None)
+                 upper_percentile = self.custom_data.get("OutlierUpperPercentile", None)
+                 if lower_percentile and upper_percentile:
+                     # Detecting outlier percentage for each column
+                     outlier_df = self._outlier_detection(outlier_method, target_columns,
+                                                          lower_percentile, upper_percentile)
+             else:
+                 # Detecting outlier percentage for each column in case of methods other than percentile
+                 outlier_df = self._outlier_detection(outlier_method, target_columns)
+
+             # Checking whether columns containing outliers exist
+             if outlier_df.shape[0]:
+                 # Checking user input list for outlier handling
+                 outlier_transform_list = self.custom_data.get("OutlierFilterParam", None)
+                 if outlier_transform_list:
+                     # Checking user input for outlier handling
+                     _Validators._validate_dataframe_has_argument_columns(list(outlier_transform_list.keys()), "OutlierFilterParam",
+                                                                          self.train_df, "train")
+
+                     for target_col, transform_val in outlier_transform_list.items():
+                         # Fetching replacement value
+                         replacement_value = transform_val["replacement_value"]
+                         # Performing outlier handling
+                         self._outlier_handling(target_col, outlier_method, replacement_value)
+                 else:
+                     self._display_msg(inline_msg="No information provided for feature transformation in outlier handling.",
+                                       progress_bar=self.progress_bar)
+             else:
+                 self._display_msg(inline_msg="No outliers found in dataset after applying the selected method.",
+                                   progress_bar=self.progress_bar)
+         else:
+             self._display_msg(inline_msg="No information provided for customized outlier processing. AutoML will proceed with default settings.",
+                               progress_bar=self.progress_bar)
+             # Performing default handling for outliers
+             self._outlier_processing()
+
+     # Function for getting the value of "K" in k-fold cross-validation
+     def _num_of_folds(self, rows=None):
+         """
+         DESCRIPTION:
+             Function to determine the number of folds for cross-validation
+             based on the number of rows in the dataset.
+
+         PARAMETERS:
+             rows:
+                 Required Argument.
+                 Specifies the number of rows in the dataset.
+                 Types: int
+
+         RETURNS:
+             int, number of folds to be used for cross-validation.
+         """
+         num_of_folds = lambda rows: 1 if rows > 20000 else (3 if 1000 < rows <= 20000 else 10)
+         return num_of_folds(rows)
+
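# --- Standalone sketch (not from the package) checking the fold thresholds
# implemented by _num_of_folds above: more than 20000 rows -> 1, between 1000
# (exclusive) and 20000 (inclusive) -> 3, otherwise 10.
def _folds(rows):
    return 1 if rows > 20000 else (3 if 1000 < rows <= 20000 else 10)

assert _folds(50000) == 1
assert _folds(5000) == 3
assert _folds(500) == 10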
+     def _feature_selection_PCA(self):
+         """
+         DESCRIPTION:
+             Function performs Principal Component Analysis (PCA) for feature selection.
+             It reduces the dimensionality of the dataset by identifying and retaining
+             the most informative features.
+         """
+         self._display_msg(msg="\nDimension reduction using pca ...",
+                           progress_bar=self.progress_bar,
+                           show_data=True)
+         # Required imports for PCA
+         from sklearn.decomposition import PCA
+
+         start_time = time.time()
+         # Training and testing data using pandas dataframe
+         # Temporarily pulling data for feature selection
+         train = DataFrame.from_table(self.table_name_mapping['pca_train']).to_pandas()
+         test = DataFrame.from_table(self.table_name_mapping['pca_test']).to_pandas()
+
+         # Drop unnecessary columns and store the result
+         train_data = train.drop(columns=['id', self.target_column], axis=1)
+         test_data = test.drop(columns=['id', self.target_column], axis=1)
+
+         # Initialize and fit PCA
+         pca = PCA()
+         pca.fit(train_data)
+
+         # Find the number of components for PCA
+         variance = pca.explained_variance_ratio_
+         n = np.argmax(np.cumsum(variance) >= 0.95) + 1
+
+         # Create a new instance of PCA with the optimal number of components
+         pca = PCA(n_components=n, random_state=42)
+
+         # Apply PCA on training and testing dataset
+         X_train_pca = pca.fit_transform(train_data)
+         X_test_pca = pca.transform(test_data)
+
+         # Storing the instance of PCA in the data transformation dictionary
+         self.data_transform_dict["pca_fit_instance"] = pca
+
+         # Converting the numpy arrays into dataframes
+         train_df = pd.DataFrame(X_train_pca)
+         test_df = pd.DataFrame(X_test_pca)
+
+         # Creating names for the combined columns
+         column_name = {col: 'col_' + str(i) for i, col in enumerate(train_df.columns)}
+
+         # Storing the new column names in the data transformation dictionary
+         self.data_transform_dict['pca_new_column'] = column_name
+
+         # Renaming them
+         train_df = train_df.rename(columns=column_name)
+         test_df = test_df.rename(columns=column_name)
+
+         # Adding the id column [PCA does not shuffle the dataset]
+         train_df = pd.concat([train.reset_index(drop=True)['id'], train_df.reset_index(drop=True)], axis=1)
+         test_df = pd.concat([test.reset_index(drop=True)['id'], test_df.reset_index(drop=True)], axis=1)
+
+         # Merging target column with new training and testing data
+         train_df[self.target_column] = train[self.target_column].reset_index(drop=True)
+         test_df[self.target_column] = test[self.target_column].reset_index(drop=True)
+
+         self.pca_feature = train_df.drop(columns=['id', self.target_column], axis=1).columns.tolist()
+
+         self._display_msg(msg="PCA columns:",
+                           col_lst=self.pca_feature,
+                           progress_bar=self.progress_bar)
+         end_time = time.time()
+         self._display_msg(msg="Total time taken by PCA: {:.2f} sec".format(end_time - start_time),
+                           progress_bar=self.progress_bar,
+                           show_data=True)
+
+         if self.is_classification_type():
+             train_df[self.target_column] = train_df[self.target_column].astype('int')
+             test_df[self.target_column] = test_df[self.target_column].astype('int')
+
+         # Pushing the data into the database
+         self.copy_dataframe_to_sql(train_df, test_df, 'pca')
+
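# --- Runnable sketch (not from the package) of the component-count rule above:
# keep the smallest n whose cumulative explained variance reaches 95%. The toy
# data below is an assumption made for illustration.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(42)
X = rng.rand(100, 5)
X[:, 3] = X[:, 0] * 2.0  # make one column redundant so fewer components suffice
pca = PCA().fit(X)
# First index where the cumulative variance ratio crosses 0.95, plus one
n = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1
X_reduced = PCA(n_components=n, random_state=42).fit_transform(X)
assert X_reduced.shape == (100, n) and n <= 5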
567
+ def _feature_selection_RFE(self,
568
+ train=None,
569
+ test=None):
570
+ """
571
+ DESCRIPTION:
572
+ Function performs Recursive Feature Elimination (RFE) for feature selection.
573
+ It identifies a subset of the most relevant features in the dataset.
574
+
575
+ PARAMETERS:
576
+ train:
577
+ Required Argument.
578
+ Specifies the input train pandas DataFrame.
579
+ Types: pandas Dataframe
580
+
581
+ test:
582
+ Required Argument.
583
+ Specifies the input test pandas DataFrame.
584
+ Types: pandas Dataframe
585
+ """
586
+ self._display_msg(msg="\nFeature selection using rfe ...",
587
+ progress_bar=self.progress_bar,
588
+ show_data=True)
589
+
590
+ # Required imports for RFE
591
+ from sklearn.feature_selection import RFECV
592
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
593
+ from sklearn.model_selection import StratifiedKFold,KFold
594
+
595
+ start_time = time.time()
596
+ # Regression
597
+ is_classification = self.is_classification_type()
598
+ # Getting the value of k in k-fold cross-validation
599
+ folds = self._num_of_folds(train.shape[0])
600
+
601
+ # Random forest for RFE model
602
+ RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
603
+ rf = RFModel(n_estimators=100, random_state=42)
604
+
605
+ # Determine the scoring metric based on the number of unique classes
606
+ score = 'r2' if not self.is_classification_type() \
607
+ else 'roc_auc' if self.data.drop_duplicate(self.target_column).size == 2 else 'f1_macro'
608
+
609
+ # Instantiate StratifiedKFold with shuffling for classification
610
+ cv = folds if not self.is_classification_type() \
611
+ else StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
612
+
613
+ # Define the RFE with cross-validation
614
+ rfecv = RFECV(rf, cv=cv, scoring=score)
615
+
616
+ # Prepare the training data
617
+ train_data = train.drop(columns=['id',self.target_column], axis=1)
618
+ train_target = train[self.target_column]
619
+
620
+ # Fit the RFE using cv
621
+ rfecv.fit(train_data, train_target)
622
+
623
+ # Extract the features
624
+ features = train_data.columns[rfecv.support_].tolist()
625
+
626
+ self._display_msg(msg="feature selected by RFE:",
627
+ col_lst=features,
628
+ progress_bar=self.progress_bar)
629
+ features.append(self.target_column)
630
+ features.insert(0,'id')
631
+
632
+ train_df = train[features]
633
+ test_df = test[features]
634
+
635
+ # storing the rfe selected features in data transformation dictionary
636
+ self.data_transform_dict['rfe_features'] = features
637
+
638
+ columns_to_rename = [col for col in train_df.columns if col not in ['id', self.target_column]]
639
+ new_column = {col: f'r_{col}' for col in columns_to_rename}
640
+ self.excluded_columns.extend([new_column[key] for key in self.excluded_columns if key in new_column])
641
+
642
+ train_df.rename(columns=new_column, inplace=True)
643
+ test_df.rename(columns=new_column, inplace=True)
644
+
645
+ # storing the rename column list in data transformation dictionary
646
+ self.data_transform_dict['rfe_rename_column'] = columns_to_rename
647
+
648
+ end_time = time.time()
649
+ self._display_msg(msg="Total time taken by feature selection: {:.2f} sec ".format( end_time - start_time),
650
+ progress_bar=self.progress_bar,
651
+ show_data=True)
652
+ self.rfe_feature = train_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()
653
+
654
+ # Pushing data into database
655
+ self.copy_dataframe_to_sql(train_df, test_df, 'rfe')
656
+
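# --- Runnable sketch (not from the package) of the RFECV pattern above on toy
# regression data: a random forest ranks features, cross-validation decides how
# many to keep, and the boolean support_ mask selects column names.
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV

X, y = make_regression(n_samples=120, n_features=6, n_informative=3, random_state=42)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(6)])
rfecv = RFECV(RandomForestRegressor(n_estimators=100, random_state=42),
              cv=3, scoring="r2").fit(X, y)
selected = X.columns[rfecv.support_].tolist()
print(selected)  # some subset of f0..f5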
+     def _feature_selection_Lasso(self,
+                                  train=None,
+                                  test=None):
+         """
+         DESCRIPTION:
+             Function performs Lasso regression for feature selection.
+             It helps in identifying and retaining the most important features
+             while setting less important ones to zero.
+
+         PARAMETERS:
+             train:
+                 Required Argument.
+                 Specifies the input train pandas DataFrame.
+                 Types: pandas DataFrame
+
+             test:
+                 Required Argument.
+                 Specifies the input test pandas DataFrame.
+                 Types: pandas DataFrame
+         """
+         start_time = time.time()
+         self._display_msg(msg="\nFeature selection using lasso ...",
+                           progress_bar=self.progress_bar,
+                           show_data=True)
+
+         # Required imports for Lasso
+         from sklearn.model_selection import GridSearchCV
+         from sklearn.linear_model import Lasso
+         from sklearn.linear_model import LogisticRegression
+
+         # Getting the value of k in k-fold cross-validation
+         num_folds = self._num_of_folds(train.shape[0])
+
+         # Prepare the training data
+         train_features = train.drop(columns=['id', self.target_column], axis=1)
+         train_target = train[self.target_column]
+
+         # Determine the estimator and parameters based on the type of problem
+         if self.is_classification_type():
+             if self.data.drop_duplicate(self.target_column).size == 2:
+                 scoring_metric = 'roc_auc'
+             else:
+                 scoring_metric = 'f1_macro'
+             estimator = LogisticRegression(penalty='l1', solver='liblinear', multi_class='auto')
+             parameters = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 10, 100, 1000], 'max_iter': [100, 500]}
+         else:
+             estimator = Lasso()
+             parameters = {'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 10, 100, 1000], 'max_iter': [100, 500]}
+             scoring_metric = "r2"
+
+         # Applying hyperparameter tuning and optimizing the score
+         hyperparameter_search = GridSearchCV(estimator, parameters, cv=num_folds, scoring=scoring_metric, verbose=0)
+
+         # Fitting the best result from hyperparameter tuning
+         hyperparameter_search.fit(train_features, train_target)
+
+         # Extracting the coefficients of the best estimator
+         feature_importance = np.abs(hyperparameter_search.best_estimator_.coef_)
+
+         # Extracting features whose importance > 0
+         if self.is_classification_type():
+             selected_feature_indices = np.where(np.any(feature_importance > 0, axis=0))[0]
+             selected_features = np.array(train_features.columns)[selected_feature_indices]
+             important_features = list(set(selected_features))
+         else:
+             important_features = np.array(train_features.columns)[feature_importance > 0].tolist()
+
+         self._display_msg(msg="Features selected by lasso:",
+                           col_lst=important_features,
+                           progress_bar=self.progress_bar)
+
+         important_features = ['id'] + important_features + [self.target_column]
+         train_df = train[important_features]
+         test_df = test[important_features]
+
+         # Storing the lasso-selected features in the data transformation dictionary
+         self.data_transform_dict['lasso_features'] = important_features
+
+         # Calculate the elapsed time
+         end_time = time.time()
+         self._display_msg(msg="Total time taken by feature selection: {:.2f} sec".format(end_time - start_time),
+                           progress_bar=self.progress_bar,
+                           show_data=True)
+         self.lasso_feature = train_df.drop(columns=['id', self.target_column], axis=1).columns.tolist()
+
+         self.copy_dataframe_to_sql(train_df, test_df, 'lasso')
+
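# --- Runnable sketch (not from the package) of the regression branch above:
# GridSearchCV tunes Lasso's alpha, then features with a non-zero |coef_|
# survive selection. The toy data and reduced grid are assumptions.
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

X, y = make_regression(n_samples=150, n_features=8, n_informative=3, random_state=42)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(8)])
search = GridSearchCV(Lasso(),
                      {"alpha": [0.001, 0.01, 0.1, 10, 100], "max_iter": [100, 500]},
                      cv=3, scoring="r2").fit(X, y)
importance = np.abs(search.best_estimator_.coef_)
important_features = np.array(X.columns)[importance > 0].tolist()
print(important_features)  # features Lasso kept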
+     def copy_dataframe_to_sql(self,
+                               train,
+                               test,
+                               prefix):
+         """
+         DESCRIPTION:
+             Function to copy dataframes to SQL tables with generated table names.
+
+         PARAMETERS:
+             train:
+                 Required Argument.
+                 Specifies the input train pandas DataFrame.
+                 Types: pandas DataFrame
+
+             test:
+                 Required Argument.
+                 Specifies the input test pandas DataFrame.
+                 Types: pandas DataFrame
+
+             prefix:
+                 Required Argument.
+                 Specifies the prefix for the table name.
+                 Types: str
+         """
+         # Generating table names
+         train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
+                                                                table_type=TeradataConstants.TERADATA_TABLE)
+         test_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_test'.format(prefix),
+                                                               table_type=TeradataConstants.TERADATA_TABLE)
+
+         # Storing the table names in the table name mapping dictionary
+         self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
+         self.table_name_mapping['{}_test'.format(prefix)] = test_table_name
+
+         # Pushing data into the database
+         copy_to_sql(df=train, table_name=train_table_name, if_exists="replace")
+         copy_to_sql(df=test, table_name=test_table_name, if_exists="replace")
+
+
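# --- Hedged sketch of the persistence pattern above (commented out; needs a
# Vantage connection). A temporary table name is generated per prefix and the
# pandas frame is written with copy_to_sql, replacing any prior table:
#
#   name = UtilFuncs._generate_temp_table_name(prefix="lasso_train",
#                                              table_type=TeradataConstants.TERADATA_TABLE)
#   copy_to_sql(df=train_df, table_name=name, if_exists="replace")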
+     def _scaling_features_helper(self,
+                                  train=None,
+                                  feature_selection_mtd=None):
+         """
+         DESCRIPTION:
+             This function selects the features on which feature scaling should be applied.
+
+         PARAMETERS:
+             train:
+                 Required Argument.
+                 Specifies the training data.
+                 Types: teradataml DataFrame
+
+             feature_selection_mtd:
+                 Required Argument.
+                 Specifies the feature selection algorithm used.
+                 Types: str
+
+         RETURNS:
+             list containing the columns to be scaled.
+         """
+         columns_to_scale = []
+
+         # Iterating over the columns
+         for col in train.columns:
+             # Selecting columns that will be scaled
+             # Excluding target_col and columns with a single value
+             if col not in ['id', self.target_column] and train.drop_duplicate(col).size > 1:
+                 columns_to_scale.append(col)
+
+         if feature_selection_mtd == "lasso":
+             self.lasso_feature = columns_to_scale
+         elif feature_selection_mtd == "rfe":
+             self.rfe_feature = columns_to_scale
+         else:
+             self.pca_feature = columns_to_scale
+
+         columns_to_scale = [col for col in columns_to_scale if col not in self.excluded_columns]
+         return columns_to_scale
+
+     def _scaling_features(self,
+                           feature_selection_mtd=None):
+         """
+         DESCRIPTION:
+             Function performs feature scaling on columns present inside the dataset
+             using scaling methods [RANGE/ABS/STD/USTD/MEAN/MIDRANGE/RESCALE].
+
+         PARAMETERS:
+             feature_selection_mtd:
+                 Required Argument.
+                 Specifies the feature selection algorithm used.
+                 Types: str
+         """
+         self._display_msg(msg="\nScaling features of {} data ...".format(feature_selection_mtd),
+                           progress_bar=self.progress_bar,
+                           show_data=True)
+
+         start_time = time.time()
+         train = None
+         test = None
+
+         if self.is_classification_type():
+             scale_method = self._scale_method_cls
+         else:
+             scale_method = self._scale_method_reg
+
+         # Loading data for feature scaling based on the feature selection method
+         if feature_selection_mtd == 'rfe':
+             train = DataFrame(self.table_name_mapping['rfe_train'])
+             test = DataFrame(self.table_name_mapping['rfe_test'])
+         elif feature_selection_mtd == 'lasso':
+             train = DataFrame(self.table_name_mapping['lasso_train'])
+             test = DataFrame(self.table_name_mapping['lasso_test'])
+         else:
+             train = self.train_df
+             test = self.test_df
+
+         # List of columns that will be scaled
+         scale_col = self._scaling_features_helper(train, feature_selection_mtd)
+
+         if len(scale_col) != 0:
+             self._display_msg(msg="Columns that will be scaled:",
+                               col_lst=scale_col,
+                               progress_bar=self.progress_bar)
+
+             # Scale fit
+             fit_obj = ScaleFit(data=train,
+                                target_columns=scale_col,
+                                scale_method=scale_method)
+
+             # Storing the scale fit object and columns in the data transformation dictionary
+             self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj
+             self.data_transform_dict['{}_scale_col'.format(feature_selection_mtd)] = scale_col
+
+             # List of columns to copy to the output generated by scale transform
+             accumulate_cols = list(set(train.columns) - set(scale_col))
+
+             # Scaling on training dataset
+             tr_obj = ScaleTransform(data=train,
+                                     object=fit_obj,
+                                     accumulate=accumulate_cols)
+
+             # Scaling on testing dataset
+             ts_obj = ScaleTransform(data=test,
+                                     object=fit_obj,
+                                     accumulate=accumulate_cols)
+
+             train = tr_obj.result
+             test = ts_obj.result
+
+             self._display_msg(msg="Training dataset sample after scaling:",
+                               data=train,
+                               progress_bar=self.progress_bar)
+             self._display_msg(msg="Testing dataset sample after scaling:",
+                               data=test,
+                               progress_bar=self.progress_bar)
+         else:
+             self._display_msg(msg="No columns to scale.",
+                               progress_bar=self.progress_bar)
+
+         if self.is_classification_type():
+             train, test = self._bigint_to_int(train, test)
+
+         self.copy_dataframe_to_sql(train, test, feature_selection_mtd)
+
+         end_time = time.time()
+         self._display_msg(msg="Total time taken by feature scaling: {:.2f} sec".format(end_time - start_time),
+                           progress_bar=self.progress_bar,
+                           show_data=True)
+
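# --- Hedged sketch of the scaling pattern above (commented out; needs a
# Vantage connection, and the column names are assumptions). ScaleFit learns
# the statistics for the chosen method ("STD" for regression, "RANGE" for
# classification here); ScaleTransform applies them, carrying the unscaled
# columns through via "accumulate":
#
#   fit = ScaleFit(data=train, target_columns=scale_col, scale_method="STD")
#   train = ScaleTransform(data=train, object=fit,
#                          accumulate=["id", "price"]).result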
+     def _bigint_to_int(self, train, test):
+         """
+         DESCRIPTION:
+             Function pulls the train and test data to pandas and casts the
+             target column to int.
+         """
+         tr = train.to_pandas()
+         tr[self.target_column] = tr[self.target_column].astype('int')
+
+         ts = test.to_pandas()
+         ts[self.target_column] = ts[self.target_column].astype('int')
+
+         return tr, ts
+
+     def _set_custom_scaling_method(self):
+         """
+         DESCRIPTION:
+             Function to set the feature scaling method based on user input.
+         """
+         # Fetching user input for performing customized scaling
+         feature_scaling_input = self.custom_data.get("FeatureScalingIndicator", False)
+         # Checking user input for feature scaling
+         if feature_scaling_input:
+             # Extracting the scaling method
+             custom_scaling_method = self.custom_data.get("FeatureScalingMethod", None)
+             if custom_scaling_method is None:
+                 self._display_msg(inline_msg="No information provided for customized scaling method. AutoML will continue with default option.",
+                                   progress_bar=self.progress_bar)
+             else:
+                 if self.is_classification_type():
+                     self._scale_method_cls = custom_scaling_method
+                 else:
+                     self._scale_method_reg = custom_scaling_method
+         else:
+             self._display_msg(inline_msg="No information provided for performing customized feature scaling. Proceeding with default option.",
+                               progress_bar=self.progress_bar)
+
+
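# --- Illustrative sketch (not from the package) of a custom_data payload
# covering the keys this class reads; the keys come from the code above, the
# values are made up for illustration.
custom_data_example = {
    "TrainTestSplitIndicator": True,
    "TrainingSize": 0.75,                  # must be a float
    "OutlierFilterIndicator": True,
    "OutlierDetectionMethod": "PERCENTILE",
    "OutlierLowerPercentile": 0.05,
    "OutlierUpperPercentile": 0.95,
    "OutlierFilterParam": {"age": {"replacement_value": "MEDIAN"}},
    "FeatureScalingIndicator": True,
    "FeatureScalingMethod": "RANGE",
}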
+     def _handle_generated_features(self,
+                                    label=None):
+         """
+         DESCRIPTION:
+             Function to handle newly generated float features. It rounds them to
+             4 digits after the decimal point.
+
+         PARAMETERS:
+             label:
+                 Optional Argument.
+                 Specifies the label of the dataset on which rounding is performed, i.e.,
+                 'train' for the training and 'test' for the testing dataset.
+                 By default, it is None and the transformation is applied to the whole dataset.
+                 Types: str
+         """
+         # Checking the label and accordingly deciding the target dataset.
+         if label == 'train':
+             target_df = self.train_df
+         elif label == 'test':
+             target_df = self.test_df
+         else:
+             target_df = self.data
+
+         # Detecting the list of float columns in the target dataset
+         float_columns = [col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]
+
+         if len(float_columns) == 0:
+             return target_df.to_pandas()
+
+         # Storing the column details for round-up in the data transformation dictionary
+         self.data_transform_dict["round_columns"] = float_columns
+
+         # Extracting accumulate columns
+         accumulate_columns = self._extract_list(target_df.columns, float_columns)
+         # Rounding the target columns to 4 precision digits
+         fit_params = {
+             "data": target_df,
+             "target_columns": float_columns,
+             "precision_digit": 4,
+             "accumulate": accumulate_columns,
+             "persist": True}
+
+         obj = RoundColumns(**fit_params).result
+         df = obj.to_pandas()
+         return df.reset_index()
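# --- Hedged sketch of the rounding step above (commented out; needs a Vantage
# connection, and "df", "float_cols", "other_cols" are assumed names).
# RoundColumns rounds the float columns to 4 digits while the remaining columns
# pass through via "accumulate":
#
#   result = RoundColumns(data=df, target_columns=float_cols,
#                         precision_digit=4, accumulate=other_cols,
#                         persist=True).result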