teradataml 17.20.0.6__py3-none-any.whl → 20.0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (432)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +238 -1
  4. teradataml/__init__.py +13 -3
  5. teradataml/_version.py +1 -1
  6. teradataml/analytics/Transformations.py +4 -4
  7. teradataml/analytics/__init__.py +0 -2
  8. teradataml/analytics/analytic_function_executor.py +3 -0
  9. teradataml/analytics/json_parser/utils.py +13 -12
  10. teradataml/analytics/sqle/DecisionTreePredict.py +15 -30
  11. teradataml/analytics/sqle/NaiveBayesPredict.py +11 -20
  12. teradataml/analytics/sqle/__init__.py +0 -13
  13. teradataml/analytics/utils.py +1 -0
  14. teradataml/analytics/valib.py +3 -0
  15. teradataml/automl/__init__.py +1628 -0
  16. teradataml/automl/custom_json_utils.py +1270 -0
  17. teradataml/automl/data_preparation.py +993 -0
  18. teradataml/automl/data_transformation.py +727 -0
  19. teradataml/automl/feature_engineering.py +1648 -0
  20. teradataml/automl/feature_exploration.py +547 -0
  21. teradataml/automl/model_evaluation.py +163 -0
  22. teradataml/automl/model_training.py +887 -0
  23. teradataml/catalog/__init__.py +0 -2
  24. teradataml/catalog/byom.py +49 -6
  25. teradataml/catalog/function_argument_mapper.py +0 -2
  26. teradataml/catalog/model_cataloging_utils.py +2 -1021
  27. teradataml/common/aed_utils.py +6 -2
  28. teradataml/common/constants.py +50 -58
  29. teradataml/common/deprecations.py +160 -0
  30. teradataml/common/garbagecollector.py +61 -104
  31. teradataml/common/messagecodes.py +27 -36
  32. teradataml/common/messages.py +11 -15
  33. teradataml/common/utils.py +205 -287
  34. teradataml/common/wrapper_utils.py +1 -110
  35. teradataml/context/context.py +150 -78
  36. teradataml/data/bank_churn.csv +10001 -0
  37. teradataml/data/bmi.csv +501 -0
  38. teradataml/data/docs/sqle/docs_17_10/BincodeFit.py +3 -3
  39. teradataml/data/docs/sqle/docs_17_10/BincodeTransform.py +6 -5
  40. teradataml/data/docs/sqle/docs_17_10/Fit.py +1 -1
  41. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +1 -1
  42. teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +1 -1
  43. teradataml/data/docs/sqle/docs_17_10/PolynomialFeaturesTransform.py +2 -2
  44. teradataml/data/docs/sqle/docs_17_10/RowNormalizeTransform.py +2 -1
  45. teradataml/data/docs/sqle/docs_17_10/ScaleTransform.py +1 -0
  46. teradataml/data/docs/sqle/docs_17_10/SimpleImputeTransform.py +1 -1
  47. teradataml/data/docs/sqle/docs_17_10/Transform.py +2 -1
  48. teradataml/data/docs/sqle/docs_17_20/BincodeFit.py +3 -3
  49. teradataml/data/docs/sqle/docs_17_20/BincodeTransform.py +6 -5
  50. teradataml/data/docs/sqle/docs_17_20/Fit.py +1 -1
  51. teradataml/data/docs/sqle/docs_17_20/GLM.py +1 -1
  52. teradataml/data/docs/sqle/docs_17_20/GLMPredictPerSegment.py +9 -10
  53. teradataml/data/docs/sqle/docs_17_20/KMeansPredict.py +3 -2
  54. teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierPredict.py +16 -15
  55. teradataml/data/docs/sqle/docs_17_20/NaiveBayesTextClassifierTrainer.py +2 -2
  56. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +2 -2
  57. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineTransform.py +8 -8
  58. teradataml/data/docs/sqle/docs_17_20/OneClassSVMPredict.py +21 -20
  59. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +1 -1
  60. teradataml/data/docs/sqle/docs_17_20/OutlierFilterTransform.py +8 -3
  61. teradataml/data/docs/sqle/docs_17_20/PolynomialFeaturesTransform.py +6 -5
  62. teradataml/data/docs/sqle/docs_17_20/RandomProjectionTransform.py +6 -6
  63. teradataml/data/docs/sqle/docs_17_20/RowNormalizeTransform.py +2 -1
  64. teradataml/data/docs/sqle/docs_17_20/SVM.py +1 -1
  65. teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +16 -16
  66. teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +1 -0
  67. teradataml/data/docs/sqle/docs_17_20/SimpleImputeTransform.py +3 -2
  68. teradataml/data/docs/sqle/docs_17_20/TDDecisionForestPredict.py +4 -4
  69. teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +19 -19
  70. teradataml/data/docs/sqle/docs_17_20/TargetEncodingTransform.py +5 -4
  71. teradataml/data/docs/sqle/docs_17_20/Transform.py +2 -2
  72. teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +9 -9
  73. teradataml/data/fish.csv +160 -0
  74. teradataml/data/glass_types.csv +215 -0
  75. teradataml/data/insurance.csv +1 -1
  76. teradataml/data/iris_data.csv +151 -0
  77. teradataml/data/jsons/sqle/17.10/TD_FunctionTransform.json +1 -0
  78. teradataml/data/jsons/sqle/17.10/TD_OneHotEncodingTransform.json +1 -0
  79. teradataml/data/jsons/sqle/17.10/TD_OutlierFilterTransform.json +1 -0
  80. teradataml/data/jsons/sqle/17.10/TD_PolynomialFeaturesTransform.json +1 -0
  81. teradataml/data/jsons/sqle/17.10/TD_RowNormalizeTransform.json +1 -0
  82. teradataml/data/jsons/sqle/17.10/TD_ScaleTransform.json +1 -0
  83. teradataml/data/jsons/sqle/17.10/TD_SimpleImputeTransform.json +1 -0
  84. teradataml/data/load_example_data.py +3 -0
  85. teradataml/data/multi_model_classification.csv +401 -0
  86. teradataml/data/multi_model_regression.csv +401 -0
  87. teradataml/data/openml_example.json +63 -0
  88. teradataml/data/scripts/deploy_script.py +65 -0
  89. teradataml/data/scripts/mapper.R +20 -0
  90. teradataml/data/scripts/sklearn/__init__.py +0 -0
  91. teradataml/data/scripts/sklearn/sklearn_fit.py +175 -0
  92. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +135 -0
  93. teradataml/data/scripts/sklearn/sklearn_function.template +113 -0
  94. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +158 -0
  95. teradataml/data/scripts/sklearn/sklearn_neighbors.py +152 -0
  96. teradataml/data/scripts/sklearn/sklearn_score.py +128 -0
  97. teradataml/data/scripts/sklearn/sklearn_transform.py +179 -0
  98. teradataml/data/templates/open_source_ml.json +9 -0
  99. teradataml/data/teradataml_example.json +73 -1
  100. teradataml/data/test_classification.csv +101 -0
  101. teradataml/data/test_prediction.csv +101 -0
  102. teradataml/data/test_regression.csv +101 -0
  103. teradataml/data/train_multiclass.csv +101 -0
  104. teradataml/data/train_regression.csv +101 -0
  105. teradataml/data/train_regression_multiple_labels.csv +101 -0
  106. teradataml/data/wine_data.csv +1600 -0
  107. teradataml/dataframe/copy_to.py +79 -13
  108. teradataml/dataframe/data_transfer.py +8 -0
  109. teradataml/dataframe/dataframe.py +910 -311
  110. teradataml/dataframe/dataframe_utils.py +102 -5
  111. teradataml/dataframe/fastload.py +11 -3
  112. teradataml/dataframe/setop.py +15 -2
  113. teradataml/dataframe/sql.py +3735 -77
  114. teradataml/dataframe/sql_function_parameters.py +56 -5
  115. teradataml/dataframe/vantage_function_types.py +45 -1
  116. teradataml/dataframe/window.py +30 -29
  117. teradataml/dbutils/dbutils.py +18 -1
  118. teradataml/geospatial/geodataframe.py +18 -7
  119. teradataml/geospatial/geodataframecolumn.py +5 -0
  120. teradataml/hyperparameter_tuner/optimizer.py +910 -120
  121. teradataml/hyperparameter_tuner/utils.py +131 -37
  122. teradataml/lib/aed_0_1.dll +0 -0
  123. teradataml/lib/libaed_0_1.dylib +0 -0
  124. teradataml/lib/libaed_0_1.so +0 -0
  125. teradataml/libaed_0_1.dylib +0 -0
  126. teradataml/libaed_0_1.so +0 -0
  127. teradataml/opensource/__init__.py +1 -0
  128. teradataml/opensource/sklearn/__init__.py +1 -0
  129. teradataml/opensource/sklearn/_class.py +255 -0
  130. teradataml/opensource/sklearn/_sklearn_wrapper.py +1668 -0
  131. teradataml/opensource/sklearn/_wrapper_utils.py +268 -0
  132. teradataml/opensource/sklearn/constants.py +54 -0
  133. teradataml/options/__init__.py +3 -6
  134. teradataml/options/configure.py +21 -20
  135. teradataml/scriptmgmt/UserEnv.py +61 -5
  136. teradataml/scriptmgmt/lls_utils.py +135 -53
  137. teradataml/table_operators/Apply.py +38 -6
  138. teradataml/table_operators/Script.py +45 -308
  139. teradataml/table_operators/TableOperator.py +182 -591
  140. teradataml/table_operators/__init__.py +0 -1
  141. teradataml/table_operators/table_operator_util.py +32 -40
  142. teradataml/utils/validators.py +127 -3
  143. {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/METADATA +243 -3
  144. {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/RECORD +147 -391
  145. teradataml/analytics/mle/AdaBoost.py +0 -651
  146. teradataml/analytics/mle/AdaBoostPredict.py +0 -564
  147. teradataml/analytics/mle/Antiselect.py +0 -342
  148. teradataml/analytics/mle/Arima.py +0 -641
  149. teradataml/analytics/mle/ArimaPredict.py +0 -477
  150. teradataml/analytics/mle/Attribution.py +0 -1070
  151. teradataml/analytics/mle/Betweenness.py +0 -658
  152. teradataml/analytics/mle/Burst.py +0 -711
  153. teradataml/analytics/mle/CCM.py +0 -600
  154. teradataml/analytics/mle/CCMPrepare.py +0 -324
  155. teradataml/analytics/mle/CFilter.py +0 -460
  156. teradataml/analytics/mle/ChangePointDetection.py +0 -572
  157. teradataml/analytics/mle/ChangePointDetectionRT.py +0 -477
  158. teradataml/analytics/mle/Closeness.py +0 -737
  159. teradataml/analytics/mle/ConfusionMatrix.py +0 -420
  160. teradataml/analytics/mle/Correlation.py +0 -477
  161. teradataml/analytics/mle/Correlation2.py +0 -573
  162. teradataml/analytics/mle/CoxHazardRatio.py +0 -679
  163. teradataml/analytics/mle/CoxPH.py +0 -556
  164. teradataml/analytics/mle/CoxSurvival.py +0 -478
  165. teradataml/analytics/mle/CumulativeMovAvg.py +0 -363
  166. teradataml/analytics/mle/DTW.py +0 -623
  167. teradataml/analytics/mle/DWT.py +0 -564
  168. teradataml/analytics/mle/DWT2D.py +0 -599
  169. teradataml/analytics/mle/DecisionForest.py +0 -716
  170. teradataml/analytics/mle/DecisionForestEvaluator.py +0 -363
  171. teradataml/analytics/mle/DecisionForestPredict.py +0 -561
  172. teradataml/analytics/mle/DecisionTree.py +0 -830
  173. teradataml/analytics/mle/DecisionTreePredict.py +0 -528
  174. teradataml/analytics/mle/ExponentialMovAvg.py +0 -418
  175. teradataml/analytics/mle/FMeasure.py +0 -402
  176. teradataml/analytics/mle/FPGrowth.py +0 -734
  177. teradataml/analytics/mle/FrequentPaths.py +0 -695
  178. teradataml/analytics/mle/GLM.py +0 -558
  179. teradataml/analytics/mle/GLML1L2.py +0 -547
  180. teradataml/analytics/mle/GLML1L2Predict.py +0 -519
  181. teradataml/analytics/mle/GLMPredict.py +0 -529
  182. teradataml/analytics/mle/HMMDecoder.py +0 -945
  183. teradataml/analytics/mle/HMMEvaluator.py +0 -901
  184. teradataml/analytics/mle/HMMSupervised.py +0 -521
  185. teradataml/analytics/mle/HMMUnsupervised.py +0 -572
  186. teradataml/analytics/mle/Histogram.py +0 -561
  187. teradataml/analytics/mle/IDWT.py +0 -476
  188. teradataml/analytics/mle/IDWT2D.py +0 -493
  189. teradataml/analytics/mle/IdentityMatch.py +0 -763
  190. teradataml/analytics/mle/Interpolator.py +0 -918
  191. teradataml/analytics/mle/KMeans.py +0 -485
  192. teradataml/analytics/mle/KNN.py +0 -627
  193. teradataml/analytics/mle/KNNRecommender.py +0 -488
  194. teradataml/analytics/mle/KNNRecommenderPredict.py +0 -581
  195. teradataml/analytics/mle/LAR.py +0 -439
  196. teradataml/analytics/mle/LARPredict.py +0 -478
  197. teradataml/analytics/mle/LDA.py +0 -548
  198. teradataml/analytics/mle/LDAInference.py +0 -492
  199. teradataml/analytics/mle/LDATopicSummary.py +0 -464
  200. teradataml/analytics/mle/LevenshteinDistance.py +0 -450
  201. teradataml/analytics/mle/LinReg.py +0 -433
  202. teradataml/analytics/mle/LinRegPredict.py +0 -438
  203. teradataml/analytics/mle/MinHash.py +0 -544
  204. teradataml/analytics/mle/Modularity.py +0 -587
  205. teradataml/analytics/mle/NEREvaluator.py +0 -410
  206. teradataml/analytics/mle/NERExtractor.py +0 -595
  207. teradataml/analytics/mle/NERTrainer.py +0 -458
  208. teradataml/analytics/mle/NGrams.py +0 -570
  209. teradataml/analytics/mle/NPath.py +0 -634
  210. teradataml/analytics/mle/NTree.py +0 -549
  211. teradataml/analytics/mle/NaiveBayes.py +0 -462
  212. teradataml/analytics/mle/NaiveBayesPredict.py +0 -513
  213. teradataml/analytics/mle/NaiveBayesTextClassifier.py +0 -607
  214. teradataml/analytics/mle/NaiveBayesTextClassifier2.py +0 -531
  215. teradataml/analytics/mle/NaiveBayesTextClassifierPredict.py +0 -799
  216. teradataml/analytics/mle/NamedEntityFinder.py +0 -529
  217. teradataml/analytics/mle/NamedEntityFinderEvaluator.py +0 -414
  218. teradataml/analytics/mle/NamedEntityFinderTrainer.py +0 -396
  219. teradataml/analytics/mle/POSTagger.py +0 -417
  220. teradataml/analytics/mle/Pack.py +0 -411
  221. teradataml/analytics/mle/PageRank.py +0 -535
  222. teradataml/analytics/mle/PathAnalyzer.py +0 -426
  223. teradataml/analytics/mle/PathGenerator.py +0 -367
  224. teradataml/analytics/mle/PathStart.py +0 -464
  225. teradataml/analytics/mle/PathSummarizer.py +0 -470
  226. teradataml/analytics/mle/Pivot.py +0 -471
  227. teradataml/analytics/mle/ROC.py +0 -425
  228. teradataml/analytics/mle/RandomSample.py +0 -637
  229. teradataml/analytics/mle/RandomWalkSample.py +0 -490
  230. teradataml/analytics/mle/SAX.py +0 -779
  231. teradataml/analytics/mle/SVMDense.py +0 -677
  232. teradataml/analytics/mle/SVMDensePredict.py +0 -536
  233. teradataml/analytics/mle/SVMDenseSummary.py +0 -437
  234. teradataml/analytics/mle/SVMSparse.py +0 -557
  235. teradataml/analytics/mle/SVMSparsePredict.py +0 -553
  236. teradataml/analytics/mle/SVMSparseSummary.py +0 -435
  237. teradataml/analytics/mle/Sampling.py +0 -549
  238. teradataml/analytics/mle/Scale.py +0 -565
  239. teradataml/analytics/mle/ScaleByPartition.py +0 -496
  240. teradataml/analytics/mle/ScaleMap.py +0 -378
  241. teradataml/analytics/mle/ScaleSummary.py +0 -320
  242. teradataml/analytics/mle/SentenceExtractor.py +0 -363
  243. teradataml/analytics/mle/SentimentEvaluator.py +0 -432
  244. teradataml/analytics/mle/SentimentExtractor.py +0 -578
  245. teradataml/analytics/mle/SentimentTrainer.py +0 -405
  246. teradataml/analytics/mle/SeriesSplitter.py +0 -641
  247. teradataml/analytics/mle/Sessionize.py +0 -475
  248. teradataml/analytics/mle/SimpleMovAvg.py +0 -397
  249. teradataml/analytics/mle/StringSimilarity.py +0 -425
  250. teradataml/analytics/mle/TF.py +0 -389
  251. teradataml/analytics/mle/TFIDF.py +0 -504
  252. teradataml/analytics/mle/TextChunker.py +0 -414
  253. teradataml/analytics/mle/TextClassifier.py +0 -399
  254. teradataml/analytics/mle/TextClassifierEvaluator.py +0 -413
  255. teradataml/analytics/mle/TextClassifierTrainer.py +0 -565
  256. teradataml/analytics/mle/TextMorph.py +0 -494
  257. teradataml/analytics/mle/TextParser.py +0 -623
  258. teradataml/analytics/mle/TextTagger.py +0 -530
  259. teradataml/analytics/mle/TextTokenizer.py +0 -502
  260. teradataml/analytics/mle/UnivariateStatistics.py +0 -488
  261. teradataml/analytics/mle/Unpack.py +0 -526
  262. teradataml/analytics/mle/Unpivot.py +0 -438
  263. teradataml/analytics/mle/VarMax.py +0 -776
  264. teradataml/analytics/mle/VectorDistance.py +0 -762
  265. teradataml/analytics/mle/WeightedMovAvg.py +0 -400
  266. teradataml/analytics/mle/XGBoost.py +0 -842
  267. teradataml/analytics/mle/XGBoostPredict.py +0 -627
  268. teradataml/analytics/mle/__init__.py +0 -123
  269. teradataml/analytics/mle/json/adaboost_mle.json +0 -135
  270. teradataml/analytics/mle/json/adaboostpredict_mle.json +0 -85
  271. teradataml/analytics/mle/json/antiselect_mle.json +0 -34
  272. teradataml/analytics/mle/json/antiselect_mle_mle.json +0 -34
  273. teradataml/analytics/mle/json/arima_mle.json +0 -172
  274. teradataml/analytics/mle/json/arimapredict_mle.json +0 -52
  275. teradataml/analytics/mle/json/attribution_mle_mle.json +0 -143
  276. teradataml/analytics/mle/json/betweenness_mle.json +0 -97
  277. teradataml/analytics/mle/json/burst_mle.json +0 -140
  278. teradataml/analytics/mle/json/ccm_mle.json +0 -124
  279. teradataml/analytics/mle/json/ccmprepare_mle.json +0 -14
  280. teradataml/analytics/mle/json/cfilter_mle.json +0 -93
  281. teradataml/analytics/mle/json/changepointdetection_mle.json +0 -92
  282. teradataml/analytics/mle/json/changepointdetectionrt_mle.json +0 -78
  283. teradataml/analytics/mle/json/closeness_mle.json +0 -104
  284. teradataml/analytics/mle/json/confusionmatrix_mle.json +0 -79
  285. teradataml/analytics/mle/json/correlation_mle.json +0 -86
  286. teradataml/analytics/mle/json/correlationreduce_mle.json +0 -49
  287. teradataml/analytics/mle/json/coxhazardratio_mle.json +0 -89
  288. teradataml/analytics/mle/json/coxph_mle.json +0 -98
  289. teradataml/analytics/mle/json/coxsurvival_mle.json +0 -79
  290. teradataml/analytics/mle/json/cumulativemovavg_mle.json +0 -34
  291. teradataml/analytics/mle/json/decisionforest_mle.json +0 -167
  292. teradataml/analytics/mle/json/decisionforestevaluator_mle.json +0 -33
  293. teradataml/analytics/mle/json/decisionforestpredict_mle_mle.json +0 -74
  294. teradataml/analytics/mle/json/decisiontree_mle.json +0 -194
  295. teradataml/analytics/mle/json/decisiontreepredict_mle_mle.json +0 -86
  296. teradataml/analytics/mle/json/dtw_mle.json +0 -97
  297. teradataml/analytics/mle/json/dwt2d_mle.json +0 -116
  298. teradataml/analytics/mle/json/dwt_mle.json +0 -101
  299. teradataml/analytics/mle/json/exponentialmovavg_mle.json +0 -55
  300. teradataml/analytics/mle/json/fmeasure_mle.json +0 -58
  301. teradataml/analytics/mle/json/fpgrowth_mle.json +0 -159
  302. teradataml/analytics/mle/json/frequentpaths_mle.json +0 -129
  303. teradataml/analytics/mle/json/glm_mle.json +0 -111
  304. teradataml/analytics/mle/json/glml1l2_mle.json +0 -106
  305. teradataml/analytics/mle/json/glml1l2predict_mle.json +0 -57
  306. teradataml/analytics/mle/json/glmpredict_mle_mle.json +0 -74
  307. teradataml/analytics/mle/json/histogram_mle.json +0 -100
  308. teradataml/analytics/mle/json/hmmdecoder_mle.json +0 -192
  309. teradataml/analytics/mle/json/hmmevaluator_mle.json +0 -206
  310. teradataml/analytics/mle/json/hmmsupervised_mle.json +0 -91
  311. teradataml/analytics/mle/json/hmmunsupervised_mle.json +0 -114
  312. teradataml/analytics/mle/json/identitymatch_mle.json +0 -88
  313. teradataml/analytics/mle/json/idwt2d_mle.json +0 -73
  314. teradataml/analytics/mle/json/idwt_mle.json +0 -66
  315. teradataml/analytics/mle/json/interpolator_mle.json +0 -151
  316. teradataml/analytics/mle/json/kmeans_mle.json +0 -97
  317. teradataml/analytics/mle/json/knn_mle.json +0 -141
  318. teradataml/analytics/mle/json/knnrecommender_mle.json +0 -111
  319. teradataml/analytics/mle/json/knnrecommenderpredict_mle.json +0 -75
  320. teradataml/analytics/mle/json/lar_mle.json +0 -78
  321. teradataml/analytics/mle/json/larpredict_mle.json +0 -69
  322. teradataml/analytics/mle/json/lda_mle.json +0 -130
  323. teradataml/analytics/mle/json/ldainference_mle.json +0 -78
  324. teradataml/analytics/mle/json/ldatopicsummary_mle.json +0 -64
  325. teradataml/analytics/mle/json/levenshteindistance_mle.json +0 -92
  326. teradataml/analytics/mle/json/linreg_mle.json +0 -42
  327. teradataml/analytics/mle/json/linregpredict_mle.json +0 -56
  328. teradataml/analytics/mle/json/minhash_mle.json +0 -113
  329. teradataml/analytics/mle/json/modularity_mle.json +0 -91
  330. teradataml/analytics/mle/json/naivebayespredict_mle_mle.json +0 -85
  331. teradataml/analytics/mle/json/naivebayesreduce_mle.json +0 -52
  332. teradataml/analytics/mle/json/naivebayestextclassifierpredict_mle_mle.json +0 -147
  333. teradataml/analytics/mle/json/naivebayestextclassifiertrainer2_mle.json +0 -108
  334. teradataml/analytics/mle/json/naivebayestextclassifiertrainer_mle.json +0 -102
  335. teradataml/analytics/mle/json/namedentityfinder_mle.json +0 -84
  336. teradataml/analytics/mle/json/namedentityfinderevaluatorreduce_mle.json +0 -43
  337. teradataml/analytics/mle/json/namedentityfindertrainer_mle.json +0 -64
  338. teradataml/analytics/mle/json/nerevaluator_mle.json +0 -54
  339. teradataml/analytics/mle/json/nerextractor_mle.json +0 -87
  340. teradataml/analytics/mle/json/nertrainer_mle.json +0 -89
  341. teradataml/analytics/mle/json/ngrams_mle.json +0 -137
  342. teradataml/analytics/mle/json/ngramsplitter_mle_mle.json +0 -137
  343. teradataml/analytics/mle/json/npath@coprocessor_mle.json +0 -73
  344. teradataml/analytics/mle/json/ntree@coprocessor_mle.json +0 -123
  345. teradataml/analytics/mle/json/pack_mle.json +0 -58
  346. teradataml/analytics/mle/json/pack_mle_mle.json +0 -58
  347. teradataml/analytics/mle/json/pagerank_mle.json +0 -81
  348. teradataml/analytics/mle/json/pathanalyzer_mle.json +0 -63
  349. teradataml/analytics/mle/json/pathgenerator_mle.json +0 -40
  350. teradataml/analytics/mle/json/pathstart_mle.json +0 -62
  351. teradataml/analytics/mle/json/pathsummarizer_mle.json +0 -72
  352. teradataml/analytics/mle/json/pivoting_mle.json +0 -71
  353. teradataml/analytics/mle/json/postagger_mle.json +0 -51
  354. teradataml/analytics/mle/json/randomsample_mle.json +0 -131
  355. teradataml/analytics/mle/json/randomwalksample_mle.json +0 -85
  356. teradataml/analytics/mle/json/roc_mle.json +0 -73
  357. teradataml/analytics/mle/json/sampling_mle.json +0 -75
  358. teradataml/analytics/mle/json/sax_mle.json +0 -154
  359. teradataml/analytics/mle/json/scale_mle.json +0 -93
  360. teradataml/analytics/mle/json/scalebypartition_mle.json +0 -89
  361. teradataml/analytics/mle/json/scalemap_mle.json +0 -44
  362. teradataml/analytics/mle/json/scalesummary_mle.json +0 -14
  363. teradataml/analytics/mle/json/sentenceextractor_mle.json +0 -41
  364. teradataml/analytics/mle/json/sentimentevaluator_mle.json +0 -43
  365. teradataml/analytics/mle/json/sentimentextractor_mle.json +0 -100
  366. teradataml/analytics/mle/json/sentimenttrainer_mle.json +0 -68
  367. teradataml/analytics/mle/json/seriessplitter_mle.json +0 -133
  368. teradataml/analytics/mle/json/sessionize_mle_mle.json +0 -62
  369. teradataml/analytics/mle/json/simplemovavg_mle.json +0 -48
  370. teradataml/analytics/mle/json/stringsimilarity_mle.json +0 -50
  371. teradataml/analytics/mle/json/stringsimilarity_mle_mle.json +0 -50
  372. teradataml/analytics/mle/json/svmdense_mle.json +0 -165
  373. teradataml/analytics/mle/json/svmdensepredict_mle.json +0 -95
  374. teradataml/analytics/mle/json/svmdensesummary_mle.json +0 -58
  375. teradataml/analytics/mle/json/svmsparse_mle.json +0 -148
  376. teradataml/analytics/mle/json/svmsparsepredict_mle_mle.json +0 -103
  377. teradataml/analytics/mle/json/svmsparsesummary_mle.json +0 -57
  378. teradataml/analytics/mle/json/textchunker_mle.json +0 -40
  379. teradataml/analytics/mle/json/textclassifier_mle.json +0 -51
  380. teradataml/analytics/mle/json/textclassifierevaluator_mle.json +0 -43
  381. teradataml/analytics/mle/json/textclassifiertrainer_mle.json +0 -103
  382. teradataml/analytics/mle/json/textmorph_mle.json +0 -63
  383. teradataml/analytics/mle/json/textparser_mle.json +0 -166
  384. teradataml/analytics/mle/json/texttagger_mle.json +0 -81
  385. teradataml/analytics/mle/json/texttokenizer_mle.json +0 -91
  386. teradataml/analytics/mle/json/tf_mle.json +0 -33
  387. teradataml/analytics/mle/json/tfidf_mle.json +0 -34
  388. teradataml/analytics/mle/json/univariatestatistics_mle.json +0 -81
  389. teradataml/analytics/mle/json/unpack_mle.json +0 -91
  390. teradataml/analytics/mle/json/unpack_mle_mle.json +0 -91
  391. teradataml/analytics/mle/json/unpivoting_mle.json +0 -63
  392. teradataml/analytics/mle/json/varmax_mle.json +0 -176
  393. teradataml/analytics/mle/json/vectordistance_mle.json +0 -179
  394. teradataml/analytics/mle/json/weightedmovavg_mle.json +0 -48
  395. teradataml/analytics/mle/json/xgboost_mle.json +0 -178
  396. teradataml/analytics/mle/json/xgboostpredict_mle.json +0 -104
  397. teradataml/analytics/sqle/Antiselect.py +0 -321
  398. teradataml/analytics/sqle/Attribution.py +0 -603
  399. teradataml/analytics/sqle/DecisionForestPredict.py +0 -408
  400. teradataml/analytics/sqle/GLMPredict.py +0 -430
  401. teradataml/analytics/sqle/MovingAverage.py +0 -543
  402. teradataml/analytics/sqle/NGramSplitter.py +0 -548
  403. teradataml/analytics/sqle/NPath.py +0 -632
  404. teradataml/analytics/sqle/NaiveBayesTextClassifierPredict.py +0 -515
  405. teradataml/analytics/sqle/Pack.py +0 -388
  406. teradataml/analytics/sqle/SVMSparsePredict.py +0 -464
  407. teradataml/analytics/sqle/Sessionize.py +0 -390
  408. teradataml/analytics/sqle/StringSimilarity.py +0 -400
  409. teradataml/analytics/sqle/Unpack.py +0 -503
  410. teradataml/analytics/sqle/json/antiselect_sqle.json +0 -21
  411. teradataml/analytics/sqle/json/attribution_sqle.json +0 -92
  412. teradataml/analytics/sqle/json/decisionforestpredict_sqle.json +0 -48
  413. teradataml/analytics/sqle/json/glmpredict_sqle.json +0 -48
  414. teradataml/analytics/sqle/json/h2opredict_sqle.json +0 -63
  415. teradataml/analytics/sqle/json/movingaverage_sqle.json +0 -58
  416. teradataml/analytics/sqle/json/naivebayestextclassifierpredict_sqle.json +0 -76
  417. teradataml/analytics/sqle/json/ngramsplitter_sqle.json +0 -126
  418. teradataml/analytics/sqle/json/npath_sqle.json +0 -67
  419. teradataml/analytics/sqle/json/pack_sqle.json +0 -47
  420. teradataml/analytics/sqle/json/pmmlpredict_sqle.json +0 -55
  421. teradataml/analytics/sqle/json/sessionize_sqle.json +0 -43
  422. teradataml/analytics/sqle/json/stringsimilarity_sqle.json +0 -39
  423. teradataml/analytics/sqle/json/svmsparsepredict_sqle.json +0 -74
  424. teradataml/analytics/sqle/json/unpack_sqle.json +0 -80
  425. teradataml/catalog/model_cataloging.py +0 -980
  426. teradataml/config/mlengine_alias_definitions_v1.0 +0 -118
  427. teradataml/config/mlengine_alias_definitions_v1.1 +0 -127
  428. teradataml/config/mlengine_alias_definitions_v1.3 +0 -129
  429. teradataml/table_operators/sandbox_container_util.py +0 -643
  430. {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/WHEEL +0 -0
  431. {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/top_level.txt +0 -0
  432. {teradataml-17.20.0.6.dist-info → teradataml-20.0.0.0.dist-info}/zip-safe +0 -0
@@ -17,7 +17,9 @@ import numpy as np
 import pandas as pd
 import random
 import time
+import threading
 from itertools import product
+from collections import defaultdict
 from teradataml import DataFrame, valib, TeradataMlException
 from teradataml.common.messages import Messages, MessageCodes
 from teradataml.hyperparameter_tuner.utils import _ProgressBar
@@ -171,7 +173,9 @@ class _BaseSearch:
         self.__progress_bar = None
         # '__model_err_records' holds error messages of failed model.
         self.__model_err_records = dict()
-
+        # '__parallel_stop_event' is used to stop threads in parallel execution.
+        self.__parallel_stop_event = None
+
         # Get the function name.
         self.__func_name = func._tdml_valib_name if "_VALIB" in str(func.__class__) \
             else func.__name__
@@ -227,6 +231,9 @@ class _BaseSearch:
             if self.__func_comparator[self.__evaluation_metric] \
             else self.__best_score_ <= self.__early_stop
 
+        # '_is_time_stoppable' function is to check whether HPT execution reached self.__timeout value.
+        self._is_time_stoppable = lambda : True if time.time() - self.__start_time >= self.__timeout else False
+
         # Special case comparator for "MPE" metrics.
         # When "curr_score" argument is 'None' then lambda function checks
         # for '_is_early_stoppable'. Otherwise, it checks for '_is_best_metrics'.
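
For orientation, the added '_is_time_stoppable' lambda is a plain elapsed-time check. A minimal standalone sketch of the same pattern (the names and the 30-second budget here are illustrative, not teradataml API):

    >>> import time
    >>> start_time, timeout = time.time(), 30
    >>> is_time_stoppable = lambda: time.time() - start_time >= timeout
    >>> is_time_stoppable()   # False until 30 seconds have elapsed
    False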
@@ -876,10 +883,6 @@ class _BaseSearch:
             self.__sampled_df_mapper[_data_id] = [{train_data_arg:_train_data},
                                                   {test_data_arg:_test_data}]
 
-        # Update model trainer function parameter grid.
-        self.__update_model_parameters()
-
-
     def __update_model_parameters(self):
         """
         DESCRIPTION:
@@ -924,16 +927,13 @@ class _BaseSearch:
             'data_id': 'DF_1'}
         ]
         """
-
         # Get data identifiers.
-        _model_ids = self.__sampled_df_mapper.keys()
-
+        _model_ids = self.__sampled_df_mapper.keys()
         # Update '_parameter_grid' with data identifiers by performing
         # cartesian product.
         self._parameter_grid = [{"param":param[0] , self.__DATA_ID:param[1]} for \
                                 param in product(self._parameter_grid, _model_ids)]
-
-
+
     def __validate_model_trainer_input_data_argument(self, data, is_optional_arg=True):
         """
         DESCRIPTION:
@@ -1006,6 +1006,7 @@ class _BaseSearch:
             stratify_column=None,
             sample_id_column=None,
             sample_seed=None,
+            max_time=None,
             **kwargs):
         """
         DESCRIPTION:
@@ -1146,6 +1147,12 @@ class _BaseSearch:
                 * Mandatory when "sample_seed" argument is present.
                 Types: str
 
+            max_time:
+                Optional Argument.
+                Specifies the maximum time for the completion of Hyperparameter tuning execution.
+                Default Value: None
+                Types: int or float
+
             kwargs:
                 Optional Argument.
                 Specifies the keyword arguments. Accepts additional arguments
@@ -1225,24 +1232,6 @@ class _BaseSearch:
         # Set the flag to notify fit method is called.
         self.__is_fit_called = True
 
-        if self.__is_trainable:
-            # "data" argument is a required argument for model trainer function
-            # when data argument is not passed with hyperparameters. On other side,
-            # "data" argument will be optional argument when data argument
-            # is passed with hyperparameters.
-            _is_optional_arg = self.__model_trainer_input_data is not None
-            # validate the model trainer function 'data' argument.
-            self.__validate_model_trainer_input_data_argument(data, _is_optional_arg)
-
-            if not data is None:
-                # '__model_trainer_input_data' is assigned with "data" argument,
-                # when user passes data argument in fit() method.
-                # Note: if user attempts to pass data argument in both "params"
-                #       argument as hyperparameters or "data" argument in fit()
-                #       method, then latest "data" argument value is considered
-                #       for model training.
-                self.__model_trainer_input_data = data
-
         # Validate "early_stop".
         arg_info_matrix = []
         arg_info_matrix.append(["early_stop", early_stop, True, (int, float)])
@@ -1251,24 +1240,29 @@ class _BaseSearch:
         arg_info_matrix.append(["wait", wait, True, (bool)])
         arg_info_matrix.append(["evaluation_metric", evaluation_metric, True,
                                 (str), True, list(self.__func_comparator)])
+        arg_info_matrix.append(["verbose", verbose, True, (int), True, [0,1,2]])
+        arg_info_matrix.append(["max_time", max_time, True, (int, float)])
 
         _Validators._validate_function_arguments(arg_info_matrix)
 
+        # set timeout value.
+        self.__timeout = max_time
+
+        self._setting_model_trainer_data(data)
+
         # Set the evaluation metrics.
         if evaluation_metric is not None:
             self.__evaluation_metric = evaluation_metric.upper()
         self.__early_stop = early_stop
-
         if self.__is_trainable and self.__is_evaluatable and self.__is_sqle_function:
+
             # When "evaluation_metric" is 'MPE' then use the spl comparators.
             if self.__evaluation_metric == "MPE":
                 self._is_best_metrics = self._is_early_stoppable = self._spl_abs_comparator
 
         if not isinstance(self.__model_trainer_input_data, dict):
-            # Label the data with unique IDs.
-            _labeled_data = self._add_data_label()
             # Sample all the labeled data for model training and testing.
-            self.__perform_train_test_sampling(_labeled_data, frac, stratify_column,
+            self.__perform_train_test_sampling(self._labeled_data, frac, stratify_column,
                                                sample_id_column, sample_seed)
 
         elif isinstance(self.__model_trainer_input_data, dict):
@@ -1276,6 +1270,8 @@ class _BaseSearch:
             self.__perform_train_test_sampling(self.__model_trainer_input_data, frac,
                                                stratify_column, sample_id_column,
                                                sample_seed)
+        # Update model trainer function parameter grid.
+        self.__update_model_parameters()
 
         self.__eval_params = kwargs if self.__is_evaluatable else None
 
@@ -1287,11 +1283,13 @@ class _BaseSearch:
             self.__sampled_df_mapper = self._add_data_label("data")
             # Update model trainer function parameter grid.
             self.__update_model_parameters()
-
+
         # Initialize logging.
         if verbose > 0:
             self.__progress_bar = _ProgressBar(jobs=len(self._parameter_grid), verbose=verbose)
 
         if not run_parallel:
+            # Setting start time of Sequential execution.
+            self.__start_time = time.time() if self.__timeout is not None else None
            # TODO: Factorize the code once parallel execution part is completed in ELE-6154 JIRA.
            # Execute all parameters from populated parameter grid for both trainable
            # and non trainable function.
@@ -1302,8 +1300,8 @@ class _BaseSearch:
                 # trainer function.
                 if self.__early_stop is not None and self.__is_evaluatable:
                     if self.__is_finite and self._is_early_stoppable():
-                        # Terminate HPT execution when the trained model attains the
-                        # specified "__early_stop" value.
+                        # Terminate HPT execution when the trained model attains the
+                        # given "early_stop" value.
                         break
                     elif not self.__is_finite:
                         # Raise error because non-finite values cannot be compared
@@ -1316,6 +1314,10 @@ class _BaseSearch:
                             " when '{metric}' metric results inconsistent value.".format(
                                 metric=self.__evaluation_metric))
                         raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
+                if self.__timeout is not None and self._is_time_stoppable():
+                    # Terminate HPT execution when the execution time exceeds the
+                    # given time limit.
+                    break
 
         else:
             # TODO: Added support for early_stop feature along with concurrency in ELE-6154 JIRA.
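
Because this check runs between parameter combinations, max_time behaves as a soft deadline: a model that is already training runs to completion before the loop exits, which is consistent with Example 7 near the end of this diff, where run times slightly exceed max_time=30. A minimal sketch of the sequential pattern, with a hypothetical train_one() standing in for one model execution:

    >>> import time
    >>> def run_grid(parameter_grid, train_one, timeout=None):
    ...     start_time = time.time() if timeout is not None else None
    ...     for param in parameter_grid:
    ...         train_one(param)  # an in-flight model may overshoot the deadline
    ...         if timeout is not None and time.time() - start_time >= timeout:
    ...             break  # stop before starting the next model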
@@ -1328,9 +1330,13 @@ class _BaseSearch:
                 _temp_params["model_param"] = param
                 _temp_params.update(kwargs)
                 async_exec_params.append(_temp_params)
-
+
+            # Initialize the stopping event
+            self.__parallel_stop_event = threading.Event()
             # let's initialize "_AsyncDBExecutor".
             self._async_executor = _AsyncDBExecutor(wait=wait)
+            # Setting start time of Parallel execution.
+            self.__start_time = time.time() if self.__timeout is not None else None
             # Trigger parallel thread execution.
             self._async_executor.submit(self._execute_fit, *async_exec_params)
 
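The parallel path cannot use a simple break, so the hunks below coordinate through the threading.Event created here: a worker that finds the event already set records a "SKIP" and returns, and a worker that finishes when the early-stop or timeout condition holds sets the event for the others. A standalone sketch of that set/check protocol (worker() and deadline are illustrative, not teradataml's actual executor):

    >>> import threading, time
    >>> stop_event = threading.Event()
    >>> def worker(param, deadline):
    ...     if stop_event.is_set():
    ...         return "SKIP"     # mirrors the SKIP metadata status below
    ...     time.sleep(0.1)       # stand-in for model training
    ...     if time.time() >= deadline:
    ...         stop_event.set()  # signal the remaining workers to skip
    ...     return "PASS"
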
@@ -1377,15 +1383,24 @@ class _BaseSearch:
         EXAMPLES:
             >>> self.__model_trainer_routine(param=param, iter=iter, **kwargs)
         """
+
         # Define model name used for model metadata.
         model_name = self._generate_model_name(iter)
         # Get the unique data identifier present in "model_param".
         _data_id = model_param[self.__DATA_ID]
-        # Retrieve the train and test data using data identifier.
-        _train_data, _test_data = self.__sampled_df_mapper[_data_id]
         # 'param' variable holds model training parameters and train dataframe.
         # Get the model training parameters.
         param = model_param["param"]
+
+        # Check the stop_event set or not
+        if self.__parallel_stop_event is not None and self.__parallel_stop_event.is_set():
+            # Update the model metadata for Skip execution.
+            self.__update_model_metadata(model_name, param, "SKIP", 0, _data_id)
+            return
+
+        # Retrieve the train and test data using data identifier.
+        _train_data, _test_data = self.__sampled_df_mapper[_data_id]
+
         # Update model training argument with train DataFrame.
         param.update(_train_data)
         # Update the test DataFrame for model evaluation.
@@ -1418,6 +1433,7 @@ class _BaseSearch:
                 # Default evaluation metric is set to "MAE" for Regression models.
                 if self.__evaluation_metric is None:
                     self.__evaluation_metric = "MAE"
+
             else:
                 # ClassificationEvaluator results are stored under "output_data"
                 # attribute. "output_data" dataframe 'column 1' contains metrics
@@ -1431,11 +1447,21 @@ class _BaseSearch:
                 # classification models.
                 if self.__evaluation_metric is None:
                     self.__evaluation_metric = "ACCURACY"
+
             # Update the model metadata for successful model training.
-
             self.__update_model_metadata(model_name, param, "PASS",
                                          training_time, _data_id,
                                          columns, eval_values)
+
+            # Check whether self.__parallel_stop_event is None or not
+            if self.__parallel_stop_event is not None:
+                # SET the self.__parallel_stop_event
+                # When trained model evaluation metric value exceeds self.__early_stop
+                # or When execution time exceeds self.__timeout
+                if (self.__early_stop is not None and self._is_early_stoppable())\
+                    or (self.__timeout is not None and self._is_time_stoppable()):
+                    self.__parallel_stop_event.set()
+
         except Exception as _err_msg:
             # Record error message with corresponding "model_name".
             self.__model_err_records[model_name] = str(_err_msg)
@@ -1513,7 +1539,11 @@ class _BaseSearch:
         else:
             # Initialize param for non-model trainer functions.
             param = model_param
-
+            # Check the stop_event set or not
+            if self.__parallel_stop_event is not None and self.__parallel_stop_event.is_set():
+                # Update the model metadata for Skip execution.
+                self.__update_model_metadata(model_name, param, "SKIP", 0, _data_id)
+                return
         try:
             # Record starting time of model training.
             start_time = time.perf_counter()
@@ -1541,6 +1571,13 @@ class _BaseSearch:
             # Update the model metadata for failed execution.
             self.__update_model_metadata(model_name, param, "FAIL", training_time, _data_id)
             pass
+
+        if self.__parallel_stop_event is not None:
+            # SET the self.__parallel_stop_event
+            # When execution time exceeds self.__timeout
+            if self.__timeout is not None and self._is_time_stoppable():
+                self.__parallel_stop_event.set()
+
 
 
     def __update_model_metadata(self, model_name,
@@ -1573,6 +1610,7 @@ class _BaseSearch:
             Permitted Values:
                 * PASS: Function result present in the vantage.
                 * FAIL: Function execution failed for the chosen parameters.
+                * SKIP: Function execution skipped for the chosen parameters.
             Types: str
 
         data_id:
1622
1660
  model_metadata = {"MODEL_ID" : model_name,
1623
1661
  "PARAMETERS" : param,
1624
1662
  "STATUS" : status}
1625
-
1626
1663
  if self.__is_trainable:
1627
1664
  # Update "data_id" for model trainer functions.
1628
1665
  model_metadata[self.__DATA_ID.upper()] = data_id
@@ -1664,7 +1701,7 @@ class _BaseSearch:
             # training best model.
             self.__best_data_id = data_id
 
-        if not self.__progress_bar is None:
+        if not self.__progress_bar is None and status != 'SKIP':
             # Update progress bar when logging is required.
             self.__progress_bar.update(msg=_msg)
         # Update "__model_eval_records" with the formatted metadata.
@@ -2057,6 +2094,160 @@ class _BaseSearch:
 
         # Return list of dictionary containing all possible combinations.
         return [dict(param) for param in product(*param_pairs)]
+
+    def _data_mapping(self):
+        """
+        DESCRIPTION:
+            Internal function to create a Cartesian product of data mapped with input columns
+            and parameter grid.
+
+        PARAMETERS:
+            None
+
+        RETURNS:
+            None
+        """
+        # Get the input columns from the params.
+        input_columns = self.__params.pop("input_columns")
+        # Create a list of dictionaries with data_id and input_columns
+        data_mapping_list = []
+        # Iterate over the labeled data and create a list of dictionaries
+        for data_ids, data in self._labeled_data.items():
+            # Check if all input columns are present in the data
+            for input_cols in input_columns:
+                if all(col in data.columns for col in input_cols):
+                    data_mapping_list.append({'data_id': data_ids,
+                                              'input_columns': input_cols})
+
+        self._parameter_grid = self.__populate_parameter_grid()
+
+        cartesian_product = product(self._parameter_grid, data_mapping_list)
+
+        result_list = []
+
+        # Iterate over the Cartesian product and construct the desired dictionaries
+        for params, data_mapping in cartesian_product:
+            result_dict = {
+                'param': {**params, 'input_columns': data_mapping['input_columns']},
+                self.__DATA_ID: data_mapping['data_id']
+            }
+            result_list.append(result_dict)
+
+        self._parameter_grid = result_list
+
+
+    def _setting_model_trainer_data(self,
+                                    data=None):
+        """
+        DESCRIPTION:
+            Internal function to set the model trainer input data for model
+            training.
+
+        PARAMETERS:
+            data:
+                Optional Argument.
+                Specifies the input data used for model training.
+                Note:
+                    * "data" argument is a required argument for model trainer
+                      function when data argument is not passed with hyperparameters.
+                    * When data argument is passed with hyperparameters then
+                      "data" argument is optional.
+                Types: teradataml DataFrame
+
+        RETURNS:
+            None
+
+        Example:
+            >>> print(self.__model_trainer_input_data)
+            (    id  admitted       gpa  stats  programming  masters
+             0   19         0  0.051643    0.0          0.0      1.0
+             1    6         1  0.765258    0.5          0.0      1.0
+             2   15         1  1.000000    0.0          0.0      1.0
+             3   32         0  0.746479    0.0          0.5      1.0
+             4   12         1  0.835681    1.0          1.0      0.0
+             5   40         0  0.976526    1.0          0.5      1.0
+             6    7         1  0.215962    1.0          1.0      1.0
+             7   36         0  0.530516    0.0          1.0      0.0
+             8   28         1  0.967136    0.0          0.0      0.0
+             9   17         1  0.920188    0.0          0.0      0.0,
+                 id  admitted       gpa  stats  programming  masters
+             0    4         1  0.765258    0.5          1.0      1.0
+             1    6         1  0.765258    0.5          0.0      1.0
+             2    7         1  0.215962    1.0          1.0      1.0
+             3    8         1  0.812207    0.5          0.0      0.0
+             4   10         1  0.863850    0.0          0.0      0.0
+             5   11         1  0.591549    0.0          0.0      0.0
+             6    9         1  0.915493    0.0          0.0      0.0
+             7    5         0  0.737089    1.0          1.0      0.0
+             8    3         1  0.859155    1.0          0.5      0.0
+             9    2         0  0.887324    0.5          0.5      1.0,
+                 id  admitted       gpa  stats  programming  masters
+             0   23         1  0.807512    0.0          1.0      1.0
+             1   25         1  0.981221    0.0          0.0      0.0
+             2   26         1  0.798122    0.0          0.0      1.0
+             3   27         0  0.981221    0.0          0.0      1.0
+             4   29         0  1.000000    1.0          0.5      1.0
+             5   30         0  0.901408    0.0          1.0      1.0
+             6   28         1  0.967136    0.0          0.0      0.0
+             7   24         1  0.000000    0.0          1.0      0.0
+             8   22         0  0.746479    1.0          0.5      1.0
+             9   21         1  0.938967    1.0          0.5      0.0)
+
+            >>> print(self._labeled_data)
+            {'DF_0':     id  admitted       gpa  stats  programming  masters
+             0   26         1  0.798122    0.0          0.0      1.0
+             1   40         0  0.976526    1.0          0.5      1.0
+             2    7         1  0.215962    1.0          1.0      1.0
+             3   19         0  0.051643    0.0          0.0      1.0
+             4   15         1  1.000000    0.0          0.0      1.0
+             5   32         0  0.746479    0.0          0.5      1.0
+             6   38         1  0.366197    0.0          0.5      1.0
+             7   12         1  0.835681    1.0          1.0      0.0
+             8    6         1  0.765258    0.5          0.0      1.0
+             9   36         0  0.530516    0.0          1.0      0.0,
+             'DF_1':     id  admitted       gpa  stats  programming  masters
+             0    4         1  0.765258    0.5          1.0      1.0
+             1    6         1  0.765258    0.5          0.0      1.0
+             2    7         1  0.215962    1.0          1.0      1.0
+             3    8         1  0.812207    0.5          0.0      0.0
+             4   10         1  0.863850    0.0          0.0      0.0
+             5   11         1  0.591549    0.0          0.0      0.0
+             6    9         1  0.915493    0.0          0.0      0.0
+             7    5         0  0.737089    1.0          1.0      0.0
+             8    3         1  0.859155    1.0          0.5      0.0
+             9    2         0  0.887324    0.5          0.5      1.0,
+             'DF_2':     id  admitted       gpa  stats  programming  masters
+             0   23         1  0.807512    0.0          1.0      1.0
+             1   25         1  0.981221    0.0          0.0      0.0
+             2   26         1  0.798122    0.0          0.0      1.0
+             3   27         0  0.981221    0.0          0.0      1.0
+             4   29         0  1.000000    1.0          0.5      1.0
+             5   30         0  0.901408    0.0          1.0      1.0
+             6   28         1  0.967136    0.0          0.0      0.0
+             7   24         1  0.000000    0.0          1.0      0.0
+             8   22         0  0.746479    1.0          0.5      1.0
+             9   21         1  0.938967    1.0          0.5      0.0}
+        """
+        if self.__is_trainable:
+            # "data" argument is a required argument for model trainer function
+            # when data argument is not passed with hyperparameters. On other side,
+            # "data" argument will be optional argument when data argument
+            # is passed with hyperparameters.
+            _is_optional_arg = self.__model_trainer_input_data is not None
+            # validate the model trainer function 'data' argument.
+            self.__validate_model_trainer_input_data_argument(data, _is_optional_arg)
+
+            if not data is None:
+                # '__model_trainer_input_data' is assigned with "data" argument,
+                # when user passes data argument in fit() method.
+                # Note: if user attempts to pass data argument in both "params"
+                #       argument as hyperparameters or "data" argument in fit()
+                #       method, then latest "data" argument value is considered
+                #       for model training.
+                self.__model_trainer_input_data = data
+
+        if self.__is_trainable and self.__is_evaluatable and self.__is_sqle_function:
+            self._labeled_data = self._add_data_label()
 
 
 class GridSearch(_BaseSearch):
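
The new _data_mapping above pairs every parameter combination with every compatible labeled dataset via itertools.product. A standalone sketch of the resulting grid shape (the parameter names and data labels here are illustrative):

    >>> from itertools import product
    >>> parameter_grid = [{"max_depth": 5}, {"max_depth": 10}]
    >>> data_mapping_list = [{"data_id": "DF_0", "input_columns": ["gpa", "stats"]},
    ...                      {"data_id": "DF_1", "input_columns": ["gpa"]}]
    >>> grid = [{"param": {**params, "input_columns": m["input_columns"]},
    ...          "data_id": m["data_id"]}
    ...         for params, m in product(parameter_grid, data_mapping_list)]
    >>> len(grid)  # 2 parameter combinations x 2 data mappings
    4
    >>> grid[0]
    {'param': {'max_depth': 5, 'input_columns': ['gpa', 'stats']}, 'data_id': 'DF_0'}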
@@ -2659,9 +2850,8 @@ class GridSearch(_BaseSearch):
 
         """
 
-        self.__params = params
+        self.__params = params.copy()
         super().__init__(func=func, params=self.__params)
-
         # Populate parameter grid from provided parameter space.
         self.__populate_params_grid()
 
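Switching to params.copy() presumably guards against mutation leaking back to the caller: _data_mapping pops "input_columns" out of the stored dictionary, and without a copy that pop would also empty the user's own params dict. A plain-Python illustration of the aliasing hazard:

    >>> original = {"max_depth": (5, 10, 15), "input_columns": ["gpa"]}
    >>> stored = original  # alias: both names refer to one dict
    >>> stored.pop("input_columns")
    ['gpa']
    >>> "input_columns" in original  # the caller's dict was mutated too
    False
    >>> original = {"max_depth": (5, 10, 15), "input_columns": ["gpa"]}
    >>> stored = original.copy()  # a shallow copy shields the caller
    >>> stored.pop("input_columns")
    ['gpa']
    >>> "input_columns" in original
    True
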
@@ -2688,87 +2878,381 @@ class GridSearch(_BaseSearch):
2688
2878
  # Since GridSearch works on all parameter combinations. Set
2689
2879
  # all the parameter combinations to the parameter grid.
2690
2880
  self._parameter_grid = self._BaseSearch__populate_parameter_grid()
2881
+
2691
2882
 
2692
-
2693
- class RandomSearch(_BaseSearch):
2694
- def __init__(self, func, params, n_iter=10):
2883
+ def fit(self,
2884
+ data=None,
2885
+ evaluation_metric=None,
2886
+ early_stop=None,
2887
+ frac=0.8,
2888
+ run_parallel=True,
2889
+ wait=True,
2890
+ verbose=0,
2891
+ stratify_column=None,
2892
+ sample_id_column=None,
2893
+ sample_seed=None,
2894
+ max_time=None,
2895
+ **kwargs):
2695
2896
  """
2696
2897
  DESCRIPTION:
2697
- RandomSearch algorithm performs random sampling on hyperparameter
2698
- space to identify optimal hyperparameters. It works for
2699
- teradataml analytic functions from SQLE, BYOM, VAL and UAF features.
2700
- teradataml RandomSearch allows user to perform hyperparameter tuning for
2701
- all model trainer and non-model trainer functions.
2702
- When used for model trainer functions:
2703
- * Based on evaluation metrics search determines best model.
2704
- * All methods and properties can be used.
2705
- When used for non-model trainer functions:
2706
- * Only fit() method is supported.
2707
- * User can choose the best output as they see fit to use this.
2708
-
2709
- teradataml RandomSearch also allows user to use input data as the
2710
- hyperparameter. This option can be suitable when the user wants to
2711
- identify the best models for a set of input data. When user passes
2712
- set of data as hyperparameter for model trainer function, the search
2713
- determines the best data along with the best model based on the
2714
- evaluation metrics.
2898
+ Function to perform hyperparameter tuning using GridSearch algorithm.
2899
+ Notes:
2900
+ * In the Model trainer function, the best parameters are
2901
+ selected based on training results.
2902
+ * In the Non model trainer function, First execution parameter
2903
+ set is selected as the best parameters.
2715
2904
 
2716
2905
  PARAMETERS:
2717
- func:
2718
- Required Argument.
2719
- Specifies a teradataml analytic function from SQLE, VAL, and UAF.
2720
- Types:
2721
- teradataml Analytic Functions
2722
- * Advanced analytic functions
2723
- * UAF
2724
- * VAL
2725
- Refer to display_analytic_functions() function for list of functions.
2906
+ data:
2907
+ Optional Argument.
2908
+ Specifies the input teradataml DataFrame for model trainer function.
2909
+ Notes:
2910
+ * DataFrame need not to be passed in fit() methods, when "data" is
2911
+ passed as a model hyperparameters ("params").
2912
+ * "data" is a required argument for model trainer functions.
2913
+ * "data" is ignored for non-model trainer functions.
2914
+ * "data" can be contain single DataFrame or multiple DataFrame.
2915
+ * One can pass multiple dataframes to "data". Hyperparameter
2916
+ tuning is performed on all the dataframes for every model
2917
+ parameter.
2918
+ * "data" can be either a dictionary OR a tuple OR a dataframe.
2919
+ * If it is a dictionary then Key represents the label for
2920
+ dataframe and Value represents the dataframe.
2921
+ * If it is a tuple then teradataml converts it to dictionary
2922
+ by generating the labels internally.
2923
+ * If it is a dataframe then teradataml label it as "DF_0".
2924
+ Types: teradataml DataFrame, dictionary, tuples
2726
2925
 
2727
- params:
2728
- Required Argument.
2729
- Specifies the parameter(s) of a teradataml analytic function.
2730
- The parameter(s) must be in dictionary. keys refers to the
2731
- argument names and values refers to argument values for corresponding
2732
- arguments.
2926
+ evaluation_metric:
2927
+ Optional Argument.
2928
+ Specifies the evaluation metrics to considered for model
2929
+ evaluation.
2733
2930
  Notes:
2734
- * One can specify the argument value in a tuple to run HPT
2735
- with different arguments.
2736
- * Model trainer function arguments "id_column", "input_columns",
2737
- and "target_columns" must be passed in fit() method.
2738
- * All required arguments of non-model trainer function must be
2739
- passed while RandomSearch object creation.
2740
- Types: dict
2741
-
2742
- n_iter:
2931
+ * evaluation_metric applicable for model trainer functions.
2932
+ * Best model is not selected when evaluation returns
2933
+ non-finite values.
2934
+ Permitted Values:
2935
+ * Classification: Accuracy, Micro-Precision, Micro-Recall,
2936
+ Micro-F1, Macro-Precision, Macro-Recall,
2937
+ Macro-F1, Weighted-Precision,
2938
+ Weighted-Recall,
2939
+ Weighted-F1.
2940
+ * Regression: MAE, MSE, MSLE, MAPE, MPE, RMSE, RMSLE, ME,
2941
+ R2, EV, MPD, MGD
2942
+
2943
+ Default Value:
2944
+ * Classification: Accuracy
2945
+ * Regression: MAE
2946
+ Types: str
2947
+
2948
+ early_stop:
2743
2949
  Optional Argument.
2744
- Specifies the number of iterations random search need to be performed.
2950
+ Specifies the early stop mechanism value for model trainer
2951
+ functions. Hyperparameter tuning ends model training when
2952
+ the training model evaluation metric attains "early_stop" value.
2745
2953
  Note:
2746
- * n_iter must be less than the size of parameter populations.
2747
- Default Value: 10
2748
- Types: int
2749
-
2750
- RETURNS:
2751
- None
2954
+ * Early stopping supports only when evaluation returns
2955
+ finite value.
2956
+ Types: int or float
2752
2957
 
2753
- RAISES:
2754
- TeradataMlException, TypeError, ValueError
2755
-
2756
- EXAMPLES:
2757
- >>> # Example 1: Model trainer function. Performing hyperparameter-tuning
2758
- >>> # on SVM model trainer function using random search algorithm.
2759
-
2760
- >>> # Load the example data.
2761
- >>> load_example_data("teradataml", ["cal_housing_ex_raw"])
2762
-
2763
- >>> # Create teradataml DataFrame objects.
2764
- >>> data_input = DataFrame.from_table("cal_housing_ex_raw")
2958
+ frac:
2959
+ Optional Argument.
2960
+ Specifies the split percentage of rows to be sampled for training
2961
+ and testing dataset. "frac" argument value must range between (0, 1).
2962
+ Notes:
2963
+ * This "frac" argument is not supported for non-model trainer
2964
+ function.
2965
+ * The "frac" value is considered as train split percentage and
2966
+ The remaining percentage is taken into account for test splitting.
2967
+ Default Value: 0.8
2968
+ Types: float
2765
2969
 
2766
- >>> # Scale "target_columns" with respect to 'STD' value of the column.
2767
- >>> fit_obj = ScaleFit(data=data_input,
2768
- target_columns=['MedInc', 'HouseAge', 'AveRooms',
2769
- 'AveBedrms', 'Population', 'AveOccup',
2770
- 'Latitude', 'Longitude'],
2771
- scale_method="STD")
2970
+ run_parallel:
2971
+ Optional Argument.
2972
+ Specifies the parallel execution functionality of hyperparameter
2973
+ tuning. When "run_parallel" set to true, model functions are
2974
+ executed concurrently. Otherwise, model functions are executed
2975
+ sequentially.
2976
+ Default Value: True
2977
+ Types: bool
2978
+
2979
+ wait:
2980
+ Optional Argument.
2981
+ Specifies whether to wait for the completion of execution
2982
+ of hyperparameter tuning or not. When set to False, hyperparameter
2983
+ tuning is executed in the background and user can use "is_running()"
2984
+ method to check the status. Otherwise it waits until the execution
2985
+ is complete to return the control back to user.
2986
+ Default Value: True
2987
+ Type: bool
2988
+
2989
+ verbose:
2990
+ Optional Argument.
2991
+ Specifies whether to log the model training information and display
2992
+ the logs. When it is set to 1, progress bar alone logged in the
2993
+ console. When it is set to 2, along with progress bar, execution
2994
+ steps and execution time is logged in the console. When it is set
2995
+ to 0, nothing is logged in the console.
2996
+ Note:
2997
+ * verbose is not significant when "wait" is 'False'.
2998
+ Default Value: 0
2999
+ Type: bool
3000
+
3001
+ sample_seed:
3002
+ Optional Argument.
3003
+ Specifies the seed value that controls the shuffling applied
3004
+ to the data before applying the Train-Test split. Pass an int for
3005
+ reproducible output across multiple function calls.
3006
+ Notes:
3007
+ * When the argument is not specified, different
3008
+ runs of the query generate different outputs.
3009
+ * It must be in the range [0, 2147483647]
3010
+ * Seed is supported for stratify column.
3011
+ Types: int
3012
+
3013
+ stratify_column:
3014
+ Optional Argument.
3015
+ Specifies column name that contains the labels indicating
3016
+ which data needs to be stratified for TrainTest split.
3017
+ Notes:
3018
+ * seed is supported for stratify column.
3019
+ Types: str
3020
+
3021
+ sample_id_column:
3022
+ Optional Argument.
3023
+ Specifies the input data column name that has the
3024
+ unique identifier for each row in the input.
3025
+ Note:
3026
+ * Mandatory when "sample_seed" argument is present.
3027
+ Types: str
3028
+
3029
+ max_time:
3030
+ Optional Argument.
3031
+ Specifies the maximum time for the completion of Hyperparameter tuning execution.
3032
+ Default Value: None
3033
+ Types: int or float
3034
+
3035
+ kwargs:
3036
+ Optional Argument.
3037
+ Specifies the keyword arguments. Accepts additional arguments
3038
+ required for the teradataml analytic function.
3039
+
3040
+ RETURNS:
3041
+ None
3042
+
3043
+ RAISES:
3044
+ TeradataMlException, TypeError, ValueError
3045
+
3046
+ EXAMPLES:
+            >>> # Create an instance of the GridSearch algorithm called "optimizer_obj".
+            >>> optimizer_obj = GridSearch(func=SVM, params=params)
+
+            >>> eval_params = {"id_column": "id",
+                               "accumulate": "MedHouseVal"}
+            >>> # Example 1: Passing a single DataFrame for the model trainer function.
+            >>> optimizer_obj.fit(data=train_df,
+                                  evaluation_metric="MAE",
+                                  early_stop=70.9,
+                                  **eval_params)
+
+            >>> # Example 2: Passing multiple datasets as a tuple of DataFrames
+            >>> # for the model trainer function.
+            >>> optimizer_obj.fit(data=(train_df_1, train_df_2),
+                                  evaluation_metric="MAE",
+                                  early_stop=70.9,
+                                  **eval_params)
+
+            >>> # Example 3: Passing multiple datasets as a dictionary of DataFrames
+            >>> # for the model trainer function.
+            >>> optimizer_obj.fit(data={"Data-1":train_df_1, "Data-2":train_df_2},
+                                  evaluation_metric="MAE",
+                                  early_stop=70.9,
+                                  **eval_params)
+
+            >>> # Example 4: No "data" argument passed in the fit() method for the
+            >>> # model trainer function.
+            >>> # Note: The "data" argument must be passed as a model hyperparameter
+            >>> # ("params") while creating the HPT object.
+
+            >>> # Define the parameter space for model training with the "data" argument.
+            >>> params = {"data":(df1, df2),
+                          "input_columns":['MedInc', 'HouseAge', 'AveRooms',
+                                           'AveBedrms', 'Population', 'AveOccup',
+                                           'Latitude', 'Longitude'],
+                          "response_column":"MedHouseVal",
+                          "model_type":"regression",
+                          "batch_size":(11, 50, 75),
+                          "iter_max":(100, 301),
+                          "intercept":False,
+                          "learning_rate":"INVTIME",
+                          "nesterov_optimization":True,
+                          "local_sgd_iterations":1}
+
+            >>> # Create "optimizer_obj" using the GridSearch algorithm and call the
+            >>> # fit() method without any "data" argument for the model trainer function.
+            >>> optimizer_obj.fit(evaluation_metric="MAE",
+                                  early_stop=70.9,
+                                  **eval_params)
+
+            >>> # Example 5: Do not pass the "data" argument in the fit() method for a
+            >>> # non-model trainer function.
+            >>> # Note: The "data" argument must be passed as a model hyperparameter
+            >>> # ("params") while creating the HPT object.
+            >>> optimizer_obj.fit()
+
+            >>> # Example 6: Passing "verbose" argument value '1' in the fit() method
+            >>> # to display the model log.
+            >>> optimizer_obj.fit(data=train_df, evaluation_metric="R2",
+                                  verbose=1, **eval_params)
+            completed: |████████████████████████████████████████████████████████████| 100% - 6/6
+
+            >>> # Example 7: The "max_time" argument is passed in the fit() method.
+            >>> # Model training parameters.
+            >>> model_params = {"input_columns":['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
+            ...                 "response_column":'species',
+            ...                 "max_depth":(5,10,15),
+            ...                 "lambda1":(1000.0,0.001),
+            ...                 "model_type":"Classification",
+            ...                 "seed":32,
+            ...                 "shrinkage_factor":0.1,
+            ...                 "iter_num":(5, 50)}
+            >>>
+            >>> eval_params = {"id_column": "id",
+            ...                "accumulate":"species",
+            ...                "model_type":'Classification',
+            ...                "object_order_column":['task_index', 'tree_num', 'iter', 'class_num', 'tree_order']
+            ...                }
+            >>>
+            >>> # Import the model trainer function and optimizer.
+            >>> from teradataml import XGBoost, GridSearch
+            >>>
+            >>> # Initialize the GridSearch optimizer with the model trainer
+            >>> # function and the parameter space required for model training.
+            >>> gs_obj = GridSearch(func=XGBoost, params=model_params)
+            >>>
+            >>> # fit() method with the "max_time" argument (in seconds) for the model trainer function.
+            >>> gs_obj.fit(data=data, max_time=30, verbose=2, **eval_params)
+            Model_id:XGBOOST_2 - Run time:33.277s - Status:PASS - ACCURACY:0.933
+            Model_id:XGBOOST_3 - Run time:33.276s - Status:PASS - ACCURACY:0.933
+            Model_id:XGBOOST_0 - Run time:33.279s - Status:PASS - ACCURACY:0.967
+            Model_id:XGBOOST_1 - Run time:33.278s - Status:PASS - ACCURACY:0.933
+            Computing: |⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾| 33% - 4/12
+            >>>
+            >>> # Status is 'SKIP' for the models that did not complete within "max_time".
+            >>> gs_obj.models
+                  MODEL_ID DATA_ID                                         PARAMETERS STATUS  ACCURACY
+            0    XGBOOST_2    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.933333
+            1    XGBOOST_4    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
+            2    XGBOOST_5    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
+            3    XGBOOST_6    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
+            4    XGBOOST_7    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
+            5    XGBOOST_8    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
+            6    XGBOOST_9    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
+            7   XGBOOST_10    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
+            8   XGBOOST_11    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
+            9    XGBOOST_3    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.933333
+            10   XGBOOST_0    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.966667
+            11   XGBOOST_1    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.933333
+        """
+
+        # Set the flag to discard invalid column parameters.
+        self.discard_invalid_column_params = kwargs.get("discard_invalid_column_params", False)
+
+        if self.discard_invalid_column_params:
+            # Set the model trainer input data.
+            super()._setting_model_trainer_data(data)
+            # Map the data for the model trainer function.
+            super()._data_mapping()
+            # Replace the hooks with no-op lambdas.
+            self._setting_model_trainer_data = lambda data: None
+            self._BaseSearch__update_model_parameters = lambda: None
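+            # Note: with the no-op lambdas in place, the base class fit()
+            # below does not repeat the data setup and parameter update
+            # already performed here.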
+
+        # Call the _BaseSearch class fit() method.
+        super().fit(data, evaluation_metric,
+                    early_stop, frac, run_parallel,
+                    wait, verbose, stratify_column,
+                    sample_id_column, sample_seed,
+                    max_time, **kwargs)
+
+
+class RandomSearch(_BaseSearch):
+    def __init__(self, func, params, n_iter=10, **kwargs):
+        """
+        DESCRIPTION:
+            RandomSearch algorithm performs random sampling on the
+            hyperparameter space to identify optimal hyperparameters. It
+            works with teradataml analytic functions from the SQLE, BYOM,
+            VAL, and UAF features.
+            teradataml RandomSearch allows the user to perform hyperparameter
+            tuning for all model trainer and non-model trainer functions.
+            When used for model trainer functions:
+                * Based on evaluation metrics, the search determines the
+                  best model.
+                * All methods and properties can be used.
+            When used for non-model trainer functions:
+                * Only the fit() method is supported.
+                * The user can choose the best output as they see fit.
+
+            teradataml RandomSearch also allows the user to use input data
+            as a hyperparameter. This option is suitable when the user wants
+            to identify the best models for a set of input data. When the
+            user passes a set of data as a hyperparameter for a model
+            trainer function, the search determines the best data along with
+            the best model based on the evaluation metrics.
+
+        PARAMETERS:
+            func:
+                Required Argument.
+                Specifies a teradataml analytic function from SQLE, VAL, and UAF.
+                Types:
+                    teradataml Analytic Functions
+                        * Advanced analytic functions
+                        * UAF
+                        * VAL
+                Refer to the display_analytic_functions() function for the
+                list of functions.
+
+            params:
+                Required Argument.
+                Specifies the parameter(s) of a teradataml analytic function.
+                The parameter(s) must be in a dictionary; keys refer to the
+                argument names and values refer to the argument values for
+                the corresponding arguments.
+                Notes:
+                    * One can specify an argument value in a tuple to run HPT
+                      with different arguments.
+                    * Model trainer function arguments "id_column",
+                      "input_columns", and "target_columns" must be passed in
+                      the fit() method.
+                    * All required arguments of a non-model trainer function
+                      must be passed during RandomSearch object creation.
+                Types: dict
+
+            n_iter:
+                Optional Argument.
+                Specifies the number of random search iterations to be
+                performed.
+                Note:
+                    * "n_iter" must not exceed the size of the parameter
+                      population.
+                Default Value: 10
+                Types: int
+
+        RETURNS:
+            None
+
+        RAISES:
+            TeradataMlException, TypeError, ValueError
+
+        EXAMPLES:
+            >>> # Example 1: Model trainer function. Perform hyperparameter
+            >>> # tuning on the SVM model trainer function using the random
+            >>> # search algorithm.
+
+            >>> # Load the example data.
+            >>> load_example_data("teradataml", ["cal_housing_ex_raw"])
+
+            >>> # Create teradataml DataFrame objects.
+            >>> data_input = DataFrame.from_table("cal_housing_ex_raw")
+
+            >>> # Scale "target_columns" with respect to the 'STD' value of the column.
+            >>> fit_obj = ScaleFit(data=data_input,
+                                   target_columns=['MedInc', 'HouseAge', 'AveRooms',
+                                                   'AveBedrms', 'Population', 'AveOccup',
+                                                   'Latitude', 'Longitude'],
+                                   scale_method="STD")
 
             >>> # Transform the data.
             >>> transform_obj = ScaleTransform(data=data_input,
@@ -2953,7 +3437,7 @@ class RandomSearch(_BaseSearch):
 
         """
 
-        self.__params = params
+        self.__params = params.copy()
         super().__init__(func=func, params=self.__params)
         # Validate argument 'n_iter'
         awu_matrix = []
@@ -2964,10 +3448,9 @@ class RandomSearch(_BaseSearch):
         # Validates the range of n_iter should be greater than or equal to 1 and
         # less than or equal to parameter space.
         _Validators._validate_argument_range(n_iter, "n_iter", 1, len(parameter_space), True, True)
+        self._n_iter = n_iter
 
-        self.__populate_params_grid(n_iter, parameter_space)
-
-    def __populate_params_grid(self, n_iter, parameter_space):
+    def __populate_params_grid(self):
         """
         DESCRIPTION:
             Populate parameter grid based on the search algorithm. In random search,
@@ -2988,6 +3471,313 @@ class RandomSearch(_BaseSearch):
         EXAMPLES:
             >>> self.__populate_params_grid()
         """
-
         # Populate the parameter space with random and non-repetitive value
-        self._parameter_grid = random.sample(parameter_space, n_iter)
+        if self.discard_invalid_column_params:
+            # Define an empty data_grouped_dict to group the parameters based on data_id.
+            data_grouped_dict = defaultdict(list)
+            for parameter in self._parameter_grid:
+                # Extract the data_id from the parameter.
+                data_id = parameter['data_id']
+                # Group the parameters based on data_id.
+                data_grouped_dict[data_id].append(parameter)
+            # Convert the grouped dictionary to a list of groups.
+            data_grouped_dict = list(data_grouped_dict.values())
+            parameter_grid = []
+            for group in data_grouped_dict:
+                # Randomly select n_iter parameters from each group.
+                tmp = random.sample(group, self._n_iter)
+                parameter_grid.extend(tmp)
+
+            # Set the parameter grid.
+            self._parameter_grid = parameter_grid
+        else:
+            self._parameter_grid = random.sample(self.get_parameter_grid(), self._n_iter)
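+        # Note: with "discard_invalid_column_params" set, sampling is applied
+        # once per "data_id" group above, so the grid keeps n_iter candidate
+        # parameter sets for each input dataset rather than n_iter overall.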
+
+    def fit(self,
+            data=None,
+            evaluation_metric=None,
+            early_stop=None,
+            frac=0.8,
+            run_parallel=True,
+            wait=True,
+            verbose=0,
+            stratify_column=None,
+            sample_id_column=None,
+            sample_seed=None,
+            max_time=None,
+            **kwargs):
+        """
+        DESCRIPTION:
+            Function to perform hyperparameter tuning using the RandomSearch
+            algorithm.
+            Notes:
+                * For model trainer functions, the best parameters are
+                  selected based on training results.
+                * For non-model trainer functions, the first executed
+                  parameter set is selected as the best parameters.
+
+        PARAMETERS:
+            data:
+                Optional Argument.
+                Specifies the input teradataml DataFrame for the model
+                trainer function.
+                Notes:
+                    * A DataFrame need not be passed in the fit() method
+                      when "data" is passed as a model hyperparameter
+                      ("params").
+                    * "data" is a required argument for model trainer
+                      functions.
+                    * "data" is ignored for non-model trainer functions.
+                    * "data" can contain a single DataFrame or multiple
+                      DataFrames.
+                    * One can pass multiple DataFrames to "data".
+                      Hyperparameter tuning is performed on all the
+                      DataFrames for every model parameter.
+                    * "data" can be either a dictionary, a tuple, or a
+                      DataFrame.
+                        * If it is a dictionary, then the key represents
+                          the label for the DataFrame and the value
+                          represents the DataFrame.
+                        * If it is a tuple, then teradataml converts it to
+                          a dictionary by generating the labels internally.
+                        * If it is a DataFrame, then teradataml labels it
+                          as "DF_0".
+                Types: teradataml DataFrame, dictionary, tuple
+
+            evaluation_metric:
+                Optional Argument.
+                Specifies the evaluation metric to be considered for model
+                evaluation.
+                Notes:
+                    * "evaluation_metric" is applicable only to model
+                      trainer functions.
+                    * The best model is not selected when evaluation returns
+                      non-finite values.
+                Permitted Values:
+                    * Classification: Accuracy, Micro-Precision, Micro-Recall,
+                                      Micro-F1, Macro-Precision, Macro-Recall,
+                                      Macro-F1, Weighted-Precision,
+                                      Weighted-Recall, Weighted-F1
+                    * Regression: MAE, MSE, MSLE, MAPE, MPE, RMSE, RMSLE, ME,
+                                  R2, EV, MPD, MGD
+
+                Default Value:
+                    * Classification: Accuracy
+                    * Regression: MAE
+                Types: str
+
+            early_stop:
+                Optional Argument.
+                Specifies the early stop value for model trainer functions.
+                Hyperparameter tuning ends model training when the training
+                model's evaluation metric attains the "early_stop" value.
+                Note:
+                    * Early stopping is supported only when evaluation
+                      returns a finite value.
+                Types: int or float
+
+            frac:
+                Optional Argument.
+                Specifies the split percentage of rows to be sampled for the
+                training and testing datasets. The "frac" argument value
+                must range between (0, 1).
+                Notes:
+                    * The "frac" argument is not supported for non-model
+                      trainer functions.
+                    * The "frac" value is considered as the train split
+                      percentage; the remaining percentage is used for the
+                      test split.
+                Default Value: 0.8
+                Types: float
+
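+                A minimal sketch of a custom split (an assumption for
+                illustration; 0.75 keeps 75% of the rows for training and
+                25% for testing, with "rs_obj", "train_df" and "eval_params"
+                as in the examples below):
+                >>> rs_obj.fit(data=train_df, frac=0.75, **eval_params)
+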
+            run_parallel:
+                Optional Argument.
+                Specifies whether to run hyperparameter tuning in parallel.
+                When "run_parallel" is set to True, model functions are
+                executed concurrently. Otherwise, model functions are
+                executed sequentially.
+                Default Value: True
+                Types: bool
+
+            wait:
+                Optional Argument.
+                Specifies whether to wait for the completion of the
+                hyperparameter tuning execution. When set to False,
+                hyperparameter tuning is executed in the background, and
+                the "is_running()" method can be used to check the status.
+                Otherwise, it waits until the execution is complete before
+                returning control to the user.
+                Default Value: True
+                Types: bool
+
+            verbose:
+                Optional Argument.
+                Specifies whether to log the model training information and
+                display the logs. When set to 1, only the progress bar is
+                logged in the console. When set to 2, the execution steps
+                and execution time are logged in the console along with the
+                progress bar. When set to 0, nothing is logged in the
+                console.
+                Note:
+                    * "verbose" is not significant when "wait" is 'False'.
+                Default Value: 0
+                Types: int
+
+            sample_seed:
+                Optional Argument.
+                Specifies the seed value that controls the shuffling applied
+                to the data before the Train-Test split. Pass an int for
+                reproducible output across multiple function calls.
+                Notes:
+                    * When the argument is not specified, different runs of
+                      the query generate different outputs.
+                    * It must be in the range [0, 2147483647].
+                    * Seed is supported for the stratify column.
+                Types: int
+
+            stratify_column:
+                Optional Argument.
+                Specifies the column name that contains the labels indicating
+                which data needs to be stratified for the Train-Test split.
+                Note:
+                    * Seed is supported for the stratify column.
+                Types: str
+
+            sample_id_column:
+                Optional Argument.
+                Specifies the input data column name that has the unique
+                identifier for each row in the input.
+                Note:
+                    * Mandatory when the "sample_seed" argument is present.
+                Types: str
+
+            max_time:
+                Optional Argument.
+                Specifies the maximum time, in seconds, for the completion
+                of the hyperparameter tuning execution.
+                Default Value: None
+                Types: int or float
+
+            kwargs:
+                Optional Argument.
+                Specifies the keyword arguments. Accepts additional arguments
+                required for the teradataml analytic function.
+
+        RETURNS:
+            None
+
+        RAISES:
+            TeradataMlException, TypeError, ValueError
+
+        EXAMPLES:
+            >>> # Create an instance of the RandomSearch algorithm called "optimizer_obj".
+            >>> optimizer_obj = RandomSearch(func=SVM, params=params, n_iter=3)
+
+            >>> eval_params = {"id_column": "id",
+                               "accumulate": "MedHouseVal"}
+            >>> # Example 1: Passing a single DataFrame for the model trainer function.
+            >>> optimizer_obj.fit(data=train_df,
+                                  evaluation_metric="MAE",
+                                  early_stop=70.9,
+                                  **eval_params)
+
+            >>> # Example 2: Passing multiple datasets as a tuple of DataFrames
+            >>> # for the model trainer function.
+            >>> optimizer_obj.fit(data=(train_df_1, train_df_2),
+                                  evaluation_metric="MAE",
+                                  early_stop=70.9,
+                                  **eval_params)
+
+            >>> # Example 3: Passing multiple datasets as a dictionary of DataFrames
+            >>> # for the model trainer function.
+            >>> optimizer_obj.fit(data={"Data-1":train_df_1, "Data-2":train_df_2},
+                                  evaluation_metric="MAE",
+                                  early_stop=70.9,
+                                  **eval_params)
+
+            >>> # Example 4: No "data" argument passed in the fit() method for the
+            >>> # model trainer function.
+            >>> # Note: The "data" argument must be passed as a model hyperparameter
+            >>> # ("params") while creating the HPT object.
+
+            >>> # Define the parameter space for model training with the "data" argument.
+            >>> params = {"data":(df1, df2),
+                          "input_columns":['MedInc', 'HouseAge', 'AveRooms',
+                                           'AveBedrms', 'Population', 'AveOccup',
+                                           'Latitude', 'Longitude'],
+                          "response_column":"MedHouseVal",
+                          "model_type":"regression",
+                          "batch_size":(11, 50, 75),
+                          "iter_max":(100, 301),
+                          "intercept":False,
+                          "learning_rate":"INVTIME",
+                          "nesterov_optimization":True,
+                          "local_sgd_iterations":1}
+
+            >>> # Create "optimizer_obj" using the RandomSearch algorithm and call the
+            >>> # fit() method without any "data" argument for the model trainer function.
+            >>> optimizer_obj.fit(evaluation_metric="MAE",
+                                  early_stop=70.9,
+                                  **eval_params)
+
+            >>> # Example 5: Do not pass the "data" argument in the fit() method for a
+            >>> # non-model trainer function.
+            >>> # Note: The "data" argument must be passed as a model hyperparameter
+            >>> # ("params") while creating the HPT object.
+            >>> optimizer_obj.fit()
+
+            >>> # Example 6: Passing "verbose" argument value '1' in the fit() method
+            >>> # to display the model log.
+            >>> optimizer_obj.fit(data=train_df, evaluation_metric="R2",
+                                  verbose=1, **eval_params)
+            completed: |████████████████████████████████████████████████████████████| 100% - 6/6
+
+            >>> # Example 7: The "max_time" argument is passed in the fit() method.
+            >>> # Model training parameters.
+            >>> model_params = {"input_columns":['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
+            ...                 "response_column":'species',
+            ...                 "max_depth":(5,10,15),
+            ...                 "lambda1":(1000.0,0.001),
+            ...                 "model_type":"Classification",
+            ...                 "seed":32,
+            ...                 "shrinkage_factor":0.1,
+            ...                 "iter_num":(5, 50)}
+            >>>
+            >>> eval_params = {"id_column": "id",
+            ...                "accumulate": "species",
+            ...                "model_type":'Classification',
+            ...                "object_order_column":['task_index', 'tree_num', 'iter', 'class_num', 'tree_order']
+            ...                }
+            >>>
+            >>> # Import the model trainer function and optimizer.
+            >>> from teradataml import XGBoost, RandomSearch
+            >>>
+            >>> # Initialize the RandomSearch optimizer with the model trainer
+            >>> # function and the parameter space required for model training.
+            >>> rs_obj = RandomSearch(func=XGBoost, params=model_params, n_iter=5)
+            >>>
+            >>> # fit() method with the "max_time" argument (in seconds) for the model trainer function.
+            >>> rs_obj.fit(data=data, max_time=30, verbose=2, **eval_params)
+            Model_id:XGBOOST_3 - Run time:28.292s - Status:PASS - ACCURACY:0.8
+            Model_id:XGBOOST_0 - Run time:28.291s - Status:PASS - ACCURACY:0.867
+            Model_id:XGBOOST_2 - Run time:28.289s - Status:PASS - ACCURACY:0.867
+            Model_id:XGBOOST_1 - Run time:28.291s - Status:PASS - ACCURACY:0.867
+            Computing: |⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫿⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾⫾| 80% - 4/5
+            >>>
+            >>> # Status is 'SKIP' for the models that did not complete within "max_time".
+            >>> rs_obj.models
+                MODEL_ID DATA_ID                                         PARAMETERS STATUS  ACCURACY
+            0  XGBOOST_3    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.800000
+            1  XGBOOST_4    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   SKIP       NaN
+            2  XGBOOST_0    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.866667
+            3  XGBOOST_2    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.866667
+            4  XGBOOST_1    DF_0  {'input_columns': ['sepal_length', 'sepal_widt...   PASS  0.866667
+        """
+
+        # Set the discard_invalid_column_params flag.
+        self.discard_invalid_column_params = kwargs.get("discard_invalid_column_params", False)
+
+        if self.discard_invalid_column_params:
+            # Set the model trainer input data.
+            super()._setting_model_trainer_data(data)
+            # Map the data with the input columns.
+            super()._data_mapping()
+            # Replace the hooks with no-op lambdas so the base class fit()
+            # does not repeat the data setup performed here.
+            self._setting_model_trainer_data = lambda data: None
+            self._BaseSearch__update_model_parameters = lambda: None
+
+        # Populate the parameter grid.
+        self.__populate_params_grid()
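+        # Note: the grid is populated here, after any data mapping above, so
+        # that the per-data_id grouping in __populate_params_grid() sees the
+        # mapped parameters.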
+
+        # Call the _BaseSearch class fit() method.
+        super().fit(data, evaluation_metric, early_stop,
+                    frac, run_parallel, wait, verbose,
+                    stratify_column, sample_id_column,
+                    sample_seed, max_time, **kwargs)
+