teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (263)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +183 -0
  4. teradataml/__init__.py +6 -3
  5. teradataml/_version.py +2 -2
  6. teradataml/analytics/__init__.py +3 -2
  7. teradataml/analytics/analytic_function_executor.py +275 -40
  8. teradataml/analytics/analytic_query_generator.py +92 -0
  9. teradataml/analytics/byom/__init__.py +3 -2
  10. teradataml/analytics/json_parser/metadata.py +1 -0
  11. teradataml/analytics/json_parser/utils.py +17 -21
  12. teradataml/analytics/meta_class.py +40 -1
  13. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  14. teradataml/analytics/sqle/__init__.py +10 -2
  15. teradataml/analytics/table_operator/__init__.py +3 -2
  16. teradataml/analytics/uaf/__init__.py +21 -2
  17. teradataml/analytics/utils.py +62 -1
  18. teradataml/analytics/valib.py +1 -1
  19. teradataml/automl/__init__.py +1553 -319
  20. teradataml/automl/custom_json_utils.py +139 -61
  21. teradataml/automl/data_preparation.py +276 -319
  22. teradataml/automl/data_transformation.py +163 -81
  23. teradataml/automl/feature_engineering.py +402 -239
  24. teradataml/automl/feature_exploration.py +9 -2
  25. teradataml/automl/model_evaluation.py +48 -51
  26. teradataml/automl/model_training.py +291 -189
  27. teradataml/catalog/byom.py +8 -8
  28. teradataml/catalog/model_cataloging_utils.py +1 -1
  29. teradataml/clients/auth_client.py +133 -0
  30. teradataml/clients/pkce_client.py +1 -1
  31. teradataml/common/aed_utils.py +3 -2
  32. teradataml/common/constants.py +48 -6
  33. teradataml/common/deprecations.py +13 -7
  34. teradataml/common/garbagecollector.py +156 -120
  35. teradataml/common/messagecodes.py +6 -1
  36. teradataml/common/messages.py +3 -1
  37. teradataml/common/sqlbundle.py +1 -1
  38. teradataml/common/utils.py +103 -11
  39. teradataml/common/wrapper_utils.py +1 -1
  40. teradataml/context/context.py +121 -31
  41. teradataml/data/advertising.csv +201 -0
  42. teradataml/data/bank_marketing.csv +11163 -0
  43. teradataml/data/bike_sharing.csv +732 -0
  44. teradataml/data/boston2cols.csv +721 -0
  45. teradataml/data/breast_cancer.csv +570 -0
  46. teradataml/data/complaints_test_tokenized.csv +353 -0
  47. teradataml/data/complaints_tokens_model.csv +348 -0
  48. teradataml/data/covid_confirm_sd.csv +83 -0
  49. teradataml/data/customer_segmentation_test.csv +2628 -0
  50. teradataml/data/customer_segmentation_train.csv +8069 -0
  51. teradataml/data/dataframe_example.json +10 -0
  52. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
  53. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
  54. teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
  55. teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
  56. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  57. teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
  58. teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
  59. teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
  60. teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
  61. teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
  62. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  63. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
  64. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
  65. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
  66. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  67. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  68. teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
  69. teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
  70. teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
  71. teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
  72. teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
  73. teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
  74. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  75. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  76. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  77. teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
  78. teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
  79. teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
  80. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  81. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  82. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  83. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  84. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  85. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  86. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  87. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  88. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  89. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  90. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  91. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  92. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  93. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  94. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  95. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  96. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  97. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  98. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  99. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  100. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  101. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  102. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  103. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  104. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  105. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  106. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  107. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  108. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  109. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  110. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  111. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  112. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  113. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  114. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  115. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  116. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  117. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  118. teradataml/data/dwt2d_dataTable.csv +65 -0
  119. teradataml/data/dwt_dataTable.csv +8 -0
  120. teradataml/data/dwt_filterTable.csv +3 -0
  121. teradataml/data/finance_data4.csv +13 -0
  122. teradataml/data/glm_example.json +28 -1
  123. teradataml/data/grocery_transaction.csv +19 -0
  124. teradataml/data/housing_train_segment.csv +201 -0
  125. teradataml/data/idwt2d_dataTable.csv +5 -0
  126. teradataml/data/idwt_dataTable.csv +8 -0
  127. teradataml/data/idwt_filterTable.csv +3 -0
  128. teradataml/data/insect2Cols.csv +61 -0
  129. teradataml/data/interval_data.csv +5 -0
  130. teradataml/data/jsons/paired_functions.json +14 -0
  131. teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
  132. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  133. teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
  134. teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
  135. teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
  136. teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
  137. teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
  138. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  139. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  140. teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
  141. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  142. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  143. teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
  144. teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
  145. teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
  146. teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
  147. teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
  148. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  149. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  150. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  151. teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
  152. teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
  153. teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
  154. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  155. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  156. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  157. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  158. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  159. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  160. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  161. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  162. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  163. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  164. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  165. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  166. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  167. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  168. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  169. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  170. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  171. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  172. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  173. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  174. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  175. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  176. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  177. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  178. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  179. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  180. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  181. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  182. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  183. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  184. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  185. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  186. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  187. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  188. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  189. teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
  190. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  191. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  192. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  193. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  194. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  195. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
  196. teradataml/data/kmeans_example.json +5 -0
  197. teradataml/data/kmeans_table.csv +10 -0
  198. teradataml/data/load_example_data.py +8 -2
  199. teradataml/data/naivebayestextclassifier_example.json +1 -1
  200. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  201. teradataml/data/onehot_encoder_train.csv +4 -0
  202. teradataml/data/openml_example.json +29 -0
  203. teradataml/data/peppers.png +0 -0
  204. teradataml/data/real_values.csv +14 -0
  205. teradataml/data/sax_example.json +8 -0
  206. teradataml/data/scale_attributes.csv +3 -0
  207. teradataml/data/scale_example.json +52 -1
  208. teradataml/data/scale_input_part_sparse.csv +31 -0
  209. teradataml/data/scale_input_partitioned.csv +16 -0
  210. teradataml/data/scale_input_sparse.csv +11 -0
  211. teradataml/data/scale_parameters.csv +3 -0
  212. teradataml/data/scripts/deploy_script.py +21 -2
  213. teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
  214. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
  215. teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
  216. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
  217. teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
  218. teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
  219. teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
  220. teradataml/data/star_pivot.csv +8 -0
  221. teradataml/data/templates/open_source_ml.json +2 -1
  222. teradataml/data/teradataml_example.json +97 -1
  223. teradataml/data/timestamp_data.csv +4 -0
  224. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  225. teradataml/data/uaf_example.json +55 -1
  226. teradataml/data/unpivot_example.json +15 -0
  227. teradataml/data/url_data.csv +9 -0
  228. teradataml/data/windowdfft.csv +16 -0
  229. teradataml/data/ztest_example.json +16 -0
  230. teradataml/dataframe/copy_to.py +9 -4
  231. teradataml/dataframe/data_transfer.py +125 -64
  232. teradataml/dataframe/dataframe.py +575 -57
  233. teradataml/dataframe/dataframe_utils.py +47 -9
  234. teradataml/dataframe/fastload.py +273 -90
  235. teradataml/dataframe/functions.py +339 -0
  236. teradataml/dataframe/row.py +160 -0
  237. teradataml/dataframe/setop.py +2 -2
  238. teradataml/dataframe/sql.py +740 -18
  239. teradataml/dataframe/window.py +1 -1
  240. teradataml/dbutils/dbutils.py +324 -18
  241. teradataml/geospatial/geodataframe.py +1 -1
  242. teradataml/geospatial/geodataframecolumn.py +1 -1
  243. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  244. teradataml/lib/aed_0_1.dll +0 -0
  245. teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
  246. teradataml/options/__init__.py +16 -5
  247. teradataml/options/configure.py +39 -6
  248. teradataml/options/display.py +2 -2
  249. teradataml/plot/axis.py +4 -4
  250. teradataml/scriptmgmt/UserEnv.py +26 -19
  251. teradataml/scriptmgmt/lls_utils.py +120 -16
  252. teradataml/table_operators/Script.py +4 -5
  253. teradataml/table_operators/TableOperator.py +160 -26
  254. teradataml/table_operators/table_operator_util.py +88 -41
  255. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  256. teradataml/telemetry_utils/__init__.py +0 -0
  257. teradataml/telemetry_utils/queryband.py +52 -0
  258. teradataml/utils/validators.py +41 -3
  259. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
  260. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
  261. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
  262. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
  263. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
@@ -19,7 +19,6 @@ import pandas as pd
  import random
  import time
  import warnings
- warnings.filterwarnings("ignore")
 
  # Teradata libraries
  from teradataml.dataframe.dataframe import DataFrame
@@ -27,10 +26,15 @@ from teradataml.dataframe.copy_to import copy_to_sql
  from teradataml import OutlierFilterFit, OutlierFilterTransform
  from teradataml import RoundColumns, TeradataMlException
  from teradataml import ScaleFit, ScaleTransform
- from teradataml import TrainTestSplit, UtilFuncs, TeradataConstants
+ from teradataml import UtilFuncs, TeradataConstants
+ from teradataml.common.garbagecollector import GarbageCollector
  from teradataml.common.messages import Messages, MessageCodes
  from teradataml.utils.validators import _Validators
+ from teradataml import INTEGER
 
+ # Control Randomnes
+ random.seed(42)
+ np.random.seed(42)
 
  class _DataPreparation:
 
@@ -41,7 +45,8 @@ class _DataPreparation:
  excluded_columns=None,
  custom_data=None,
  data_transform_dict=None,
- task_type="Regression"):
+ task_type="Regression",
+ **kwargs):
  """
  DESCRIPTION:
  Function initializes the data, target column and columns datatypes
@@ -54,7 +59,7 @@ class _DataPreparation:
  Types: teradataml Dataframe
 
  target_column:
- Required Arugment.
+ Required Argument.
  Specifies the name of the target column in "data".
  Types: str
 
@@ -69,27 +74,49 @@ class _DataPreparation:
  Types: int
 
  excluded_columns:
- Required Arugment.
+ Required Argument.
  Specifies the columns should be excluded from any processing.
  Types: str or list of strings (str)
 
  custom_data:
- Optional Arugment.
+ Optional Argument.
  Specifies json object containing user customized input.
  Types: json object
 
  data_transform_dict:
- Optional Arugment.
+ Optional Argument.
  Specifies the parameters for data transformation.
  Types: dict
 
  task_type:
- Required Arugment.
+ Required Argument.
  Specifies the task type for AutoML, whether to apply regresion OR classification
  on the provived dataset.
  Default Value: "Regression"
  Permitted Values: "Regression", "Classification"
  Types: str
+
+ **kwargs:
+ Specifies the additional arguments for data preparation. Below
+ are the additional arguments:
+ volatile:
+ Optional Argument.
+ Specifies whether to put the interim results of the
+ functions in a volatile table or not. When set to
+ True, results are stored in a volatile table,
+ otherwise not.
+ Default Value: False
+ Types: bool
+
+ persist:
+ Optional Argument.
+ Specifies whether to persist the interim results of the
+ functions in a table or not. When set to True,
+ results are persisted in a table; otherwise,
+ results are garbage collected at the end of the
+ session.
+ Default Value: False
+ Types: bool
  """
  self.data = data
  self.target_column = target_column
@@ -98,16 +125,15 @@ class _DataPreparation:
  self.data_transform_dict = data_transform_dict
  self.custom_data = custom_data
  self.task_type = task_type
+ self.volatile = kwargs.get("volatile", False)
+ self.persist = kwargs.get("persist", False)
 
  # Setting default value for auto run mode
- self._train_size = 0.80
  self._data_sampling_method = "SMOTE"
  self._scale_method_reg = "STD"
  self._scale_method_cls = "RANGE"
  self.table_name_mapping = {}
 
- random.seed(42)
- np.random.seed(42)
  self.data_types = {key: value for key, value in self.data._column_names_and_types}
 
 
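
The constructor change above routes two new keyword options, volatile and persist, into every later preparation step. A minimal caller-side sketch of how they might be supplied; the call is illustrative only, since _DataPreparation is an internal AutoML class and the data and column names here are assumptions:

    # Hypothetical usage, assuming an active teradataml connection and a DataFrame `tdf`.
    prep = _DataPreparation(data=tdf,
                            target_column="price",
                            excluded_columns=["id"],
                            task_type="Regression",
                            volatile=False,  # interim tables are not created as volatile tables
                            persist=True)    # interim tables are kept instead of garbage collected
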
@@ -116,14 +142,13 @@ class _DataPreparation:
  """
  DESCRIPTION:
  Function to perform following tasks:-
- 1. Splits the given data into training and testing datasets.
- 2. Performs outlier processing on the training dataset and transformation on the testing dataset.
- 3. Performs feature selection using RFE, PCA, and Lasso.
- 4. Performs feature scaling.
+ 1. Performs outlier processing and transformation on dataset.
+ 2. Performs feature selection using RFE, PCA, and Lasso.
+ 3. Performs feature scaling.
 
  PARAMETERS:
  auto:
- Optional Arugment.
+ Optional Argument.
  Specifies whether to run AutoML in custom mode or auto mode.
  When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
  Default Value: True
@@ -138,38 +163,36 @@ class _DataPreparation:
  progress_bar=self.progress_bar)
  # Setting user value in case of custom running mode
  if not auto:
- self._set_custom_train_test_split()
  self._set_custom_scaling_method()
  self._set_custom_sampling()
 
- # Performing train test split
- self._train_test_split()
- self.progress_bar.update()
-
  # Handling ouliers in dataset
  self._handle_outliers(auto)
  self.progress_bar.update()
 
  # Handling float type features before processing with feature selection and scaling
- train = self._handle_generated_features('train')
- test = self._handle_generated_features('test')
+ training_data = self._handle_generated_features()
  self.progress_bar.update()
 
  # Temporary Pulling data for feature selection
  # Will change after sto
 
  # Checking for data imbalance
- if self._check_data_imbalance(train):
- train = self._data_sampling(train)
+ if self._check_data_imbalance(training_data):
+ training_data = self._data_sampling(training_data)
  self.progress_bar.update()
 
+ # Sorting the data based on id to
+ # remove any shuffling done by sampling
+ training_data = training_data.sort_values(by='id')
+
  # Performing feature selection using lasso followed by scaling
- self._feature_selection_Lasso(train, test)
+ self._feature_selection_Lasso(training_data)
  self._scaling_features(feature_selection_mtd="lasso")
  self.progress_bar.update()
 
  # Performing feature selection using rfe followed by scaling
- self._feature_selection_RFE(train, test)
+ self._feature_selection_RFE(training_data)
  self._scaling_features(feature_selection_mtd="rfe")
  self.progress_bar.update()
 
@@ -180,85 +203,8 @@ class _DataPreparation:
 
  return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict
 
- # Splits data into train and test
- def _train_test_split(self):
-
- """
- DESCRIPTION:
- Function splits the data into training and testing datasets.
-
- PARAMETERS:
- train_size:
- Optional Argument.
- Specifies the training size required for splitting dataset.
- By Default, it takes 0.8 as training size.
- Types: float
- """
- self._display_msg(msg="\nSpliting of dataset into training and testing ...",
- progress_bar=self.progress_bar,
- show_data=True)
- self._display_msg(inline_msg="Training size : {}".format(self._train_size),
- progress_bar=self.progress_bar)
- self._display_msg(inline_msg="Testing size : {}".format(round((1-self._train_size),2)),
- progress_bar=self.progress_bar)
- start_time = time.time()
- # Applying TrainTestSplit function on data
- # Regression
- train_test_func_params = {
- "data" : self.data,
- "id_column" : "id",
- "train_size" : self._train_size,
- "seed" : 42
- }
- if self.is_classification_type():
- train_test_func_params["stratify_column"]=self.target_column
- train_test_split_out = TrainTestSplit(**train_test_func_params)
- train_test_split_out = train_test_split_out.result
-
- # Splitting the data into training and testing data
- self.train_df = train_test_split_out[train_test_split_out['TD_IsTrainRow'] == 1].drop('TD_IsTrainRow', axis=1)
- self.test_df = train_test_split_out[train_test_split_out['TD_IsTrainRow'] == 0].drop('TD_IsTrainRow', axis=1)
-
- self._display_msg(msg="Training data sample",
- data=self.train_df,
- progress_bar=self.progress_bar)
-
- self._display_msg(msg="Testing data sample",
- data=self.test_df,
- progress_bar=self.progress_bar)
-
- end_time = time.time()
- self._display_msg(msg="Time taken for spliting of data: {:.2f} sec ".format(end_time - start_time),
- progress_bar=self.progress_bar,
- show_data=True)
-
- def _set_custom_train_test_split(self):
- """
- DESCRIPTION:
- Function to split dataset into training and testing based on user input.
-
- """
- # Fetching user input for train test split
- train_test_split_input = self.custom_data.get("TrainTestSplitIndicator", False)
- if train_test_split_input:
- # Extracting training size
- custom_train_size = self.custom_data.get("TrainingSize", None)
- if custom_train_size is None:
- self._display_msg(inline_msg="No information provided for training size. Proceeding with default option.",
- progress_bar=self.progress_bar)
- else:
- if not isinstance(custom_train_size, float):
- err = Messages.get_message(MessageCodes.INVALID_COLUMN_TYPE,
- 'custom_train', type(custom_train_size).__name__,
- 'float')
- raise TeradataMlException(err, MessageCodes.INVALID_COLUMN_TYPE)
- self._train_size = custom_train_size
- else:
- self._display_msg(inline_msg="No information provided for performing customized train test split. Proceeding with default option.",
- progress_bar=self.progress_bar)
-
  def _handle_outliers(self,
- auto):
+ auto):
  """
  DESCRIPTION:
  Function to handle existing outliers in dataset based on running mode.
@@ -289,6 +235,12 @@ class _DataPreparation:
  DESCRIPTION:
  Function to handle data imbalance in dataset using sampling techniques
  in case of classification.
+
+ PARAMETERS:
+ data:
+ Required Argument.
+ Specifies the input teradataml DataFrame.
+ Types: pandas Dataframe.
  """
  pass
 
@@ -310,7 +262,7 @@ class _DataPreparation:
  outlier_method = "Tukey"
 
  # List of columns for outlier processing.
- outlier_columns = [col for col in self.train_df.columns if col not in self.excluded_columns]
+ outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns]
 
  # Detecting outlier percentage in each columns
  outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
@@ -360,26 +312,45 @@ class _DataPreparation:
  Pandas DataFrame containing, column name with outlier percentage.
 
 
- # Performing fit on train dataset for outlier handling
+
+ # Setting volatile and persist parameters for Outlier handling function
+ volatile, persist = self._set_generic_parameters(func_indicator='OutlierFilterIndicator',
+ param_name='OutlierFilterParam')
+
+ # Performing fit on dataset for outlier handling
  fit_params = {
- "data" : self.train_df,
+ "data" : self.data,
  "target_columns" : target_columns,
  "outlier_method" : outlier_method,
- "replacement_value" : replacement_value
+ "replacement_value" : replacement_value,
+ "volatile" : volatile,
+ "persist" : persist
  }
  outlier_fit_out = OutlierFilterFit(**fit_params)
- # Performing transform on train dataset for outlier handling
+ # Performing transform on dataset for outlier handling
  transform_params = {
- "data" : self.train_df,
+ "data" : self.data,
  "object" : outlier_fit_out.result,
  "persist" : True
  }
- self.train_df = OutlierFilterTransform(**transform_params).result
+
+ # Disabling print if persist is True by default
+ if not volatile and not persist:
+ transform_params["display_table_name"] = False
+
+ if volatile:
+ transform_params["volatile"] = True
+ transform_params["persist"] = False
+ self.data = OutlierFilterTransform(**transform_params).result
+
+ if not volatile and not persist:
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
  def _outlier_processing(self):
  """
  DESCRIPTION:
- Function performs outlier processing on the training dataset. It identifies and handle outliers in the dataset.
+ Function performs outlier processing on dataset. It identifies and handle outliers in the dataset.
 
  """
  self._display_msg(msg="\nOutlier preprocessing ...",
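
The outlier step now forwards volatile/persist into OutlierFilterFit and only registers the transform output with the garbage collector when neither option is set. A rough standalone sketch of that routing, assuming an active create_context() connection, a teradataml DataFrame df, and made-up column names:

    from teradataml import OutlierFilterFit, OutlierFilterTransform

    volatile, persist = True, False          # stands in for _set_generic_parameters()
    fit_out = OutlierFilterFit(data=df,
                               target_columns=["age", "fare"],
                               outlier_method="Tukey",
                               replacement_value="MEDIAN",
                               volatile=volatile,
                               persist=persist)
    transform_params = {"data": df, "object": fit_out.result, "persist": True}
    if volatile:
        # Volatile output wins over persistence, mirroring the branch in the diff.
        transform_params["volatile"] = True
        transform_params["persist"] = False
    cleaned = OutlierFilterTransform(**transform_params).result
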
@@ -400,6 +371,9 @@ class _DataPreparation:
  target_columns=columns_to_drop_rows
  replacement_strategy = "DELETE"
  self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+ self._display_msg(msg="Sample of dataset after removing outlier rows:",
+ data=self.data,
+ progress_bar=self.progress_bar)
 
  # Imputing Median value in place of outliers
  if len(columns_to_impute) != 0:
@@ -409,6 +383,13 @@ class _DataPreparation:
  target_columns=columns_to_impute
  replacement_strategy = "MEDIAN"
  self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+ self._display_msg(msg="Sample of dataset after performing MEDIAN inplace:",
+ data=self.data,
+ progress_bar=self.progress_bar)
+
+ if len(columns_to_drop_rows) == 0 and len(columns_to_impute) == 0:
+ self._display_msg(msg='Analysis indicates not outlier in the dataset. No Action Taken.',
+ progress_bar=self.progress_bar)
 
  end_time = time.time()
  self._display_msg("Time Taken by Outlier processing: {:.2f} sec ".format(end_time - start_time),
@@ -418,7 +399,7 @@ class _DataPreparation:
  def _custom_outlier_processing(self):
  """
  DESCRIPTION:
- Function to perform outlier processing on the training dataset based on user input.
+ Function to perform outlier processing on dataset based on user input.
 
  """
  self._display_msg(msg="\nStarting customized outlier processing ...",
@@ -428,7 +409,7 @@ class _DataPreparation:
  # Checking user input for outlier filtering
  if outlier_filter_input:
  # List of columns for outlier processing.
- target_columns = [col for col in self.train_df.columns if col not in self.excluded_columns]
+ target_columns = [col for col in self.data.columns if col not in self.excluded_columns]
  # Checking user input for outlier detection method
  outlier_method = self.custom_data.get("OutlierDetectionMethod", None)
  if outlier_method == 'PERCENTILE':
@@ -445,11 +426,13 @@ class _DataPreparation:
  # Checking for rows if outlier containing columns exist
  if outlier_df.shape[0]:
  # Checking user input list for outlier handling
- outlier_transform_list = self.custom_data.get("OutlierFilterParam", None)
+ outlier_transform_list = self.custom_data.get("OutlierFilterParam", None).copy()
  if outlier_transform_list:
+ volatile = outlier_transform_list.pop("volatile", False)
+ persist = outlier_transform_list.pop("persist", False)
  # Checking user input for outlier handling
  _Validators._validate_dataframe_has_argument_columns(list(outlier_transform_list.keys()), "OutlierFilterParam",
- self.train_df, "train")
+ self.data, "outlier_data")
 
  for target_col, transform_val in outlier_transform_list.items():
  # Fetching replacement value
@@ -498,14 +481,12 @@ class _DataPreparation:
  from sklearn.decomposition import PCA
 
  start_time = time.time()
- # Training and testing data using pandas dataframe
+
  # Temporary Pulling data for feature selection
- train = DataFrame.from_table(self.table_name_mapping['pca_train']).to_pandas()
- test = DataFrame.from_table(self.table_name_mapping['pca_test']).to_pandas()
+ pca_train = DataFrame.from_table(self.table_name_mapping['pca_train']).to_pandas()
 
  # Drop unnecessary columns and store the result
- train_data = train.drop(columns=['id', self.target_column], axis=1)
- test_data = test.drop(columns=['id', self.target_column], axis=1)
+ train_data = pca_train.drop(columns=['id', self.target_column], axis=1)
 
  # Initialize and fit PCA
  pca = PCA()
@@ -518,16 +499,15 @@ class _DataPreparation:
  # Create a new instance of PCA with the optimal number of components
  pca = PCA(n_components=n, random_state=42)
 
- # Apply PCA on training and testing dataset
+ # Apply PCA on dataset
  X_train_pca = pca.fit_transform(train_data)
- X_test_pca = pca.transform(test_data)
 
  # storing instance of PCA in data transformation dictionary
  self.data_transform_dict["pca_fit_instance"] = pca
+ self.data_transform_dict["pca_fit_columns"] = train_data.columns.tolist()
 
  #converting the numarray into dataframes
  train_df = pd.DataFrame(X_train_pca)
- test_df = pd.DataFrame(X_test_pca)
 
  #creating names for combined columns
  column_name = {col: 'col_'+str(i) for i,col in enumerate(train_df.columns)}
@@ -537,15 +517,12 @@ class _DataPreparation:
 
  #renaming them
  train_df = train_df.rename(columns=column_name)
- test_df = test_df.rename(columns=column_name)
 
  # adding the id column [PCA does not shuffle the dataset]
- train_df = pd.concat([train.reset_index(drop=True)['id'], train_df.reset_index(drop=True)], axis=1)
- test_df = pd.concat([test.reset_index(drop=True)['id'], test_df.reset_index(drop=True)], axis=1)
+ train_df = pd.concat([pca_train.reset_index(drop=True)['id'], train_df.reset_index(drop=True)], axis=1)
 
- # merging target column with new training and testing data
- train_df[self.target_column] = train[self.target_column].reset_index(drop=True)
- test_df[self.target_column] = test[self.target_column].reset_index(drop=True)
+ # merging target column with new data
+ train_df[self.target_column] = pca_train[self.target_column].reset_index(drop=True)
 
  self.pca_feature = train_df.drop(columns=['id',self.target_column],axis=1).columns.tolist()
 
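
The PCA path fits a full decomposition, picks a component count from the explained-variance profile, then refits with that count. A self-contained sketch of the selection pattern on synthetic data; the 0.95 cutoff is an assumption, since the diff does not show the threshold the module uses:

    import numpy as np
    import pandas as pd
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(42)
    train_data = pd.DataFrame(rng.normal(size=(200, 10)),
                              columns=[f"f{i}" for i in range(10)])

    # Full fit to inspect the explained-variance profile.
    pca = PCA()
    pca.fit(train_data)

    # Smallest n whose cumulative explained variance reaches the (assumed) threshold.
    cum_var = np.cumsum(pca.explained_variance_ratio_)
    n = int(np.searchsorted(cum_var, 0.95) + 1)

    # Refit with the chosen count, as the code above does with PCA(n_components=n, random_state=42).
    pca = PCA(n_components=n, random_state=42)
    train_df = pd.DataFrame(pca.fit_transform(train_data),
                            columns=[f"col_{i}" for i in range(n)])
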
@@ -557,31 +534,21 @@ class _DataPreparation:
  progress_bar=self.progress_bar,
  show_data=True)
 
- if self.is_classification_type():
- train_df[self.target_column] = train_df[self.target_column].astype('int')
- test_df[self.target_column] = test_df[self.target_column].astype('int')
-
  # Pushing the data in database
- self.copy_dataframe_to_sql(train_df, test_df, 'pca')
+ self.copy_dataframe_to_sql(train_df, 'pca', self.persist)
 
- def _feature_selection_RFE(self,
- train=None,
- test=None):
+ def _feature_selection_RFE(self,
+ data=None):
  """
  DESCRIPTION:
  Function performs Recursive Feature Elimination (RFE) for feature selection.
  It identifies a subset of the most relevant features in the dataset.
 
  PARAMETERS:
- train:
+ data:
  Required Argument.
  Specifies the input train pandas DataFrame.
- Types: pandas Dataframe
-
- test:
- Required Argument.
- Specifies the input test pandas DataFrame.
- Types: pandas Dataframe
+ Types: pandas Dataframe
  """
  self._display_msg(msg="\nFeature selection using rfe ...",
  progress_bar=self.progress_bar,
@@ -590,57 +557,59 @@ class _DataPreparation:
  # Required imports for RFE
  from sklearn.feature_selection import RFECV
  from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
- from sklearn.model_selection import StratifiedKFold,KFold
+ from sklearn.model_selection import StratifiedKFold
 
  start_time = time.time()
  # Regression
  is_classification = self.is_classification_type()
  # Getting the value of k in k-fold cross-validation
- folds = self._num_of_folds(train.shape[0])
+ folds = self._num_of_folds(data.shape[0])
 
- # Random forest for RFE model
- RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
- rf = RFModel(n_estimators=100, random_state=42)
+ # Suppressing warnings generated by pandas and sklearn
+ with warnings.catch_warnings():
+ warnings.filterwarnings('ignore')
 
- # Determine the scoring metric based on the number of unique classes
- score = 'r2' if not self.is_classification_type() \
- else 'roc_auc' if self.data.drop_duplicate(self.target_column).size == 2 else 'f1_macro'
+ # Random forest for RFE model
+ RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
+ rf = RFModel(n_estimators=100, random_state=42)
 
- # Instantiate StratifiedKFold with shuffling for classification
- cv = folds if not self.is_classification_type() \
- else StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
+ # Determine the scoring metric based on the number of unique classes
+ score = 'r2' if not self.is_classification_type() \
+ else 'roc_auc' if self.data.drop_duplicate(self.target_column).size == 2 else 'f1_macro'
 
- # Define the RFE with cross-validation
- rfecv = RFECV(rf, cv=cv, scoring=score)
+ # # Instantiate StratifiedKFold with shuffling for classification
+ cv = folds if not self.is_classification_type() \
+ else StratifiedKFold(n_splits=folds, shuffle=False)
 
- # Prepare the training data
- train_data = train.drop(columns=['id',self.target_column], axis=1)
- train_target = train[self.target_column]
+ # Define the RFE with cross-validation
+ rfecv = RFECV(rf, cv=cv, scoring=score)
 
- # Fit the RFE using cv
- rfecv.fit(train_data, train_target)
+ # Prepare data
+ train_data = data.drop(columns=['id',self.target_column], axis=1)
+ train_target = data[self.target_column]
 
- # Extract the features
- features = train_data.columns[rfecv.support_].tolist()
+ # Fit the RFE using cv
+ rfecv.fit(train_data, train_target)
 
- self._display_msg(msg="feature selected by RFE:",
- col_lst=features,
- progress_bar=self.progress_bar)
- features.append(self.target_column)
- features.insert(0,'id')
-
- train_df = train[features]
- test_df = test[features]
-
- # storing the rfe selected features in data transformation dictionary
- self.data_transform_dict['rfe_features'] = features
-
- columns_to_rename = [col for col in train_df.columns if col not in ['id', self.target_column]]
- new_column = {col: f'r_{col}' for col in columns_to_rename}
- self.excluded_columns.extend([new_column[key] for key in self.excluded_columns if key in new_column])
-
- train_df.rename(columns=new_column, inplace=True)
- test_df.rename(columns=new_column, inplace=True)
+ # Extract the features
+ features = train_data.columns[rfecv.support_].tolist()
+
+ self._display_msg(msg="feature selected by RFE:",
+ col_lst=features,
+ progress_bar=self.progress_bar)
+ features.append(self.target_column)
+ features.insert(0,'id')
+
+ selected_rfe_df = data[features]
+
+ # storing the rfe selected features in data transformation dictionary
+ self.data_transform_dict['rfe_features'] = features
+
+ columns_to_rename = [col for col in selected_rfe_df.columns if col not in ['id', self.target_column]]
+ new_column = {col: f'r_{col}' for col in columns_to_rename}
+ self.excluded_columns.extend([new_column[key] for key in self.excluded_columns if key in new_column])
+
+ selected_rfe_df.rename(columns=new_column, inplace=True)
 
  # storing the rename column list in data transformation dictionary
  self.data_transform_dict['rfe_rename_column'] = columns_to_rename
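
The rewritten RFE block wraps the work in warnings.catch_warnings() and keeps StratifiedKFold unshuffled. A runnable sketch of the same RFECV pattern on synthetic data (the fold count, column names, and sample sizes are assumptions):

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import RFECV
    from sklearn.model_selection import StratifiedKFold

    # Synthetic stand-in for the pulled training table.
    X, y = make_classification(n_samples=300, n_features=12, n_informative=5, random_state=42)
    train_data = pd.DataFrame(X, columns=[f"f{i}" for i in range(12)])

    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    cv = StratifiedKFold(n_splits=5, shuffle=False)   # shuffle disabled, as in the new code path

    rfecv = RFECV(rf, cv=cv, scoring="roc_auc")       # binary target, hence roc_auc
    rfecv.fit(train_data, y)

    features = train_data.columns[rfecv.support_].tolist()
    print(features)
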
@@ -649,29 +618,24 @@ class _DataPreparation:
  self._display_msg(msg="Total time taken by feature selection: {:.2f} sec ".format( end_time - start_time),
  progress_bar=self.progress_bar,
  show_data=True)
- self.rfe_feature = train_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()
+ self.rfe_feature = selected_rfe_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()
 
  # Pushing data into database
- self.copy_dataframe_to_sql(train_df, test_df, 'rfe')
+ self.copy_dataframe_to_sql(selected_rfe_df, 'rfe', self.persist)
 
  def _feature_selection_Lasso(self,
- train=None,
- test=None):
+ data=None):
  """
  DESCRIPTION:
  Function performs Lasso Regression for feature selection.
  It helps in identifing and retaining the most important features while setting less important ones to zero.
 
  PARAMETERS:
- train:
+ data:
  Required Argument.
  Specifies the input train pandas DataFrame.
  Types: pandas Dataframe
 
- test:
- Required Argument.
- Specifies the input test pandas DataFrame.
- Types: pandas Dataframe
  """
  start_time = time.time()
  self._display_msg(msg="\nFeature selection using lasso ...",
@@ -682,35 +646,46 @@ class _DataPreparation:
  from sklearn.model_selection import GridSearchCV
  from sklearn.linear_model import Lasso
  from sklearn.linear_model import LogisticRegression
-
+ from sklearn.model_selection import StratifiedKFold
+
  # Getting the value k in k-fold cross-validation
- num_folds = self._num_of_folds(train.shape[0])
+ num_folds = self._num_of_folds(data.shape[0])
 
- # Prepare the training data
- train_features = train.drop(columns=['id',self.target_column], axis=1)
- train_target = train[self.target_column]
+ # Prepare data
+ train_features = data.drop(columns=['id',self.target_column], axis=1)
+ train_target = data[self.target_column]
 
- # Determine the estimator and parameters based on the type of problem
- if self.is_classification_type():
- if self.data.drop_duplicate(self.target_column).size == 2:
- scoring_metric = 'roc_auc'
+ # Suppressing warnings generated by pandas and sklearn
+ with warnings.catch_warnings():
+ warnings.filterwarnings('ignore')
+
+ # Determine the estimator and parameters based on the type of problem
+ if self.is_classification_type():
+ if self.data.drop_duplicate(self.target_column).size == 2:
+ scoring_metric = 'roc_auc'
+ else:
+ scoring_metric = 'f1_macro'
+ estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=42)
+ parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
  else:
- scoring_metric = 'f1_macro'
- estimator = LogisticRegression(penalty='l1', solver='liblinear', multi_class='auto')
- parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
- else:
- estimator = Lasso()
- parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
- scoring_metric = "r2"
+ estimator = Lasso(random_state=42)
+ parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
+ scoring_metric = "r2"
+
+ if self.is_classification_type():
+ cv = StratifiedKFold(n_splits=5, shuffle=False)
+ else:
+ cv = num_folds
 
- # Applying hyperparameter tuning and optimizing score
- hyperparameter_search = GridSearchCV(estimator, parameters, cv=num_folds, scoring=scoring_metric, verbose=0)
+ # Applying hyperparameter tuning and optimizing score
+ hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
+ scoring=scoring_metric, verbose=0)
 
- # Fitting the best result from hyperparameter
- hyperparameter_search.fit(train_features, train_target)
+ # Fitting the best result from hyperparameter
+ hyperparameter_search.fit(train_features, train_target)
 
- # Extracting the important estimators
- feature_importance = np.abs(hyperparameter_search.best_estimator_.coef_)
+ # Extracting the important estimators
+ feature_importance = np.abs(hyperparameter_search.best_estimator_.coef_)
 
  # Extracting feature using estimators whose importance > 0
  if self.is_classification_type():
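
For Lasso-based selection, the grid search is now refit explicitly and the regression estimator is seeded. A runnable sketch of the regression branch of that idea on synthetic data (the sample data and the cv value are assumptions):

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.linear_model import Lasso
    from sklearn.model_selection import GridSearchCV

    X, y = make_regression(n_samples=200, n_features=8, noise=0.1, random_state=42)

    estimator = Lasso(random_state=42)
    parameters = {"alpha": [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 10, 100, 1000],
                  "max_iter": [100, 500]}

    search = GridSearchCV(estimator, parameters, cv=5, refit=True, scoring="r2", verbose=0)
    search.fit(X, y)

    # Keep only features whose absolute coefficient survives the L1 penalty.
    feature_importance = np.abs(search.best_estimator_.coef_)
    selected = [i for i, imp in enumerate(feature_importance) if imp > 0]
    print(selected)
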
@@ -725,8 +700,7 @@ class _DataPreparation:
  progress_bar=self.progress_bar)
 
  important_features = ['id'] + important_features + [self.target_column]
- train_df = train[important_features]
- test_df = test[important_features]
+ selected_lasso_df = data[important_features]
 
  # Storing the lasso selected features in data transformation dictionary
  self.data_transform_dict['lasso_features'] = important_features
@@ -736,61 +710,62 @@ class _DataPreparation:
  self._display_msg(msg="Total time taken by feature selection: {:.2f} sec ".format( end_time - start_time),
  progress_bar=self.progress_bar,
  show_data=True)
- self.lasso_feature = train_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()
+ self.lasso_feature = selected_lasso_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()
 
- self.copy_dataframe_to_sql(train_df, test_df, 'lasso')
+ self.copy_dataframe_to_sql(selected_lasso_df, 'lasso', self.persist)
 
  def copy_dataframe_to_sql(self,
- train,
- test,
- prefix):
+ data,
+ prefix,
+ persist):
  """
  DESCRIPTION:
  Function to copy dataframe to SQL with generated table name.
 
  PARAMETERS:
- train:
- Required Argument.
- Specifies the input train pandas DataFrame.
- Types: pandas Dataframe
-
- test:
+ data:
  Required Argument.
- Specifies the input test pandas DataFrame.
+ Specifies the input pandas DataFrame.
  Types: pandas Dataframe
 
  prefix:
  Required Argument.
  Specifies the prefix for the table name.
  Types: str
+
+ persist:
+ Required Argument.
+ Specifies whether to persist the results of the
+ function in a table or not. When set to True,
+ results are persisted in a table; otherwise,
+ results are garbage collected at the end of the
+ session.
+ Types: bool
  """
  # Generating table names
  train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
- table_type = TeradataConstants.TERADATA_TABLE)
- test_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_test'.format(prefix),
- table_type = TeradataConstants.TERADATA_TABLE)
-
+ table_type = TeradataConstants.TERADATA_TABLE,
+ gc_on_quit=not persist)
  # Storing the table names in the table name mapping dictionary
  self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
- self.table_name_mapping['{}_test'.format(prefix)] = test_table_name
 
  # Pushing data into database
- copy_to_sql(df=train, table_name=train_table_name, if_exists="replace")
- copy_to_sql(df=test, table_name=test_table_name, if_exists="replace")
-
-
+ if self.is_classification_type():
+ copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+ else:
+ copy_to_sql(df=data, table_name=train_table_name, if_exists="replace")
 
  def _scaling_features_helper(self,
- train=None,
- feature_selection_mtd=None):
+ data=None,
+ feature_selection_mtd=None):
  """
  DESCRIPTION:
  This function selects the features on which feature scaling should be applied.
 
  PARAMETERS:
- train:
+ data:
  Required Argument.
- Specifies the training data.
+ Specifies the data on which feature scaling will be applied.
  Types: teradataml Dataframe
 
  feature_selection_mtd:
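
copy_dataframe_to_sql now writes one table per feature-selection prefix and, for classification, pins the target column to INTEGER while loading so the label type survives the round trip. A minimal sketch of that call, assuming an active create_context() connection and a pandas DataFrame pdf whose label column is named target:

    from teradataml import copy_to_sql, INTEGER

    # Classification path: force the label column to INTEGER during the load.
    copy_to_sql(df=pdf, table_name="lasso_train_demo", if_exists="replace",
                types={"target": INTEGER})
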
@@ -805,10 +780,10 @@ class _DataPreparation:
  columns_to_scale = []
 
  # Iterating over the columns
- for col in train.columns:
+ for col in data.columns:
  # Selecting columns that will be scaled
  # Exculding target_col and columns with single value
- if col not in ['id', self.target_column] and train.drop_duplicate(col).size > 1:
+ if col not in ['id', self.target_column] and data.drop_duplicate(col).size > 1:
  columns_to_scale.append(col)
 
  if feature_selection_mtd == "lasso":
@@ -822,7 +797,7 @@ class _DataPreparation:
  return columns_to_scale
 
  def _scaling_features(self,
- feature_selection_mtd=None):
+ feature_selection_mtd=None):
  """
  DESCRIPTION:
  Function performs feature scaling on columns present inside the dataset
@@ -832,7 +807,7 @@ class _DataPreparation:
  feature_selection_mtd:
  Required Argument.
  Specifies the feature selection algorithm used.
- Types: str
+ Types: str
  """
 
  self._display_msg(msg="\nscaling Features of {} data ...".format(feature_selection_mtd),
@@ -840,9 +815,8 @@ class _DataPreparation:
  show_data=True)
 
  start_time = time.time()
- train = None
- test = None
+ data_to_scale = None
 
  if self.is_classification_type():
  scale_method = self._scale_method_cls
 
@@ -850,17 +824,18 @@ class _DataPreparation:
 
  # Loading data for feature scaling based of feature selection method
  if feature_selection_mtd == 'rfe':
- train = DataFrame(self.table_name_mapping['rfe_train'])
- test = DataFrame(self.table_name_mapping['rfe_test'])
+ data_to_scale = DataFrame(self.table_name_mapping['rfe_train'])
  elif feature_selection_mtd == 'lasso':
- train = DataFrame(self.table_name_mapping['lasso_train'])
- test = DataFrame(self.table_name_mapping['lasso_test'])
+ data_to_scale = DataFrame(self.table_name_mapping['lasso_train'])
  else:
- train = self.train_df
- test = self.test_df
+ data_to_scale = self.data
+
+ # Setting volatile and persist parameters for ScaleFit and ScaleTransform functions
+ volatile, persist = self._set_generic_parameters(func_indicator='FeatureScalingIndicator',
+ param_name='FeatureScalingParam')
 
  # List of columns that will be scaled
- scale_col= self._scaling_features_helper(train, feature_selection_mtd)
+ scale_col= self._scaling_features_helper(data_to_scale, feature_selection_mtd)
 
  if len(scale_col) != 0:
  self._display_msg(msg="columns that will be scaled: ",
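
ScaleFit now receives the same volatile/persist flags, and a single ScaleTransform call replaces the former train/test pair (see the next hunk). A rough standalone sketch, assuming an active connection, a teradataml DataFrame data_to_scale, and made-up column names:

    from teradataml import ScaleFit, ScaleTransform

    fit_obj = ScaleFit(data=data_to_scale,
                       target_columns=["age", "fare"],
                       scale_method="RANGE",      # the module's classification default
                       volatile=False,
                       persist=False)

    scaled_df = ScaleTransform(data=data_to_scale,
                               object=fit_obj,
                               accumulate=["id", "survived"]).result
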
@@ -868,58 +843,38 @@ class _DataPreparation:
  progress_bar=self.progress_bar)
 
  # Scale Fit
- fit_obj = ScaleFit(data=train,
+ fit_obj = ScaleFit(data=data_to_scale,
  target_columns=scale_col,
- scale_method=scale_method)
+ scale_method=scale_method,
+ volatile=volatile,
+ persist=persist)
 
  # storing the scale fit object and columns in data transformation dictionary
- self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj
+ self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj.output
  self.data_transform_dict['{}_scale_col'.format(feature_selection_mtd)] = scale_col
 
  # List of columns to copy to the output generated by scale transform
- accumulate_cols = list(set(train.columns) - set(scale_col))
-
- # Scaling on training dataset
- tr_obj = ScaleTransform(data=train,
- object=fit_obj,
- accumulate=accumulate_cols)
-
- # Scaling on testing dataset
- ts_obj = ScaleTransform(data=test,
- object=fit_obj,
- accumulate=accumulate_cols)
+ accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
 
- train = tr_obj.result
- test = ts_obj.result
+ # Scaling dataset
+ transform_obj = ScaleTransform(data=data_to_scale,
+ object=fit_obj,
+ accumulate=accumulate_cols)
+ scaled_df = transform_obj.result
 
- self._display_msg(msg="Training dataset sample after scaling:",
- data=train,
- progress_bar=self.progress_bar)
- self._display_msg(msg="Testing dataset sample after scaling:",
- data=test,
+ self._display_msg(msg="Dataset sample after scaling:",
+ data=scaled_df,
  progress_bar=self.progress_bar)
  else:
  self._display_msg(msg="No columns to scale.",
  progress_bar=self.progress_bar)
-
- if self.is_classification_type():
- train, test = self._bigint_to_int(train, test)
 
- self.copy_dataframe_to_sql(train, test, feature_selection_mtd)
+ self.copy_dataframe_to_sql(scaled_df, feature_selection_mtd, persist)
 
  end_time = time.time()
  self._display_msg(msg="Total time taken by feature scaling: {:.2f} sec".format( end_time - start_time),
  progress_bar=self.progress_bar,
  show_data=True)
-
- def _bigint_to_int(self, train, test):
- tr = train.to_pandas()
- tr[self.target_column] = tr[self.target_column].astype('int')
-
- ts = test.to_pandas()
- ts[self.target_column] = ts[self.target_column].astype('int')
-
- return tr, ts
 
  def _set_custom_scaling_method(self):
  """
@@ -932,43 +887,32 @@ class _DataPreparation:
  # Checking user input for feature scaling
  if feature_scaling_input:
  # Extracting scaling method
- custom_scaling_method = self.custom_data.get("FeatureScalingMethod", None)
- if custom_scaling_method is None:
- self._display_msg(inline_msg="No information provided for customized scaling method. AutoML will continue with default option.",
- progress_bar=self.progress_bar)
- else:
- if self.is_classification_type():
- self._scale_method_cls = custom_scaling_method
+ custom_scaling_params = self.custom_data.get("FeatureScalingParam", None)
+ if custom_scaling_params:
+ custom_scaling_method = custom_scaling_params.get("FeatureScalingMethod", None)
+ if custom_scaling_method is None:
+ self._display_msg(inline_msg="No information provided for customized scaling method. AutoML will continue with default option.",
+ progress_bar=self.progress_bar)
  else:
- self._scale_method_reg = custom_scaling_method
+ if self.is_classification_type():
+ self._scale_method_cls = custom_scaling_method
+ else:
+ self._scale_method_reg = custom_scaling_method
  else:
  self._display_msg(inline_msg="No information provided for performing customized feature scaling. Proceeding with default option.",
  progress_bar=self.progress_bar)
 
 
- def _handle_generated_features(self,
- label = None):
+ def _handle_generated_features(self):
  """
  DESCRIPTION:
  Function to handle newly generated float features. It will round them upto 4 digit after decimal point.
-
- PARAMETERS:
- label:
- Optional Argument.
- Specifies label for dataset on which rounding up is getting done i.e., 'train' for training
- and 'test' for testing dataset.
- By Default, it takes None and transformation is getting applied to whole dataset.
- Types: str
-
+
+ RETURNS:
+ Pandas DataFrame containing, rounded up float columns.
  """
- # Checking for label and accordingly deciding target dataset.
- if label == 'train':
- target_df = self.train_df
- elif label == 'test':
- target_df = self.test_df
- else:
- target_df=self.data
-
+ # Assigning data to target dataframe
+ target_df = self.data
  # Detecting list of float columns on target dataset
  float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]
 
@@ -988,6 +932,19 @@ class _DataPreparation:
  "accumulate" : accumulate_columns,
  "persist" : True}
 
- obj = RoundColumns(**fit_params).result
- df = obj.to_pandas()
- return df.reset_index()
+ # Disabling print if persist is True by default
+ if not self.volatile and not self.persist:
+ fit_params["display_table_name"] = False
+
+ if self.volatile:
+ fit_params["volatile"] = True
+ fit_params["persist"] = False
+
+ transform_output = RoundColumns(**fit_params).result
+ if not self.volatile and not self.persist:
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+ cols = transform_output.columns
+ df = transform_output.to_pandas().reset_index()
+ df = df[cols]
+ return df