teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (240)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +306 -0
  4. teradataml/__init__.py +10 -3
  5. teradataml/_version.py +1 -1
  6. teradataml/analytics/__init__.py +3 -2
  7. teradataml/analytics/analytic_function_executor.py +299 -16
  8. teradataml/analytics/analytic_query_generator.py +92 -0
  9. teradataml/analytics/byom/__init__.py +3 -2
  10. teradataml/analytics/json_parser/metadata.py +13 -3
  11. teradataml/analytics/json_parser/utils.py +13 -6
  12. teradataml/analytics/meta_class.py +40 -1
  13. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  14. teradataml/analytics/sqle/__init__.py +11 -2
  15. teradataml/analytics/table_operator/__init__.py +4 -3
  16. teradataml/analytics/uaf/__init__.py +21 -2
  17. teradataml/analytics/utils.py +66 -1
  18. teradataml/analytics/valib.py +1 -1
  19. teradataml/automl/__init__.py +1502 -323
  20. teradataml/automl/custom_json_utils.py +139 -61
  21. teradataml/automl/data_preparation.py +247 -307
  22. teradataml/automl/data_transformation.py +32 -12
  23. teradataml/automl/feature_engineering.py +325 -86
  24. teradataml/automl/model_evaluation.py +44 -35
  25. teradataml/automl/model_training.py +122 -153
  26. teradataml/catalog/byom.py +8 -8
  27. teradataml/clients/pkce_client.py +1 -1
  28. teradataml/common/__init__.py +2 -1
  29. teradataml/common/constants.py +72 -0
  30. teradataml/common/deprecations.py +13 -7
  31. teradataml/common/garbagecollector.py +152 -120
  32. teradataml/common/messagecodes.py +11 -2
  33. teradataml/common/messages.py +4 -1
  34. teradataml/common/sqlbundle.py +26 -4
  35. teradataml/common/utils.py +225 -14
  36. teradataml/common/wrapper_utils.py +1 -1
  37. teradataml/context/context.py +82 -2
  38. teradataml/data/SQL_Fundamentals.pdf +0 -0
  39. teradataml/data/complaints_test_tokenized.csv +353 -0
  40. teradataml/data/complaints_tokens_model.csv +348 -0
  41. teradataml/data/covid_confirm_sd.csv +83 -0
  42. teradataml/data/dataframe_example.json +27 -1
  43. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  44. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  45. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  46. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  47. teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
  48. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  49. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  50. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  51. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  52. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  53. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  54. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  55. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  56. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  57. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  58. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  59. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  60. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  61. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  62. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  63. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  64. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  65. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  66. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  67. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  68. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  69. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  70. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  71. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  72. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  73. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  74. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  75. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  76. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  77. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  78. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  79. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  80. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  81. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  82. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  83. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  84. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  85. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  86. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  87. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  88. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  89. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  90. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  91. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  92. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  93. teradataml/data/dwt2d_dataTable.csv +65 -0
  94. teradataml/data/dwt_dataTable.csv +8 -0
  95. teradataml/data/dwt_filterTable.csv +3 -0
  96. teradataml/data/finance_data4.csv +13 -0
  97. teradataml/data/grocery_transaction.csv +19 -0
  98. teradataml/data/idwt2d_dataTable.csv +5 -0
  99. teradataml/data/idwt_dataTable.csv +8 -0
  100. teradataml/data/idwt_filterTable.csv +3 -0
  101. teradataml/data/interval_data.csv +5 -0
  102. teradataml/data/jsons/paired_functions.json +14 -0
  103. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  104. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  105. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  106. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  107. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  108. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  109. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  110. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  111. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  112. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  113. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  114. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  115. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  116. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  117. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  118. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  119. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  120. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  121. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  122. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  123. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  124. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  125. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  126. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  127. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  128. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  129. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  130. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  131. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  132. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  133. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  134. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  135. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  136. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  137. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  138. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  139. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  140. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  141. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  142. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  143. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  144. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  145. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  146. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  147. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  148. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  149. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  150. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  151. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  152. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  153. teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
  154. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  155. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  156. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  157. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  158. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  159. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
  160. teradataml/data/load_example_data.py +8 -2
  161. teradataml/data/medical_readings.csv +101 -0
  162. teradataml/data/naivebayestextclassifier_example.json +1 -1
  163. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  164. teradataml/data/patient_profile.csv +101 -0
  165. teradataml/data/peppers.png +0 -0
  166. teradataml/data/real_values.csv +14 -0
  167. teradataml/data/sax_example.json +8 -0
  168. teradataml/data/scripts/deploy_script.py +1 -1
  169. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  170. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  171. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  172. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  173. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
  174. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  175. teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
  176. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  177. teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
  178. teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
  179. teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
  180. teradataml/data/star_pivot.csv +8 -0
  181. teradataml/data/target_udt_data.csv +8 -0
  182. teradataml/data/templates/open_source_ml.json +3 -1
  183. teradataml/data/teradataml_example.json +20 -1
  184. teradataml/data/timestamp_data.csv +4 -0
  185. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  186. teradataml/data/uaf_example.json +55 -1
  187. teradataml/data/unpivot_example.json +15 -0
  188. teradataml/data/url_data.csv +9 -0
  189. teradataml/data/vectordistance_example.json +4 -0
  190. teradataml/data/windowdfft.csv +16 -0
  191. teradataml/dataframe/copy_to.py +1 -1
  192. teradataml/dataframe/data_transfer.py +5 -3
  193. teradataml/dataframe/dataframe.py +1002 -201
  194. teradataml/dataframe/fastload.py +3 -3
  195. teradataml/dataframe/functions.py +867 -0
  196. teradataml/dataframe/row.py +160 -0
  197. teradataml/dataframe/setop.py +2 -2
  198. teradataml/dataframe/sql.py +840 -33
  199. teradataml/dataframe/window.py +1 -1
  200. teradataml/dbutils/dbutils.py +878 -34
  201. teradataml/dbutils/filemgr.py +48 -1
  202. teradataml/geospatial/geodataframe.py +1 -1
  203. teradataml/geospatial/geodataframecolumn.py +1 -1
  204. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  205. teradataml/lib/aed_0_1.dll +0 -0
  206. teradataml/opensource/__init__.py +1 -1
  207. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  208. teradataml/opensource/_lightgbm.py +950 -0
  209. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  210. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  211. teradataml/opensource/sklearn/__init__.py +0 -1
  212. teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
  213. teradataml/options/__init__.py +9 -23
  214. teradataml/options/configure.py +42 -4
  215. teradataml/options/display.py +2 -2
  216. teradataml/plot/axis.py +4 -4
  217. teradataml/scriptmgmt/UserEnv.py +13 -9
  218. teradataml/scriptmgmt/lls_utils.py +77 -23
  219. teradataml/store/__init__.py +13 -0
  220. teradataml/store/feature_store/__init__.py +0 -0
  221. teradataml/store/feature_store/constants.py +291 -0
  222. teradataml/store/feature_store/feature_store.py +2223 -0
  223. teradataml/store/feature_store/models.py +1505 -0
  224. teradataml/store/vector_store/__init__.py +1586 -0
  225. teradataml/table_operators/Script.py +2 -2
  226. teradataml/table_operators/TableOperator.py +106 -20
  227. teradataml/table_operators/query_generator.py +3 -0
  228. teradataml/table_operators/table_operator_query_generator.py +3 -1
  229. teradataml/table_operators/table_operator_util.py +102 -56
  230. teradataml/table_operators/templates/dataframe_register.template +69 -0
  231. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  232. teradataml/telemetry_utils/__init__.py +0 -0
  233. teradataml/telemetry_utils/queryband.py +52 -0
  234. teradataml/utils/dtypes.py +4 -2
  235. teradataml/utils/validators.py +34 -2
  236. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
  237. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
  238. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  239. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  240. {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
@@ -19,7 +19,6 @@ import pandas as pd
  import random
  import time
  import warnings
- warnings.filterwarnings("ignore")
 
  # Teradata libraries
  from teradataml.dataframe.dataframe import DataFrame
@@ -27,7 +26,7 @@ from teradataml.dataframe.copy_to import copy_to_sql
  from teradataml import OutlierFilterFit, OutlierFilterTransform
  from teradataml import RoundColumns, TeradataMlException
  from teradataml import ScaleFit, ScaleTransform
- from teradataml import TrainTestSplit, UtilFuncs, TeradataConstants
+ from teradataml import UtilFuncs, TeradataConstants
  from teradataml.common.garbagecollector import GarbageCollector
  from teradataml.common.messages import Messages, MessageCodes
  from teradataml.utils.validators import _Validators
@@ -46,7 +45,8 @@ class _DataPreparation:
  excluded_columns=None,
  custom_data=None,
  data_transform_dict=None,
- task_type="Regression"):
+ task_type="Regression",
+ **kwargs):
  """
  DESCRIPTION:
  Function initializes the data, target column and columns datatypes
@@ -95,6 +95,28 @@ class _DataPreparation:
  Default Value: "Regression"
  Permitted Values: "Regression", "Classification"
  Types: str
+
+ **kwargs:
+ Specifies the additional arguments for data preparation. Below
+ are the additional arguments:
+ volatile:
+ Optional Argument.
+ Specifies whether to put the interim results of the
+ functions in a volatile table or not. When set to
+ True, results are stored in a volatile table,
+ otherwise not.
+ Default Value: False
+ Types: bool
+
+ persist:
+ Optional Argument.
+ Specifies whether to persist the interim results of the
+ functions in a table or not. When set to True,
+ results are persisted in a table; otherwise,
+ results are garbage collected at the end of the
+ session.
+ Default Value: False
+ Types: bool
  """
  self.data = data
  self.target_column = target_column
@@ -103,9 +125,10 @@ class _DataPreparation:
  self.data_transform_dict = data_transform_dict
  self.custom_data = custom_data
  self.task_type = task_type
+ self.volatile = kwargs.get("volatile", False)
+ self.persist = kwargs.get("persist", False)
 
  # Setting default value for auto run mode
- self._train_size = 0.80
  self._data_sampling_method = "SMOTE"
  self._scale_method_reg = "STD"
  self._scale_method_cls = "RANGE"
@@ -119,10 +142,9 @@ class _DataPreparation:
  """
  DESCRIPTION:
  Function to perform following tasks:-
- 1. Splits the given data into training and testing datasets.
- 2. Performs outlier processing on the training dataset and transformation on the testing dataset.
- 3. Performs feature selection using RFE, PCA, and Lasso.
- 4. Performs feature scaling.
+ 1. Performs outlier processing and transformation on dataset.
+ 2. Performs feature selection using RFE, PCA, and Lasso.
+ 3. Performs feature scaling.
 
  PARAMETERS:
  auto:
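With TrainTestSplit removed, the driver below runs on a single dataset: outlier handling, float rounding, an imbalance check with optional sampling, then Lasso and RFE selection each followed by scaling. A hedged outline of that order (a sketch only, with a stand-in `steps` object rather than the real `_DataPreparation` methods):

```python
# Sketch of the reworked data-preparation order; names on `steps` are placeholders.
def prepare(training_data, steps):
    steps.handle_outliers(training_data)              # _handle_outliers(auto)
    training_data = steps.round_float_columns()       # _handle_generated_features()
    if steps.is_imbalanced(training_data):            # _check_data_imbalance(...)
        training_data = steps.resample(training_data) # _data_sampling(...), e.g. SMOTE
    training_data = training_data.sort_values(by="id")
    for selector in ("lasso", "rfe"):
        steps.select_features(selector, training_data)  # _feature_selection_Lasso / _RFE
        steps.scale(selector)                           # _scaling_features(...)
    return training_data
```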
@@ -141,42 +163,36 @@ class _DataPreparation:
  progress_bar=self.progress_bar)
  # Setting user value in case of custom running mode
  if not auto:
- self._set_custom_train_test_split()
  self._set_custom_scaling_method()
  self._set_custom_sampling()
 
- # Performing train test split
- self._train_test_split()
- self.progress_bar.update()
-
  # Handling ouliers in dataset
  self._handle_outliers(auto)
  self.progress_bar.update()
 
  # Handling float type features before processing with feature selection and scaling
- train = self._handle_generated_features('train')
- test = self._handle_generated_features('test')
+ training_data = self._handle_generated_features()
  self.progress_bar.update()
 
  # Temporary Pulling data for feature selection
  # Will change after sto
 
  # Checking for data imbalance
- if self._check_data_imbalance(train):
- train = self._data_sampling(train)
+ if self._check_data_imbalance(training_data):
+ training_data = self._data_sampling(training_data)
  self.progress_bar.update()
 
  # Sorting the data based on id to
  # remove any shuffling done by sampling
- train = train.sort_values(by='id')
+ training_data = training_data.sort_values(by='id')
 
  # Performing feature selection using lasso followed by scaling
- self._feature_selection_Lasso(train, test)
+ self._feature_selection_Lasso(training_data)
  self._scaling_features(feature_selection_mtd="lasso")
  self.progress_bar.update()
 
  # Performing feature selection using rfe followed by scaling
- self._feature_selection_RFE(train, test)
+ self._feature_selection_RFE(training_data)
  self._scaling_features(feature_selection_mtd="rfe")
  self.progress_bar.update()
 
@@ -187,85 +203,8 @@ class _DataPreparation:
 
  return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict
 
- # Splits data into train and test
- def _train_test_split(self):
-
- """
- DESCRIPTION:
- Function splits the data into training and testing datasets.
-
- PARAMETERS:
- train_size:
- Optional Argument.
- Specifies the training size required for splitting dataset.
- By Default, it takes 0.8 as training size.
- Types: float
- """
- self._display_msg(msg="\nSpliting of dataset into training and testing ...",
- progress_bar=self.progress_bar,
- show_data=True)
- self._display_msg(inline_msg="Training size : {}".format(self._train_size),
- progress_bar=self.progress_bar)
- self._display_msg(inline_msg="Testing size : {}".format(round((1-self._train_size),2)),
- progress_bar=self.progress_bar)
- start_time = time.time()
- # Applying TrainTestSplit function on data
- # Regression
- train_test_func_params = {
- "data" : self.data,
- "id_column" : "id",
- "train_size" : self._train_size,
- "seed" : 42
- }
- if self.is_classification_type():
- train_test_func_params["stratify_column"]=self.target_column
- train_test_split_out = TrainTestSplit(**train_test_func_params)
- train_test_split_out = train_test_split_out.result
-
- # Splitting the data into training and testing data
- self.train_df = train_test_split_out[train_test_split_out['TD_IsTrainRow'] == 1].drop('TD_IsTrainRow', axis=1)
- self.test_df = train_test_split_out[train_test_split_out['TD_IsTrainRow'] == 0].drop('TD_IsTrainRow', axis=1)
-
- self._display_msg(msg="Training data sample",
- data=self.train_df,
- progress_bar=self.progress_bar)
-
- self._display_msg(msg="Testing data sample",
- data=self.test_df,
- progress_bar=self.progress_bar)
-
- end_time = time.time()
- self._display_msg(msg="Time taken for spliting of data: {:.2f} sec ".format(end_time - start_time),
- progress_bar=self.progress_bar,
- show_data=True)
-
- def _set_custom_train_test_split(self):
- """
- DESCRIPTION:
- Function to split dataset into training and testing based on user input.
-
- """
- # Fetching user input for train test split
- train_test_split_input = self.custom_data.get("TrainTestSplitIndicator", False)
- if train_test_split_input:
- # Extracting training size
- custom_train_size = self.custom_data.get("TrainingSize", None)
- if custom_train_size is None:
- self._display_msg(inline_msg="No information provided for training size. Proceeding with default option.",
- progress_bar=self.progress_bar)
- else:
- if not isinstance(custom_train_size, float):
- err = Messages.get_message(MessageCodes.INVALID_COLUMN_TYPE,
- 'custom_train', type(custom_train_size).__name__,
- 'float')
- raise TeradataMlException(err, MessageCodes.INVALID_COLUMN_TYPE)
- self._train_size = custom_train_size
- else:
- self._display_msg(inline_msg="No information provided for performing customized train test split. Proceeding with default option.",
- progress_bar=self.progress_bar)
-
  def _handle_outliers(self,
- auto):
+ auto):
  """
  DESCRIPTION:
  Function to handle existing outliers in dataset based on running mode.
@@ -296,6 +235,12 @@ class _DataPreparation:
  DESCRIPTION:
  Function to handle data imbalance in dataset using sampling techniques
  in case of classification.
+
+ PARAMETERS:
+ data:
+ Required Argument.
+ Specifies the input teradataml DataFrame.
+ Types: pandas Dataframe.
  """
  pass
 
@@ -317,7 +262,7 @@ class _DataPreparation:
  outlier_method = "Tukey"
 
  # List of columns for outlier processing.
- outlier_columns = [col for col in self.train_df.columns if col not in self.excluded_columns]
+ outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns]
 
  # Detecting outlier percentage in each columns
  outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
@@ -367,28 +312,45 @@ class _DataPreparation:
  Pandas DataFrame containing, column name with outlier percentage.
 
 
- # Performing fit on train dataset for outlier handling
+
+ # Setting volatile and persist parameters for Outlier handling function
+ volatile, persist = self._set_generic_parameters(func_indicator='OutlierFilterIndicator',
+ param_name='OutlierFilterParam')
+
+ # Performing fit on dataset for outlier handling
  fit_params = {
- "data" : self.train_df,
+ "data" : self.data,
  "target_columns" : target_columns,
  "outlier_method" : outlier_method,
- "replacement_value" : replacement_value
+ "replacement_value" : replacement_value,
+ "volatile" : volatile,
+ "persist" : persist
  }
  outlier_fit_out = OutlierFilterFit(**fit_params)
- # Performing transform on train dataset for outlier handling
+ # Performing transform on dataset for outlier handling
  transform_params = {
- "data" : self.train_df,
+ "data" : self.data,
  "object" : outlier_fit_out.result,
  "persist" : True
  }
- self.train_df = OutlierFilterTransform(**transform_params).result
- # Adding transformed data containing table to garbage collector
- GarbageCollector._add_to_garbagecollector(self.train_df._table_name)
+
+ # Disabling print if persist is True by default
+ if not volatile and not persist:
+ transform_params["display_table_name"] = False
+
+ if volatile:
+ transform_params["volatile"] = True
+ transform_params["persist"] = False
+ self.data = OutlierFilterTransform(**transform_params).result
+
+ if not volatile and not persist:
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
  def _outlier_processing(self):
  """
  DESCRIPTION:
- Function performs outlier processing on the training dataset. It identifies and handle outliers in the dataset.
+ Function performs outlier processing on dataset. It identifies and handle outliers in the dataset.
 
  """
  self._display_msg(msg="\nOutlier preprocessing ...",
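Note on the hunk above: the reworked `_outlier_handling` routes interim results by the new volatile/persist flags, and only registers the transformed table with the GarbageCollector when neither flag is set. A hedged sketch of that routing pattern, with a stand-in `run_transform` callable rather than the real `OutlierFilterTransform` call:

```python
# Illustration only: how the volatile/persist switches shape the transform call above.
def route_interim_result(params, volatile, persist, run_transform, gc_register):
    if not volatile and not persist:
        # Default case: suppress the table-name printout and let GC drop the table later.
        params["display_table_name"] = False
    if volatile:
        # Volatile takes precedence: store in a volatile table, do not persist.
        params["volatile"] = True
        params["persist"] = False
    result = run_transform(**params)
    if not volatile and not persist:
        gc_register(result)  # stands in for GarbageCollector._add_to_garbagecollector(...)
    return result
```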
@@ -409,8 +371,8 @@ class _DataPreparation:
  target_columns=columns_to_drop_rows
  replacement_strategy = "DELETE"
  self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
- self._display_msg(msg="Sample of training dataset after removing outlier rows:",
- data=self.train_df,
+ self._display_msg(msg="Sample of dataset after removing outlier rows:",
+ data=self.data,
  progress_bar=self.progress_bar)
 
  # Imputing Median value in place of outliers
@@ -421,8 +383,8 @@ class _DataPreparation:
  target_columns=columns_to_impute
  replacement_strategy = "MEDIAN"
  self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
- self._display_msg(msg="Sample of training dataset after performing MEDIAN inplace:",
- data=self.train_df,
+ self._display_msg(msg="Sample of dataset after performing MEDIAN inplace:",
+ data=self.data,
  progress_bar=self.progress_bar)
 
  if len(columns_to_drop_rows) == 0 and len(columns_to_impute) == 0:
@@ -437,7 +399,7 @@ class _DataPreparation:
  def _custom_outlier_processing(self):
  """
  DESCRIPTION:
- Function to perform outlier processing on the training dataset based on user input.
+ Function to perform outlier processing on dataset based on user input.
 
  """
  self._display_msg(msg="\nStarting customized outlier processing ...",
@@ -447,7 +409,7 @@ class _DataPreparation:
  # Checking user input for outlier filtering
  if outlier_filter_input:
  # List of columns for outlier processing.
- target_columns = [col for col in self.train_df.columns if col not in self.excluded_columns]
+ target_columns = [col for col in self.data.columns if col not in self.excluded_columns]
  # Checking user input for outlier detection method
  outlier_method = self.custom_data.get("OutlierDetectionMethod", None)
  if outlier_method == 'PERCENTILE':
@@ -464,11 +426,13 @@ class _DataPreparation:
  # Checking for rows if outlier containing columns exist
  if outlier_df.shape[0]:
  # Checking user input list for outlier handling
- outlier_transform_list = self.custom_data.get("OutlierFilterParam", None)
+ outlier_transform_list = self.custom_data.get("OutlierFilterParam", None).copy()
  if outlier_transform_list:
+ volatile = outlier_transform_list.pop("volatile", False)
+ persist = outlier_transform_list.pop("persist", False)
  # Checking user input for outlier handling
  _Validators._validate_dataframe_has_argument_columns(list(outlier_transform_list.keys()), "OutlierFilterParam",
- self.train_df, "train")
+ self.data, "outlier_data")
 
  for target_col, transform_val in outlier_transform_list.items():
  # Fetching replacement value
@@ -501,7 +465,7 @@ class _DataPreparation:
  RETURNS:
  int, number of folds to be used for cross-validation.
  """
- num_of_folds = lambda rows: 1 if rows > 20000 else (3 if 1000 < rows <= 20000 else 10)
+ num_of_folds = lambda rows: 2 if rows > 20000 else (4 if 1000 < rows <= 20000 else 10)
  return num_of_folds(rows)
 
  def _feature_selection_PCA(self):
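The fold count in the hunk above now scales down for large tables: 2 folds above 20,000 rows, 4 folds between 1,001 and 20,000 rows, and 10 otherwise. The same rule written out as a plain function, for readability only:

```python
def num_of_folds(rows: int) -> int:
    """Cross-validation folds used by the RFE/Lasso selection, per the updated lambda."""
    if rows > 20000:
        return 2
    if 1000 < rows <= 20000:
        return 4
    return 10

assert num_of_folds(50000) == 2
assert num_of_folds(5000) == 4
assert num_of_folds(500) == 10
```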
@@ -517,14 +481,12 @@ class _DataPreparation:
  from sklearn.decomposition import PCA
 
  start_time = time.time()
- # Training and testing data using pandas dataframe
+
  # Temporary Pulling data for feature selection
- train = DataFrame.from_table(self.table_name_mapping['pca_train']).to_pandas()
- test = DataFrame.from_table(self.table_name_mapping['pca_test']).to_pandas()
+ pca_train = DataFrame.from_table(self.table_name_mapping['pca_train']).to_pandas()
 
  # Drop unnecessary columns and store the result
- train_data = train.drop(columns=['id', self.target_column], axis=1)
- test_data = test.drop(columns=['id', self.target_column], axis=1)
+ train_data = pca_train.drop(columns=['id', self.target_column], axis=1)
 
  # Initialize and fit PCA
  pca = PCA()
@@ -537,16 +499,15 @@ class _DataPreparation:
  # Create a new instance of PCA with the optimal number of components
  pca = PCA(n_components=n, random_state=42)
 
- # Apply PCA on training and testing dataset
+ # Apply PCA on dataset
  X_train_pca = pca.fit_transform(train_data)
- X_test_pca = pca.transform(test_data)
 
  # storing instance of PCA in data transformation dictionary
  self.data_transform_dict["pca_fit_instance"] = pca
+ self.data_transform_dict["pca_fit_columns"] = train_data.columns.tolist()
 
  #converting the numarray into dataframes
  train_df = pd.DataFrame(X_train_pca)
- test_df = pd.DataFrame(X_test_pca)
 
  #creating names for combined columns
  column_name = {col: 'col_'+str(i) for i,col in enumerate(train_df.columns)}
@@ -556,15 +517,12 @@ class _DataPreparation:
 
  #renaming them
  train_df = train_df.rename(columns=column_name)
- test_df = test_df.rename(columns=column_name)
 
  # adding the id column [PCA does not shuffle the dataset]
- train_df = pd.concat([train.reset_index(drop=True)['id'], train_df.reset_index(drop=True)], axis=1)
- test_df = pd.concat([test.reset_index(drop=True)['id'], test_df.reset_index(drop=True)], axis=1)
+ train_df = pd.concat([pca_train.reset_index(drop=True)['id'], train_df.reset_index(drop=True)], axis=1)
 
- # merging target column with new training and testing data
- train_df[self.target_column] = train[self.target_column].reset_index(drop=True)
- test_df[self.target_column] = test[self.target_column].reset_index(drop=True)
+ # merging target column with new data
+ train_df[self.target_column] = pca_train[self.target_column].reset_index(drop=True)
 
  self.pca_feature = train_df.drop(columns=['id',self.target_column],axis=1).columns.tolist()
 
@@ -577,26 +535,20 @@ class _DataPreparation:
  show_data=True)
 
  # Pushing the data in database
- self.copy_dataframe_to_sql(train_df, test_df, 'pca')
+ self.copy_dataframe_to_sql(train_df, 'pca', self.persist)
 
- def _feature_selection_RFE(self,
- train=None,
- test=None):
+ def _feature_selection_RFE(self,
+ data=None):
  """
  DESCRIPTION:
  Function performs Recursive Feature Elimination (RFE) for feature selection.
  It identifies a subset of the most relevant features in the dataset.
 
  PARAMETERS:
- train:
+ data:
  Required Argument.
  Specifies the input train pandas DataFrame.
- Types: pandas Dataframe
-
- test:
- Required Argument.
- Specifies the input test pandas DataFrame.
- Types: pandas Dataframe
+ Types: pandas Dataframe
  """
  self._display_msg(msg="\nFeature selection using rfe ...",
  progress_bar=self.progress_bar,
@@ -611,51 +563,53 @@ class _DataPreparation:
  # Regression
  is_classification = self.is_classification_type()
  # Getting the value of k in k-fold cross-validation
- folds = self._num_of_folds(train.shape[0])
+ folds = self._num_of_folds(data.shape[0])
 
- # Random forest for RFE model
- RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
- rf = RFModel(n_estimators=100, random_state=42)
+ # Suppressing warnings generated by pandas and sklearn
+ with warnings.catch_warnings():
+ warnings.filterwarnings('ignore')
 
- # Determine the scoring metric based on the number of unique classes
- score = 'r2' if not self.is_classification_type() \
- else 'roc_auc' if self.data.drop_duplicate(self.target_column).size == 2 else 'f1_macro'
+ # Random forest for RFE model
+ RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
+ rf = RFModel(n_estimators=100, random_state=42)
 
- # # Instantiate StratifiedKFold with shuffling for classification
- cv = folds if not self.is_classification_type() \
- else StratifiedKFold(n_splits=folds, shuffle=False)
+ # Determine the scoring metric based on the number of unique classes
+ score = 'r2' if not self.is_classification_type() \
+ else 'roc_auc' if self.data.drop_duplicate(self.target_column).size == 2 else 'f1_macro'
 
- # Define the RFE with cross-validation
- rfecv = RFECV(rf, cv=cv, scoring=score)
+ # # Instantiate StratifiedKFold with shuffling for classification
+ cv = folds if not self.is_classification_type() \
+ else StratifiedKFold(n_splits=folds, shuffle=False)
 
- # Prepare the training data
- train_data = train.drop(columns=['id',self.target_column], axis=1)
- train_target = train[self.target_column]
+ # Define the RFE with cross-validation
+ rfecv = RFECV(rf, cv=cv, scoring=score)
 
- # Fit the RFE using cv
- rfecv.fit(train_data, train_target)
+ # Prepare data
+ train_data = data.drop(columns=['id',self.target_column], axis=1)
+ train_target = data[self.target_column]
 
- # Extract the features
- features = train_data.columns[rfecv.support_].tolist()
+ # Fit the RFE using cv
+ rfecv.fit(train_data, train_target)
 
- self._display_msg(msg="feature selected by RFE:",
- col_lst=features,
- progress_bar=self.progress_bar)
- features.append(self.target_column)
- features.insert(0,'id')
-
- train_df = train[features]
- test_df = test[features]
-
- # storing the rfe selected features in data transformation dictionary
- self.data_transform_dict['rfe_features'] = features
-
- columns_to_rename = [col for col in train_df.columns if col not in ['id', self.target_column]]
- new_column = {col: f'r_{col}' for col in columns_to_rename}
- self.excluded_columns.extend([new_column[key] for key in self.excluded_columns if key in new_column])
-
- train_df.rename(columns=new_column, inplace=True)
- test_df.rename(columns=new_column, inplace=True)
+ # Extract the features
+ features = train_data.columns[rfecv.support_].tolist()
+
+ self._display_msg(msg="feature selected by RFE:",
+ col_lst=features,
+ progress_bar=self.progress_bar)
+ features.append(self.target_column)
+ features.insert(0,'id')
+
+ selected_rfe_df = data[features]
+
+ # storing the rfe selected features in data transformation dictionary
+ self.data_transform_dict['rfe_features'] = features
+
+ columns_to_rename = [col for col in selected_rfe_df.columns if col not in ['id', self.target_column]]
+ new_column = {col: f'r_{col}' for col in columns_to_rename}
+ self.excluded_columns.extend([new_column[key] for key in self.excluded_columns if key in new_column])
+
+ selected_rfe_df.rename(columns=new_column, inplace=True)
 
  # storing the rename column list in data transformation dictionary
  self.data_transform_dict['rfe_rename_column'] = columns_to_rename
@@ -664,29 +618,24 @@ class _DataPreparation:
  self._display_msg(msg="Total time taken by feature selection: {:.2f} sec ".format( end_time - start_time),
  progress_bar=self.progress_bar,
  show_data=True)
- self.rfe_feature = train_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()
+ self.rfe_feature = selected_rfe_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()
 
  # Pushing data into database
- self.copy_dataframe_to_sql(train_df, test_df, 'rfe')
+ self.copy_dataframe_to_sql(selected_rfe_df, 'rfe', self.persist)
 
  def _feature_selection_Lasso(self,
- train=None,
- test=None):
+ data=None):
  """
  DESCRIPTION:
  Function performs Lasso Regression for feature selection.
  It helps in identifing and retaining the most important features while setting less important ones to zero.
 
  PARAMETERS:
- train:
+ data:
  Required Argument.
  Specifies the input train pandas DataFrame.
  Types: pandas Dataframe
 
- test:
- Required Argument.
- Specifies the input test pandas DataFrame.
- Types: pandas Dataframe
  """
  start_time = time.time()
  self._display_msg(msg="\nFeature selection using lasso ...",
@@ -700,39 +649,43 @@ class _DataPreparation:
  from sklearn.model_selection import StratifiedKFold
 
  # Getting the value k in k-fold cross-validation
- num_folds = self._num_of_folds(train.shape[0])
+ num_folds = self._num_of_folds(data.shape[0])
 
- # Prepare the training data
- train_features = train.drop(columns=['id',self.target_column], axis=1)
- train_target = train[self.target_column]
+ # Prepare data
+ train_features = data.drop(columns=['id',self.target_column], axis=1)
+ train_target = data[self.target_column]
 
- # Determine the estimator and parameters based on the type of problem
- if self.is_classification_type():
- if self.data.drop_duplicate(self.target_column).size == 2:
- scoring_metric = 'roc_auc'
+ # Suppressing warnings generated by pandas and sklearn
+ with warnings.catch_warnings():
+ warnings.filterwarnings('ignore')
+
+ # Determine the estimator and parameters based on the type of problem
+ if self.is_classification_type():
+ if self.data.drop_duplicate(self.target_column).size == 2:
+ scoring_metric = 'roc_auc'
+ else:
+ scoring_metric = 'f1_macro'
+ estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=42)
+ parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
  else:
- scoring_metric = 'f1_macro'
- estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=42)
- parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
- else:
- estimator = Lasso(random_state=42)
- parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
- scoring_metric = "r2"
+ estimator = Lasso(random_state=42)
+ parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
+ scoring_metric = "r2"
 
- if self.is_classification_type():
- cv = StratifiedKFold(n_splits=5, shuffle=False)
- else:
- cv = num_folds
+ if self.is_classification_type():
+ cv = StratifiedKFold(n_splits=5, shuffle=False)
+ else:
+ cv = num_folds
 
- # Applying hyperparameter tuning and optimizing score
- hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
- scoring=scoring_metric, verbose=0)
+ # Applying hyperparameter tuning and optimizing score
+ hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
+ scoring=scoring_metric, verbose=0)
 
- # Fitting the best result from hyperparameter
- hyperparameter_search.fit(train_features, train_target)
+ # Fitting the best result from hyperparameter
+ hyperparameter_search.fit(train_features, train_target)
 
- # Extracting the important estimators
- feature_importance = np.abs(hyperparameter_search.best_estimator_.coef_)
+ # Extracting the important estimators
+ feature_importance = np.abs(hyperparameter_search.best_estimator_.coef_)
 
  # Extracting feature using estimators whose importance > 0
  if self.is_classification_type():
@@ -747,8 +700,7 @@ class _DataPreparation:
  progress_bar=self.progress_bar)
 
  important_features = ['id'] + important_features + [self.target_column]
- train_df = train[important_features]
- test_df = test[important_features]
+ selected_lasso_df = data[important_features]
 
  # Storing the lasso selected features in data transformation dictionary
  self.data_transform_dict['lasso_features'] = important_features
@@ -758,65 +710,62 @@ class _DataPreparation:
  self._display_msg(msg="Total time taken by feature selection: {:.2f} sec ".format( end_time - start_time),
  progress_bar=self.progress_bar,
  show_data=True)
- self.lasso_feature = train_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()
+ self.lasso_feature = selected_lasso_df.drop(columns=['id',self.target_column], axis=1).columns.tolist()
 
- self.copy_dataframe_to_sql(train_df, test_df, 'lasso')
+ self.copy_dataframe_to_sql(selected_lasso_df, 'lasso', self.persist)
 
  def copy_dataframe_to_sql(self,
- train,
- test,
- prefix):
+ data,
+ prefix,
+ persist):
  """
  DESCRIPTION:
  Function to copy dataframe to SQL with generated table name.
 
  PARAMETERS:
- train:
- Required Argument.
- Specifies the input train pandas DataFrame.
- Types: pandas Dataframe
-
- test:
+ data:
  Required Argument.
- Specifies the input test pandas DataFrame.
+ Specifies the input pandas DataFrame.
  Types: pandas Dataframe
 
  prefix:
  Required Argument.
  Specifies the prefix for the table name.
  Types: str
+
+ persist:
+ Required Argument.
+ Specifies whether to persist the results of the
+ function in a table or not. When set to True,
+ results are persisted in a table; otherwise,
+ results are garbage collected at the end of the
+ session.
+ Types: bool
  """
  # Generating table names
  train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
- table_type = TeradataConstants.TERADATA_TABLE)
- test_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_test'.format(prefix),
- table_type = TeradataConstants.TERADATA_TABLE)
-
+ table_type = TeradataConstants.TERADATA_TABLE,
+ gc_on_quit=not persist)
  # Storing the table names in the table name mapping dictionary
  self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
- self.table_name_mapping['{}_test'.format(prefix)] = test_table_name
 
  # Pushing data into database
  if self.is_classification_type():
- copy_to_sql(df=train, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
- copy_to_sql(df=test, table_name=test_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+ copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
  else:
- copy_to_sql(df=train, table_name=train_table_name, if_exists="replace")
- copy_to_sql(df=test, table_name=test_table_name, if_exists="replace")
-
-
+ copy_to_sql(df=data, table_name=train_table_name, if_exists="replace")
 
  def _scaling_features_helper(self,
- train=None,
- feature_selection_mtd=None):
+ data=None,
+ feature_selection_mtd=None):
  """
  DESCRIPTION:
  This function selects the features on which feature scaling should be applied.
 
  PARAMETERS:
- train:
+ data:
  Required Argument.
- Specifies the training data.
+ Specifies the data on which feature scaling will be applied.
  Types: teradataml Dataframe
 
  feature_selection_mtd:
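Note on the hunk above: copy_dataframe_to_sql now writes one DataFrame per feature-selection stage and lets `gc_on_quit=not persist` decide whether the generated table outlives the session. A hedged sketch of the calling pattern, with placeholder table naming and stand-in callables (the real code uses UtilFuncs._generate_temp_table_name and copy_to_sql):

```python
import uuid

# Illustration only: one table per stage, dropped at session end unless persist=True.
def copy_stage_to_sql(df, prefix, persist, write_table, schedule_drop):
    table_name = f"{prefix}_train_{uuid.uuid4().hex[:8]}"  # placeholder for the temp-name helper
    write_table(df, table_name)          # stands in for copy_to_sql(..., if_exists="replace")
    if not persist:
        schedule_drop(table_name)        # stands in for gc_on_quit=True registration
    return table_name
```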
@@ -831,10 +780,11 @@ class _DataPreparation:
  columns_to_scale = []
 
  # Iterating over the columns
- for col in train.columns:
+ for col in data.columns:
  # Selecting columns that will be scaled
  # Exculding target_col and columns with single value
- if col not in ['id', self.target_column] and train.drop_duplicate(col).size > 1:
+ if col not in ['id', self.target_column] and \
+ data.drop_duplicate(col).size > 1:
  columns_to_scale.append(col)
 
  if feature_selection_mtd == "lasso":
@@ -848,7 +798,7 @@ class _DataPreparation:
  return columns_to_scale
 
  def _scaling_features(self,
- feature_selection_mtd=None):
+ feature_selection_mtd=None):
  """
  DESCRIPTION:
  Function performs feature scaling on columns present inside the dataset
@@ -858,7 +808,7 @@ class _DataPreparation:
  feature_selection_mtd:
  Required Argument.
  Specifies the feature selection algorithm used.
- Types: str
+ Types: str
  """
 
  self._display_msg(msg="\nscaling Features of {} data ...".format(feature_selection_mtd),
@@ -866,8 +816,7 @@ class _DataPreparation:
  show_data=True)
 
  start_time = time.time()
- train = None
- test = None
+ data_to_scale = None
 
  if self.is_classification_type():
  scale_method = self._scale_method_cls
@@ -876,17 +825,18 @@ class _DataPreparation:
 
  # Loading data for feature scaling based of feature selection method
  if feature_selection_mtd == 'rfe':
- train = DataFrame(self.table_name_mapping['rfe_train'])
- test = DataFrame(self.table_name_mapping['rfe_test'])
+ data_to_scale = DataFrame(self.table_name_mapping['rfe_train'])
  elif feature_selection_mtd == 'lasso':
- train = DataFrame(self.table_name_mapping['lasso_train'])
- test = DataFrame(self.table_name_mapping['lasso_test'])
+ data_to_scale = DataFrame(self.table_name_mapping['lasso_train'])
  else:
- train = self.train_df
- test = self.test_df
+ data_to_scale = self.data
+
+ # Setting volatile and persist parameters for ScaleFit and ScaleTransform functions
+ volatile, persist = self._set_generic_parameters(func_indicator='FeatureScalingIndicator',
+ param_name='FeatureScalingParam')
 
  # List of columns that will be scaled
- scale_col= self._scaling_features_helper(train, feature_selection_mtd)
+ scale_col= self._scaling_features_helper(data_to_scale, feature_selection_mtd)
 
  if len(scale_col) != 0:
  self._display_msg(msg="columns that will be scaled: ",
@@ -894,41 +844,33 @@ class _DataPreparation:
  progress_bar=self.progress_bar)
 
  # Scale Fit
- fit_obj = ScaleFit(data=train,
+ fit_obj = ScaleFit(data=data_to_scale,
  target_columns=scale_col,
- scale_method=scale_method)
+ scale_method=scale_method,
+ volatile=volatile,
+ persist=persist)
 
  # storing the scale fit object and columns in data transformation dictionary
- self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj
+ self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj.output
  self.data_transform_dict['{}_scale_col'.format(feature_selection_mtd)] = scale_col
 
  # List of columns to copy to the output generated by scale transform
- accumulate_cols = list(set(train.columns) - set(scale_col))
-
- # Scaling on training dataset
- tr_obj = ScaleTransform(data=train,
- object=fit_obj,
- accumulate=accumulate_cols)
+ accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
 
- # Scaling on testing dataset
- ts_obj = ScaleTransform(data=test,
- object=fit_obj,
- accumulate=accumulate_cols)
+ # Scaling dataset
+ transform_obj = ScaleTransform(data=data_to_scale,
+ object=fit_obj,
+ accumulate=accumulate_cols)
+ scaled_df = transform_obj.result
 
- train = tr_obj.result
- test = ts_obj.result
-
- self._display_msg(msg="Training dataset sample after scaling:",
- data=train,
- progress_bar=self.progress_bar)
- self._display_msg(msg="Testing dataset sample after scaling:",
- data=test,
+ self._display_msg(msg="Dataset sample after scaling:",
+ data=scaled_df,
  progress_bar=self.progress_bar)
  else:
  self._display_msg(msg="No columns to scale.",
  progress_bar=self.progress_bar)
 
- self.copy_dataframe_to_sql(train, test, feature_selection_mtd)
+ self.copy_dataframe_to_sql(scaled_df, feature_selection_mtd, persist)
 
  end_time = time.time()
  self._display_msg(msg="Total time taken by feature scaling: {:.2f} sec".format( end_time - start_time),
@@ -946,43 +888,32 @@ class _DataPreparation:
  # Checking user input for feature scaling
  if feature_scaling_input:
  # Extracting scaling method
- custom_scaling_method = self.custom_data.get("FeatureScalingMethod", None)
- if custom_scaling_method is None:
- self._display_msg(inline_msg="No information provided for customized scaling method. AutoML will continue with default option.",
- progress_bar=self.progress_bar)
- else:
- if self.is_classification_type():
- self._scale_method_cls = custom_scaling_method
+ custom_scaling_params = self.custom_data.get("FeatureScalingParam", None)
+ if custom_scaling_params:
+ custom_scaling_method = custom_scaling_params.get("FeatureScalingMethod", None)
+ if custom_scaling_method is None:
+ self._display_msg(inline_msg="No information provided for customized scaling method. AutoML will continue with default option.",
+ progress_bar=self.progress_bar)
  else:
- self._scale_method_reg = custom_scaling_method
+ if self.is_classification_type():
+ self._scale_method_cls = custom_scaling_method
+ else:
+ self._scale_method_reg = custom_scaling_method
  else:
  self._display_msg(inline_msg="No information provided for performing customized feature scaling. Proceeding with default option.",
  progress_bar=self.progress_bar)
 
 
- def _handle_generated_features(self,
- label = None):
+ def _handle_generated_features(self):
  """
  DESCRIPTION:
  Function to handle newly generated float features. It will round them upto 4 digit after decimal point.
-
- PARAMETERS:
- label:
- Optional Argument.
- Specifies label for dataset on which rounding up is getting done i.e., 'train' for training
- and 'test' for testing dataset.
- By Default, it takes None and transformation is getting applied to whole dataset.
- Types: str
-
+
+ RETURNS:
+ Pandas DataFrame containing, rounded up float columns.
  """
- # Checking for label and accordingly deciding target dataset.
- if label == 'train':
- target_df = self.train_df
- elif label == 'test':
- target_df = self.test_df
- else:
- target_df=self.data
-
+ # Assigning data to target dataframe
+ target_df = self.data
  # Detecting list of float columns on target dataset
  float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]
 
@@ -1001,10 +932,19 @@ class _DataPreparation:
  "precision_digit" : 4,
  "accumulate" : accumulate_columns,
  "persist" : True}
+
+ # Disabling print if persist is True by default
+ if not self.volatile and not self.persist:
+ fit_params["display_table_name"] = False
+
+ if self.volatile:
+ fit_params["volatile"] = True
+ fit_params["persist"] = False
 
  transform_output = RoundColumns(**fit_params).result
- # Adding transformed data containing table to garbage collector
- GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+ if not self.volatile and not self.persist:
+ # Adding transformed data containing table to garbage collector
+ GarbageCollector._add_to_garbagecollector(transform_output._table_name)
  cols = transform_output.columns
  df = transform_output.to_pandas().reset_index()
  df = df[cols]
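Taken together, every interim step in this file now follows the same placement rule: in the hunks above, volatile takes precedence over persist, and only the default case (neither set) is registered with the GarbageCollector. A compact, hedged summary of that rule:

```python
def interim_table_plan(volatile: bool, persist: bool) -> str:
    """Summarises where AutoML data-prep interim results land after this change (sketch)."""
    if volatile:
        return "volatile table (dropped automatically when the session ends)"
    if persist:
        return "permanent table (kept after the AutoML run)"
    return "permanent table registered with GarbageCollector (dropped at session end)"
```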