teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml has been flagged as possibly problematic.

Files changed (263)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +183 -0
  4. teradataml/__init__.py +6 -3
  5. teradataml/_version.py +2 -2
  6. teradataml/analytics/__init__.py +3 -2
  7. teradataml/analytics/analytic_function_executor.py +275 -40
  8. teradataml/analytics/analytic_query_generator.py +92 -0
  9. teradataml/analytics/byom/__init__.py +3 -2
  10. teradataml/analytics/json_parser/metadata.py +1 -0
  11. teradataml/analytics/json_parser/utils.py +17 -21
  12. teradataml/analytics/meta_class.py +40 -1
  13. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  14. teradataml/analytics/sqle/__init__.py +10 -2
  15. teradataml/analytics/table_operator/__init__.py +3 -2
  16. teradataml/analytics/uaf/__init__.py +21 -2
  17. teradataml/analytics/utils.py +62 -1
  18. teradataml/analytics/valib.py +1 -1
  19. teradataml/automl/__init__.py +1553 -319
  20. teradataml/automl/custom_json_utils.py +139 -61
  21. teradataml/automl/data_preparation.py +276 -319
  22. teradataml/automl/data_transformation.py +163 -81
  23. teradataml/automl/feature_engineering.py +402 -239
  24. teradataml/automl/feature_exploration.py +9 -2
  25. teradataml/automl/model_evaluation.py +48 -51
  26. teradataml/automl/model_training.py +291 -189
  27. teradataml/catalog/byom.py +8 -8
  28. teradataml/catalog/model_cataloging_utils.py +1 -1
  29. teradataml/clients/auth_client.py +133 -0
  30. teradataml/clients/pkce_client.py +1 -1
  31. teradataml/common/aed_utils.py +3 -2
  32. teradataml/common/constants.py +48 -6
  33. teradataml/common/deprecations.py +13 -7
  34. teradataml/common/garbagecollector.py +156 -120
  35. teradataml/common/messagecodes.py +6 -1
  36. teradataml/common/messages.py +3 -1
  37. teradataml/common/sqlbundle.py +1 -1
  38. teradataml/common/utils.py +103 -11
  39. teradataml/common/wrapper_utils.py +1 -1
  40. teradataml/context/context.py +121 -31
  41. teradataml/data/advertising.csv +201 -0
  42. teradataml/data/bank_marketing.csv +11163 -0
  43. teradataml/data/bike_sharing.csv +732 -0
  44. teradataml/data/boston2cols.csv +721 -0
  45. teradataml/data/breast_cancer.csv +570 -0
  46. teradataml/data/complaints_test_tokenized.csv +353 -0
  47. teradataml/data/complaints_tokens_model.csv +348 -0
  48. teradataml/data/covid_confirm_sd.csv +83 -0
  49. teradataml/data/customer_segmentation_test.csv +2628 -0
  50. teradataml/data/customer_segmentation_train.csv +8069 -0
  51. teradataml/data/dataframe_example.json +10 -0
  52. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
  53. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
  54. teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
  55. teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
  56. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  57. teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
  58. teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
  59. teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
  60. teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
  61. teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
  62. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  63. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
  64. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
  65. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
  66. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  67. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  68. teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
  69. teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
  70. teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
  71. teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
  72. teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
  73. teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
  74. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  75. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  76. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  77. teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
  78. teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
  79. teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
  80. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  81. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  82. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  83. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  84. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  85. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  86. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  87. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  88. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  89. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  90. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  91. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  92. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  93. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  94. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  95. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  96. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  97. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  98. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  99. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  100. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  101. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  102. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  103. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  104. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  105. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  106. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  107. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  108. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  109. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  110. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  111. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  112. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  113. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  114. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  115. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  116. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  117. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  118. teradataml/data/dwt2d_dataTable.csv +65 -0
  119. teradataml/data/dwt_dataTable.csv +8 -0
  120. teradataml/data/dwt_filterTable.csv +3 -0
  121. teradataml/data/finance_data4.csv +13 -0
  122. teradataml/data/glm_example.json +28 -1
  123. teradataml/data/grocery_transaction.csv +19 -0
  124. teradataml/data/housing_train_segment.csv +201 -0
  125. teradataml/data/idwt2d_dataTable.csv +5 -0
  126. teradataml/data/idwt_dataTable.csv +8 -0
  127. teradataml/data/idwt_filterTable.csv +3 -0
  128. teradataml/data/insect2Cols.csv +61 -0
  129. teradataml/data/interval_data.csv +5 -0
  130. teradataml/data/jsons/paired_functions.json +14 -0
  131. teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
  132. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  133. teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
  134. teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
  135. teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
  136. teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
  137. teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
  138. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  139. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  140. teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
  141. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  142. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  143. teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
  144. teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
  145. teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
  146. teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
  147. teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
  148. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  149. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  150. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  151. teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
  152. teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
  153. teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
  154. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  155. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  156. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  157. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  158. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  159. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  160. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  161. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  162. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  163. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  164. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  165. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  166. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  167. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  168. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  169. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  170. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  171. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  172. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  173. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  174. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  175. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  176. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  177. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  178. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  179. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  180. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  181. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  182. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  183. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  184. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  185. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  186. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  187. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  188. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  189. teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
  190. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  191. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  192. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  193. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  194. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  195. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
  196. teradataml/data/kmeans_example.json +5 -0
  197. teradataml/data/kmeans_table.csv +10 -0
  198. teradataml/data/load_example_data.py +8 -2
  199. teradataml/data/naivebayestextclassifier_example.json +1 -1
  200. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  201. teradataml/data/onehot_encoder_train.csv +4 -0
  202. teradataml/data/openml_example.json +29 -0
  203. teradataml/data/peppers.png +0 -0
  204. teradataml/data/real_values.csv +14 -0
  205. teradataml/data/sax_example.json +8 -0
  206. teradataml/data/scale_attributes.csv +3 -0
  207. teradataml/data/scale_example.json +52 -1
  208. teradataml/data/scale_input_part_sparse.csv +31 -0
  209. teradataml/data/scale_input_partitioned.csv +16 -0
  210. teradataml/data/scale_input_sparse.csv +11 -0
  211. teradataml/data/scale_parameters.csv +3 -0
  212. teradataml/data/scripts/deploy_script.py +21 -2
  213. teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
  214. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
  215. teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
  216. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
  217. teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
  218. teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
  219. teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
  220. teradataml/data/star_pivot.csv +8 -0
  221. teradataml/data/templates/open_source_ml.json +2 -1
  222. teradataml/data/teradataml_example.json +97 -1
  223. teradataml/data/timestamp_data.csv +4 -0
  224. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  225. teradataml/data/uaf_example.json +55 -1
  226. teradataml/data/unpivot_example.json +15 -0
  227. teradataml/data/url_data.csv +9 -0
  228. teradataml/data/windowdfft.csv +16 -0
  229. teradataml/data/ztest_example.json +16 -0
  230. teradataml/dataframe/copy_to.py +9 -4
  231. teradataml/dataframe/data_transfer.py +125 -64
  232. teradataml/dataframe/dataframe.py +575 -57
  233. teradataml/dataframe/dataframe_utils.py +47 -9
  234. teradataml/dataframe/fastload.py +273 -90
  235. teradataml/dataframe/functions.py +339 -0
  236. teradataml/dataframe/row.py +160 -0
  237. teradataml/dataframe/setop.py +2 -2
  238. teradataml/dataframe/sql.py +740 -18
  239. teradataml/dataframe/window.py +1 -1
  240. teradataml/dbutils/dbutils.py +324 -18
  241. teradataml/geospatial/geodataframe.py +1 -1
  242. teradataml/geospatial/geodataframecolumn.py +1 -1
  243. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  244. teradataml/lib/aed_0_1.dll +0 -0
  245. teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
  246. teradataml/options/__init__.py +16 -5
  247. teradataml/options/configure.py +39 -6
  248. teradataml/options/display.py +2 -2
  249. teradataml/plot/axis.py +4 -4
  250. teradataml/scriptmgmt/UserEnv.py +26 -19
  251. teradataml/scriptmgmt/lls_utils.py +120 -16
  252. teradataml/table_operators/Script.py +4 -5
  253. teradataml/table_operators/TableOperator.py +160 -26
  254. teradataml/table_operators/table_operator_util.py +88 -41
  255. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  256. teradataml/telemetry_utils/__init__.py +0 -0
  257. teradataml/telemetry_utils/queryband.py +52 -0
  258. teradataml/utils/validators.py +41 -3
  259. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
  260. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
  261. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
  262. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
  263. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
@@ -24,7 +24,7 @@ from teradataml.dataframe.dataframe import DataFrame
24
24
  from teradataml.dataframe.copy_to import copy_to_sql
25
25
  from teradataml import Antiselect
26
26
  from teradataml import BincodeFit, BincodeTransform
27
- from teradataml import ColumnSummary, CategoricalSummary, GetFutileColumns, FillRowId
27
+ from teradataml import CategoricalSummary, ColumnSummary, ConvertTo, GetFutileColumns, FillRowId
28
28
  from teradataml import Fit, Transform
29
29
  from teradataml import NonLinearCombineFit, NonLinearCombineTransform
30
30
  from teradataml import NumApply
@@ -36,6 +36,8 @@ from teradataml import TargetEncodingFit, TargetEncodingTransform
36
36
  from sqlalchemy import literal_column
37
37
  from teradatasqlalchemy import INTEGER
38
38
  from teradataml import display
39
+ from teradataml.common.garbagecollector import GarbageCollector
40
+ from teradataml.dataframe.sql_functions import case
39
41
  from teradataml.hyperparameter_tuner.utils import _ProgressBar
40
42
  from teradataml.utils.validators import _Validators
41
43
 
@@ -48,7 +50,8 @@ class _FeatureEngineering:
48
50
  model_list,
49
51
  verbose = 0,
50
52
  task_type = "Regression",
51
- custom_data = None):
53
+ custom_data = None,
54
+ **kwargs):
52
55
  """
53
56
  DESCRIPTION:
54
57
  Function initializes the data, target column and columns datatypes
@@ -61,12 +64,12 @@ class _FeatureEngineering:
61
64
  Types: teradataml Dataframe
62
65
 
63
66
  target_column:
64
- Required Arugment.
67
+ Required Argument.
65
68
  Specifies the name of the target column in "data"..
66
69
  Types: str
67
70
 
68
71
  model_list:
69
- Required Arugment.
72
+ Required Argument.
70
73
  Specifies the list of models to be used for model training.
71
74
  Types: list
72
75
 
@@ -81,7 +84,7 @@ class _FeatureEngineering:
81
84
  Types: int
82
85
 
83
86
  task_type:
84
- Required Arugment.
87
+ Required Argument.
85
88
  Specifies the task type for AutoML, whether to apply regresion OR classification
86
89
  on the provived dataset.
87
90
  Default Value: "Regression"
@@ -89,9 +92,31 @@ class _FeatureEngineering:
89
92
  Types: str
90
93
 
91
94
  custom_data:
92
- Optional Arugment.
95
+ Optional Argument.
93
96
  Specifies json object containing user customized input.
94
97
  Types: json object
98
+
99
+ **kwargs:
100
+ Specifies the additional arguments for feature engineering. Below
101
+ are the additional arguments:
102
+ volatile:
103
+ Optional Argument.
104
+ Specifies whether to put the interim results of the
105
+ functions in a volatile table or not. When set to
106
+ True, results are stored in a volatile table,
107
+ otherwise not.
108
+ Default Value: False
109
+ Types: bool
110
+
111
+ persist:
112
+ Optional Argument.
113
+ Specifies whether to persist the interim results of the
114
+ functions in a table or not. When set to True,
115
+ results are persisted in a table; otherwise,
116
+ results are garbage collected at the end of the
117
+ session.
118
+ Default Value: False
119
+ Types: bool
95
120
  """
96
121
  # Instance variables
97
122
  self.data = data
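The hunk above documents the new volatile/persist keyword arguments that _FeatureEngineering now accepts and forwards to the in-database functions. A minimal illustrative sketch of the documented semantics, assuming an existing Vantage connection; the host credentials, table name, and column names below are placeholders and not part of this diff:

    from teradataml import create_context, DataFrame, ColumnSummary

    create_context(host="<host>", username="<user>", password="<pass>")  # placeholder credentials
    df = DataFrame("my_input_table")                                     # hypothetical table name

    # volatile=True keeps interim results in a session-scoped volatile table;
    # persist=True keeps them in a permanent table instead of letting them be
    # garbage collected at the end of the session (both default to False).
    summary = ColumnSummary(data=df,
                            target_columns=df.columns,
                            volatile=False,
                            persist=True)
    print(summary.result)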
@@ -106,6 +131,8 @@ class _FeatureEngineering:
106
131
  self.data_transform_dict = {}
107
132
  self.one_hot_obj_count = 0
108
133
  self.is_classification_type = lambda: self.task_type.upper() == 'CLASSIFICATION'
134
+ self.volatile = kwargs.get('volatile', False)
135
+ self.persist = kwargs.get('persist', False)
109
136
 
110
137
  # Method for doing feature engineering on data -> adding id, removing futile col, imputation, encoding(one hot)
111
138
  def feature_engineering(self,
@@ -120,7 +147,7 @@ class _FeatureEngineering:
120
147
 
121
148
  PARAMETERS:
122
149
  auto:
123
- Optional Arugment.
150
+ Optional Argument.
124
151
  Specifies whether to run AutoML in custom mode or auto mode.
125
152
  When set to False, runs in custom mode. Otherwise, by default runs in auto mode.
126
153
  Default Value: True
@@ -131,7 +158,7 @@ class _FeatureEngineering:
131
158
  second element represents list of columns which are not participating in outlier tranformation.
132
159
  """
133
160
  # Assigning number of base jobs for progress bar.
134
- base_jobs = 14 if auto else 18
161
+ base_jobs = 13 if auto else 17
135
162
 
136
163
  # Updating model list based on distinct value of target column for classification type
137
164
  if self.is_classification_type():
@@ -181,9 +208,12 @@ class _FeatureEngineering:
181
208
  self._remove_duplicate_rows()
182
209
  self.progress_bar.update()
183
210
 
211
+ self._anti_select_columns()
212
+ self.progress_bar.update()
213
+
184
214
  self._remove_futile_columns()
185
215
  self.progress_bar.update()
186
-
216
+
187
217
  self._handle_date_columns()
188
218
  self.progress_bar.update()
189
219
 
@@ -204,10 +234,7 @@ class _FeatureEngineering:
204
234
 
205
235
  self._non_linear_transformation()
206
236
  self.progress_bar.update()
207
-
208
- self._anti_select_columns()
209
- self.progress_bar.update()
210
-
237
+
211
238
  return self.data, self.excluded_cols, self.target_label, self.data_transform_dict
212
239
 
213
240
  def _extract_list(self,
@@ -255,7 +282,7 @@ class _FeatureEngineering:
255
282
  f"Remaining Columns in the data: {self.data.shape[1]}",
256
283
  progress_bar=self.progress_bar)
257
284
  else:
258
- self._display_msg(inline_msg="Analysis complete. No action taken.",
285
+ self._display_msg(inline_msg="Analysis completed. No action taken.",
259
286
  progress_bar=self.progress_bar)
260
287
 
261
288
  end_time = time.time()
@@ -322,18 +349,22 @@ class _FeatureEngineering:
322
349
  if len(categorical_columns) != 0:
323
350
 
324
351
  obj = CategoricalSummary(data=self.data,
325
- target_columns=categorical_columns)
352
+ target_columns=categorical_columns,
353
+ volatile=self.volatile,
354
+ persist=self.persist)
326
355
 
327
356
  gfc_out = GetFutileColumns(data=self.data,
328
357
  object=obj,
329
358
  category_summary_column="ColumnName",
330
- threshold_value =0.7)
359
+ threshold_value =0.7,
360
+ volatile=self.volatile,
361
+ persist=self.persist)
331
362
 
332
363
  # Extracting Futile columns
333
364
  f_cols = [row[0] for row in gfc_out.result.itertuples()]
334
365
 
335
366
  if len(f_cols) == 0:
336
- self._display_msg(inline_msg="All categorical columns seem to be significant.",
367
+ self._display_msg(inline_msg="Analysis indicates all categorical columns are significant. No action Needed.",
337
368
  progress_bar=self.progress_bar)
338
369
  else:
339
370
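For reference, the futile-column check in the hunk above chains CategoricalSummary into GetFutileColumns and then drops the flagged columns. A standalone sketch of that flow, assuming df is an existing teradataml DataFrame and cat_cols is a list of its categorical columns (both hypothetical):

    from teradataml import CategoricalSummary, GetFutileColumns

    cat_summary = CategoricalSummary(data=df,
                                     target_columns=cat_cols,
                                     volatile=True)                 # keep the summary in a volatile table
    futile = GetFutileColumns(data=df,
                              object=cat_summary,
                              category_summary_column="ColumnName",
                              threshold_value=0.7,                  # same threshold the diff uses
                              volatile=True)
    futile_cols = [row[0] for row in futile.result.itertuples()]    # column names flagged as futile
    if futile_cols:
        df = df.drop(futile_cols, axis=1)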
 
@@ -350,128 +381,80 @@ class _FeatureEngineering:
350
381
  self._display_msg(msg="Total time to handle less significant features: {:.2f} sec ".format( end_time - start_time),
351
382
  progress_bar=self.progress_bar,
352
383
  show_data=True)
353
-
354
- def _handle_date_component(self,
355
- date_component_columns,
356
- date_component):
357
384
 
385
+ def _fetch_date_component(self):
358
386
  """
359
387
  DESCRIPTION:
360
- Function to handle newly generated date components, i.e., day , month and year diff.
361
- Based on their distinct values, binning is done with predefined prefix.
362
- Binned component is used further as categorical features.
363
-
364
- PARAMETERS:
365
- date_component_columns:
366
- Required Argument.
367
- Specifies the list of newly generated differnt component of date features.
368
- Types: list
369
-
370
- date_component:
371
- Required Argument.
372
- Specifies identifier for the differnt component of date features, i.e., D - Days , M - Months and Y - Year diffs.
373
- Types: str
374
-
375
- """
376
- # Check for day
377
- if date_component == "D":
378
- prefix_value = "Day_"
379
- # Check for month
380
- elif date_component == "M":
381
- prefix_value = "Month_"
382
- # Check for year diff
383
- elif date_component == "Y":
384
- prefix_value = "Year_diff_"
385
-
386
- # Deciding bins based on distinct value of date component features.
387
- for col in date_component_columns:
388
- data_size = self.data.drop_duplicate(col).size
389
- if data_size < 4:
390
- num_bins = data_size
391
- else:
392
- num_bins = 4
393
- # Performing bincode for converting date component to specific labels
394
- fit_params = {
395
- "data": self.data,
396
- "target_columns": col,
397
- "method_type":"Equal-Width",
398
- "nbins": num_bins,
399
- "label_prefix" : prefix_value
400
- }
401
- bin_code_fit = BincodeFit(**fit_params)
402
-
403
- fit_params_map = {"D": "day_component_fit_object",
404
- "M": "month_component_fit_object",
405
- "Y": "year_diff_component_fit_object"}
406
-
407
- # Storing fit object for each date component in data transform dictionary
408
- self.data_transform_dict[fit_params_map[date_component]][col] = bin_code_fit.output
409
-
410
- accumulate_columns = self._extract_list(self.data.columns, [col])
411
- transform_params = {
412
- "data": self.data,
413
- "object": bin_code_fit.output,
414
- "accumulate": accumulate_columns,
415
- "persist": True
416
- }
417
- self.data = BincodeTransform(**transform_params).result
418
-
419
- def _fetch_date_component(self,
420
- process,
421
- regex_str,
422
- columns,
423
- date_component):
388
+ Function to fetch day of week, week of month, month of quarter, quarter of year
389
+ component from date column. Generate weekend and month half details from day of week and
390
+ week of month columns respectively. Convert quarter of year and month of quarter
391
+ component columns to VARCHAR.
424
392
 
393
+ RETURNS:
394
+ List of newly generated date component features.
425
395
  """
426
- DESCRIPTION:
427
- Function to fetch newly generated date component features.
428
- Passing ahead for performing binning.
429
-
430
- PARAMETERS:
431
- process:
432
- Required Argument.
433
- Specifies date component of date feature which is going to be fetched and handled.
434
- Types: str
435
-
436
- regex_str:
437
- Required Argument.
438
- Specifies regular expression for identifying newly generated date component features.
439
- Types: str
440
-
441
- columns:
442
- Required Argument.
443
- Specifies list of newly generated date component features.
444
- Types: list
445
-
446
- date_component:
447
- Required Argument.
448
- Specifies identifier for the differnt component of date features, i.e., D - Days , M - Months and Y - Year diffs.
449
- Types: str
396
+ # List for storing newly generated date component features
397
+ new_date_components=[]
398
+ # Extracting weekend, month, quarter details information from date columns
399
+ date_component_param={}
400
+ for col in self.date_column_list:
401
+ # Generating new column names for extracted date components
402
+ weekend_col = f'{col}_weekend'
403
+ month_half_col = f'{col}_month_half'
404
+ month_of_quarter_col=f'{col}_month_of_quarter'
405
+ quarter_of_year_col=f'{col}_quarter_of_year'
450
406
 
451
- """
452
- date_component_columns = [col for col in columns if re.search(regex_str+"$", col)]
453
- if len(date_component_columns) != 0:
454
- self._handle_date_component(date_component_columns,date_component)
455
- self._display_msg(msg="Useful {} features:".format(process),
456
- col_lst=date_component_columns,
457
- progress_bar=self.progress_bar)
458
- self._display_msg(msg="Updated dataset sample:",
459
- data=self.data,
460
- progress_bar=self.progress_bar)
461
-
462
- else:
463
- self._display_msg("\nNo useful feature found for {} component:".format(process),
464
- progress_bar=self.progress_bar)
407
+ date_component_param = {
408
+ **date_component_param,
409
+ weekend_col: case([(self.data[col].day_of_week().isin([1, 7]), 'yes')], else_='no'),
410
+ month_half_col: case([(self.data[col].week_of_month().isin([1, 2]), 'first_half')], else_='second_half'),
411
+ month_of_quarter_col: self.data[col].month_of_quarter(),
412
+ quarter_of_year_col: self.data[col].quarter_of_year()
413
+ }
414
+ # Storing newly generated date component month and quarter columns.
415
+ # Skipping day of week and week of month columns as they will be used
416
+ # later for extracting weekend and month part details.
417
+ new_date_components.extend([weekend_col, month_half_col, month_of_quarter_col, quarter_of_year_col])
418
+ # Adding new date component columns to dataset
419
+ self.data=self.data.assign(**date_component_param)
420
+ # Dropping date columns as different component columns are extracted.
421
+ self.data = self.data.drop(self.date_column_list, axis=1)
422
+
423
+ # Converting remaining component columns to VARCHAR
424
+ # So that it will be treated as categorical columns
425
+ remaining_component_columns = [col for col in self.data.columns if re.search('month_of_quarter|quarter_of_year'+"$", col)]
426
+ accumulate_columns = self._extract_list(self.data.columns, remaining_component_columns)
427
+ convertto_params = {
428
+ "data" : self.data,
429
+ "target_columns" : remaining_component_columns,
430
+ "target_datatype" : ["VARCHAR(charlen=20,charset=UNICODE,casespecific=NO)"],
431
+ "accumulate" : accumulate_columns,
432
+ "persist" : True
433
+ }
434
+ # Disabling display table name if persist is True by default
435
+ if not self.volatile and not self.persist:
436
+ convertto_params["display_table_name"] = False
465
437
 
466
- return date_component_columns
438
+ # Setting persist to False if volatile is True
439
+ if self.volatile:
440
+ convertto_params["persist"] = False
441
+ convertto_params["volatile"] = True
442
+
443
+ # returning dataset after performing string manipulation
444
+ self.data = ConvertTo(**convertto_params).result
445
+
446
+ # IF volatile is False and persist is False
447
+ if not self.volatile and not self.persist:
448
+ # Adding transformed data containing table to garbage collector
449
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
450
+ return new_date_components
467
451
 
468
452
  def _handle_date_columns_helper(self):
469
453
 
470
454
  """
471
455
  DESCRIPTION:
472
- Function for dropping irrelevent date features.
473
- Extracting day, month and year component from revelent date features.
474
- Passing extracted component for performing binning.
456
+ Function for dropping irrelevent date features. Perform Extraction of different
457
+ component from revelent date features and transform them.
475
458
  """
476
459
 
477
460
  # Dropping missing value for all date columns
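The rewritten _fetch_date_component above replaces the old EXTRACT-based day/month/year-diff binning with case() expressions over the DataFrame date accessors, then casts the remaining numeric components to VARCHAR so they are treated as categorical. A small sketch of the same pattern; df and the order_date column are illustrative only:

    from teradataml import ConvertTo
    from teradataml.dataframe.sql_functions import case

    col = "order_date"   # hypothetical DATE column
    df = df.assign(**{
        f"{col}_weekend": case([(df[col].day_of_week().isin([1, 7]), 'yes')], else_='no'),
        f"{col}_month_half": case([(df[col].week_of_month().isin([1, 2]), 'first_half')], else_='second_half'),
        f"{col}_month_of_quarter": df[col].month_of_quarter(),
        f"{col}_quarter_of_year": df[col].quarter_of_year(),
    })
    df = df.drop([col], axis=1)   # the raw date column is no longer needed

    # month_of_quarter / quarter_of_year come back numeric; casting to VARCHAR makes
    # downstream encoders treat them as categorical, as the new code does.
    cast_cols = [f"{col}_month_of_quarter", f"{col}_quarter_of_year"]
    df = ConvertTo(data=df,
                   target_columns=cast_cols,
                   target_datatype=["VARCHAR(charlen=20,charset=UNICODE,casespecific=NO)"],
                   accumulate=[c for c in df.columns if c not in cast_cols]).result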
@@ -484,7 +467,7 @@ class _FeatureEngineering:
484
467
  # Date columns list eligible for dropping from dataset
485
468
  drop_date_cols = []
486
469
 
487
- # Checking for single valued date columns
470
+ # Checking for unique valued date columns
488
471
  for col in self.date_column_list:
489
472
  if self.data.drop_duplicate(col).size == self.data.shape[0]:
490
473
  drop_date_cols.append(col)
@@ -496,46 +479,18 @@ class _FeatureEngineering:
496
479
  self._display_msg(msg='Dropping date features with all unique value:',
497
480
  col_lst = drop_date_cols,
498
481
  progress_bar=self.progress_bar)
499
-
500
- # Updated date columns list
501
- self.date_column_list = [item for item in self.date_column_list if item not in drop_date_cols]
502
-
503
- # List for storing newly generated date component features
504
- new_columns=[]
482
+ # Updated date column list after dropping irrelevant date columns
483
+ self.date_column_list = [item for item in self.date_column_list if item not in drop_date_cols]
505
484
 
506
- # Extracting day, month and year difference from date columns
507
485
  if len(self.date_column_list) != 0:
508
486
 
509
- component_param={}
510
- for col in self.date_column_list:
511
-
512
- day_column=str(col)+"_day_comp"
513
- month_column=str(col)+"_month_comp"
514
- year_diff_column=str(col)+"_year_diff_comp"
515
- new_columns.extend([day_column,month_column,year_diff_column])
516
- day_query=("EXTRACT(DAY FROM {0})".format(col))
517
- month_query=("EXTRACT(MONTH FROM {0})".format(col))
518
- year_query=("EXTRACT(YEAR FROM CURRENT_DATE) - EXTRACT(YEAR FROM {0})".format(col))
519
- component_param[day_column]=literal_column(day_query,INTEGER())
520
- component_param[month_column]=literal_column(month_query,INTEGER())
521
- component_param[year_diff_column]=literal_column(year_query,INTEGER())
522
-
523
- self.data=self.data.assign(**component_param)
524
- # Storing newly generated date component list along with parameters in data transform dictionary
525
- self.data_transform_dict['extract_date_comp_col'] = self.date_column_list
526
- self.data_transform_dict['extract_date_comp_param'] = component_param
527
-
528
- # Dropping date columns as we have already extracted day, month and year in new columns
529
- self.data = self.data.drop(self.date_column_list, axis=1)
487
+ # List for storing newly generated date component features
488
+ new_columns=self._fetch_date_component()
530
489
  self._display_msg(msg='List of newly generated features from existing date features:',
531
490
  col_lst=new_columns,
532
491
  progress_bar=self.progress_bar)
533
- self._display_msg(msg='List of newly generated features from existing date features:',
534
- data=self.data,
535
- progress_bar=self.progress_bar)
536
-
492
+ # Dropping columns with all unique values or single value
537
493
  drop_cols=[]
538
-
539
494
  for col in new_columns:
540
495
  distinct_rows = self.data.drop_duplicate(col).size
541
496
  if distinct_rows == self.data.shape[0]:
@@ -555,21 +510,11 @@ class _FeatureEngineering:
555
510
  self.data = self.data.drop(drop_cols, axis=1)
556
511
  # Storing extract date component list for drop in data transform dictionary
557
512
  self.data_transform_dict['drop_extract_date_columns'] = drop_cols
558
-
559
- # Extracting all newly generated columns
560
- new_columns = [item for item in new_columns if item not in drop_cols]
513
+ # Extracting all newly generated columns
514
+ new_columns = [item for item in new_columns if item not in drop_cols]
561
515
 
562
- # Storing each date component transformation fit object in data transform dictionary
563
- self.data_transform_dict = {**self.data_transform_dict,
564
- 'day_component_fit_object': {},
565
- 'month_component_fit_object': {},
566
- 'year_diff_component_fit_object': {}}
567
- # Grouping date components based on types i.e., day, month, and year_diff for performing binning
568
- if len(new_columns) != 0:
569
- self.day_columns = self._fetch_date_component("day", "_day_comp", new_columns, "D")
570
- self.month_columns = self._fetch_date_component("month", "_month_comp", new_columns, "M")
571
- self.year_diff_columns = self._fetch_date_component("year_diff", "_year_diff_comp", new_columns, "Y")
572
- self._display_msg(inline_msg="No useful date component found",
516
+ self._display_msg(msg='Updated list of newly generated features from existing date features :',
517
+ col_lst=new_columns,
573
518
  progress_bar=self.progress_bar)
574
519
 
575
520
  self._display_msg(msg='Updated dataset sample after handling date features:',
@@ -595,7 +540,7 @@ class _FeatureEngineering:
595
540
  if d_type in ["datetime.date","datetime.datetime"]]
596
541
 
597
542
  if len(self.date_column_list) == 0:
598
- self._display_msg(inline_msg="Dataset does not contain any feature related to dates.",
543
+ self._display_msg(inline_msg="Analysis Completed. Dataset does not contain any feature related to dates. No action needed.",
599
544
  progress_bar=self.progress_bar)
600
545
  else:
601
546
  # Storing date column list in data transform dictionary
@@ -622,8 +567,9 @@ class _FeatureEngineering:
622
567
  self.data = self.data.dropna(subset=[self.target_column])
623
568
 
624
569
  obj = ColumnSummary(data=self.data,
625
- target_columns=self.data.columns,
626
- volatile=True)
570
+ target_columns=self.data.columns,
571
+ volatile=self.volatile,
572
+ persist=self.persist)
627
573
 
628
574
  cols_miss_val={}
629
575
  # Iterating over each row in the column summary result
@@ -705,11 +651,15 @@ class _FeatureEngineering:
705
651
  self.data_transform_dict['imputation_columns'] = self.imputation_cols
706
652
 
707
653
  if len(delete_rows) != 0:
654
+ rows = self.data.shape[0]
708
655
  self.data = self.data.dropna(subset=delete_rows)
709
656
  msg_val_found=1
710
657
  self._display_msg(msg='Deleting rows of these columns for handling missing values:',
711
658
  col_lst=delete_rows,
712
659
  progress_bar=self.progress_bar)
660
+ self._display_msg(msg=f'Sample of dataset after removing {rows-self.data.shape[0]} rows:',
661
+ data=self.data,
662
+ progress_bar=self.progress_bar)
713
663
 
714
664
  if len(drop_cols) != 0:
715
665
  self.data = self.data.drop(drop_cols, axis=1)
@@ -719,9 +669,12 @@ class _FeatureEngineering:
719
669
  self._display_msg(msg='Dropping these columns for handling missing values:',
720
670
  col_lst=drop_cols,
721
671
  progress_bar=self.progress_bar)
672
+ self._display_msg(msg=f'Sample of dataset after removing {len(drop_cols)} columns:',
673
+ data=self.data,
674
+ progress_bar=self.progress_bar)
722
675
 
723
676
  if len(self.imputation_cols) == 0 and msg_val_found ==0:
724
- self._display_msg(inline_msg="No Missing Values Detected.",
677
+ self._display_msg(inline_msg="Analysis Completed. No Missing Values Detected.",
725
678
  progress_bar=self.progress_bar)
726
679
 
727
680
  end_time = time.time()
@@ -787,21 +740,23 @@ class _FeatureEngineering:
787
740
 
788
741
  fit_obj = SimpleImputeFit(data=self.data,
789
742
  stats_columns=col_stat,
790
- stats=stat,
791
- volatile=True)
743
+ stats=stat,
744
+ volatile=self.volatile,
745
+ persist=self.persist)
792
746
 
793
747
  # Storing fit object for imputation in data transform dictionary
794
748
  self.data_transform_dict['imputation_fit_object'] = fit_obj.output
795
749
  sm = SimpleImputeTransform(data=self.data,
796
- object=fit_obj,
797
- volatile=True)
750
+ object=fit_obj,
751
+ volatile=self.volatile,
752
+ persist=self.persist)
798
753
 
799
754
  self.data = sm.result
800
- self._display_msg(msg="Sample of Data after Imputation:",
755
+ self._display_msg(msg="Sample of dataset after Imputation:",
801
756
  data=self.data,
802
757
  progress_bar=self.progress_bar)
803
758
  else:
804
- self._display_msg(inline_msg="No imputation is Required.",
759
+ self._display_msg(inline_msg="Analysis completed. No imputation required.",
805
760
  progress_bar=self.progress_bar)
806
761
 
807
762
  end_time = time.time()
@@ -827,6 +782,8 @@ class _FeatureEngineering:
827
782
  drop_col_ind = missing_handling_param.get("DroppingColumnIndicator", False)
828
783
  drop_row_ind = missing_handling_param.get("DroppingRowIndicator", False)
829
784
  impute_ind = missing_handling_param.get("ImputeMissingIndicator", False)
785
+ volatile = missing_handling_param.pop("volatile", False)
786
+ persist = missing_handling_param.pop("persist", False)
830
787
  # Checking for user input if all methods indicator are false or not
831
788
  if not any([drop_col_ind, drop_row_ind, impute_ind]):
832
789
  self._display_msg(inline_msg="No method information provided for performing customized missing value handling. \
@@ -883,7 +840,9 @@ class _FeatureEngineering:
883
840
  "stats_columns" : stat_list,
884
841
  "stats" : stat_method,
885
842
  "literals_columns" : literal_list,
886
- "literals" : literal_value
843
+ "literals" : literal_value,
844
+ "volatile" : volatile,
845
+ "persist" : persist
887
846
  }
888
847
  # Fitting on dataset
889
848
  fit_obj = SimpleImputeFit(**fit_param)
@@ -896,8 +855,18 @@ class _FeatureEngineering:
896
855
  "object" : fit_obj.output,
897
856
  "persist" : True
898
857
  }
858
+ # Disabling display table name if persist is True by default
859
+ if not volatile and not persist:
860
+ transform_param["display_table_name"] = False
861
+
862
+ if volatile:
863
+ transform_param["volatile"] = True
864
+ transform_param["persist"] = False
899
865
  # Updating dataset with transform result
900
866
  self.data = SimpleImputeTransform(**transform_param).result
867
+ if not volatile and not persist:
868
+ # Adding transformed data containing table to garbage collector
869
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
901
870
  self._display_msg(msg="Updated dataset sample after performing customized missing value imputation:",
902
871
  data=self.data,
903
872
  progress_bar=self.progress_bar)
@@ -938,6 +907,8 @@ class _FeatureEngineering:
938
907
  equal_width_bin_columns = []
939
908
  var_width_bin_list = []
940
909
  var_width_bin_columns = []
910
+ volatile = extracted_col.pop("volatile", False)
911
+ persist = extracted_col.pop("persist", False)
941
912
 
942
913
  # Checking for column present in dataset or not
943
914
  _Validators._validate_dataframe_has_argument_columns(list(extracted_col.keys()), "BincodeParam", self.data, "df")
@@ -971,7 +942,9 @@ class _FeatureEngineering:
971
942
  "data" : self.data,
972
943
  "target_columns": equal_width_bin_columns,
973
944
  "method_type" : "Equal-Width",
974
- "nbins" : bins
945
+ "nbins" : bins,
946
+ "volatile" : volatile,
947
+ "persist" : persist
975
948
  }
976
949
  eql_bin_code_fit = BincodeFit(**fit_params)
977
950
  # Storing fit object and column list for Equal-Width binning in data transform dictionary
@@ -984,9 +957,19 @@ class _FeatureEngineering:
984
957
  "data" : self.data,
985
958
  "object" : eql_bin_code_fit.output,
986
959
  "accumulate" : accumulate_columns,
987
- "persist" : True,
960
+ "persist" : True
988
961
  }
962
+ # Disabling display table name if persist is True by default
963
+ if not volatile and not persist:
964
+ eql_transform_params["display_table_name"] = False
965
+
966
+ if volatile:
967
+ eql_transform_params["volatile"] = True
968
+ eql_transform_params["persist"] = False
989
969
  self.data = BincodeTransform(**eql_transform_params).result
970
+ if not volatile and not persist:
971
+ # Adding transformed data containing table to garbage collector
972
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
990
973
  self._display_msg(msg="\nUpdated dataset sample after performing Equal-Width binning :-",
991
974
  data=self.data,
992
975
  progress_bar=self.progress_bar)
@@ -1011,7 +994,9 @@ class _FeatureEngineering:
1011
994
  "maxvalue_column" : "MaxValue",
1012
995
  "label_column" : "Label",
1013
996
  "method_type" : "Variable-Width",
1014
- "label_prefix" : "label_prefix"
997
+ "label_prefix" : "label_prefix",
998
+ "volatile" : volatile,
999
+ "persist" : persist
1015
1000
  }
1016
1001
  var_bin_code_fit = BincodeFit(**fit_params)
1017
1002
  # Storing fit object and column list for Variable-Width binning in data transform dictionary
@@ -1023,9 +1008,19 @@ class _FeatureEngineering:
1023
1008
  "object" : var_bin_code_fit.output,
1024
1009
  "object_order_column" : "TD_MinValue_BINFIT",
1025
1010
  "accumulate" : accumulate_columns,
1026
- "persist" : True
1011
+ "persist" : True
1027
1012
  }
1013
+ # Disabling display table name if persist is True by default
1014
+ if not volatile and not persist:
1015
+ var_transform_params["display_table_name"] = False
1016
+
1017
+ if volatile:
1018
+ var_transform_params["volatile"] = True
1019
+ var_transform_params["persist"] = False
1028
1020
  self.data = BincodeTransform(**var_transform_params).result
1021
+ if not volatile and not persist:
1022
+ # Adding transformed data containing table to garbage collector
1023
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
1029
1024
  self._display_msg(msg="Updated dataset sample after performing Variable-Width binning:",
1030
1025
  data=self.data,
1031
1026
  progress_bar=self.progress_bar)
@@ -1049,11 +1044,13 @@ class _FeatureEngineering:
1049
1044
  # Storing custom string manipulation indicator in data transform dictionary
1050
1045
  self.data_transform_dict['custom_string_manipulation_ind'] = True
1051
1046
  # Fetching list required for performing operation.
1052
- extracted_col = self.custom_data.get("StringManipulationParam", None)
1047
+ extracted_col = self.custom_data.get("StringManipulationParam", None).copy()
1053
1048
  if not extracted_col:
1054
1049
  self._display_msg(inline_msg="No information provided for performing string manipulation.",
1055
1050
  progress_bar=self.progress_bar)
1056
1051
  else:
1052
+ volatile = extracted_col.pop("volatile", False)
1053
+ persist = extracted_col.pop("persist", False)
1057
1054
  # Checking for column present in dataset or not
1058
1055
  _Validators._validate_dataframe_has_argument_columns(list(extracted_col.keys()), "StringManipulationParam", self.data, "df")
1059
1056
 
@@ -1066,8 +1063,9 @@ class _FeatureEngineering:
1066
1063
  data=self.data,
1067
1064
  progress_bar=self.progress_bar)
1068
1065
  else:
1069
- self._display_msg(inline_msg="Skipping customized string manipulation.")
1070
-
1066
+ self._display_msg(inline_msg="Skipping customized string manipulation.",
1067
+ progress_bar=self.progress_bar)
1068
+
1071
1069
  def _str_method_mapping(self,
1072
1070
  target_col,
1073
1071
  transform_val):
@@ -1096,7 +1094,11 @@ class _FeatureEngineering:
1096
1094
 
1097
1095
  # Fetching required parameters from json object
1098
1096
  string_operation = transform_val["StringOperation"]
1099
-
1097
+
1098
+ # Setting volatile and persist parameters for performing string manipulation
1099
+ volatile, persist = self._set_generic_parameters(func_indicator="StringManipulationIndicator",
1100
+ param_name="StringManipulationParam")
1101
+
1100
1102
  # Storing general parameters for performing string transformation
1101
1103
  fit_params = {
1102
1104
  "data" : self.data,
@@ -1106,6 +1108,14 @@ class _FeatureEngineering:
1106
1108
  "inplace" : True,
1107
1109
  "persist" : True
1108
1110
  }
1111
+ # Disabling display table name if persist is True by default
1112
+ if not volatile and not persist:
1113
+ fit_params["display_table_name"] = False
1114
+
1115
+ if volatile:
1116
+ fit_params["volatile"] = True
1117
+ fit_params["persist"] = False
1118
+
1109
1119
  # Adding additional parameters based on string operation type
1110
1120
  if string_operation in ["StringCon", "StringTrim"]:
1111
1121
  string_argument = transform_val["String"]
@@ -1125,11 +1135,15 @@ class _FeatureEngineering:
1125
1135
  "string_length" : string_length}
1126
1136
 
1127
1137
  # returning dataset after performing string manipulation
1128
- return StrApply(**fit_params).result
1138
+ transform_output = StrApply(**fit_params).result
1139
+ if not volatile and not persist:
1140
+ # Adding transformed data containing table to garbage collector
1141
+ GarbageCollector._add_to_garbagecollector(transform_output._table_name)
1142
+ return transform_output
1129
1143
 
1130
1144
  def _one_hot_encoding(self,
1131
- one_hot_columns,
1132
- unique_counts):
1145
+ one_hot_columns,
1146
+ unique_counts):
1133
1147
  """
1134
1148
  DESCRIPTION:
1135
1149
  Function performs the one hot encoding to categorcial columns/features in the dataset.
@@ -1143,12 +1157,16 @@ class _FeatureEngineering:
1143
1157
  unique_counts:
1144
1158
  Required Argument.
1145
1159
  Specifies the unique counts in the categorical columns.
1146
- Types: int or list of integer (int)
1147
-
1160
+ Types: int or list of integer (int)
1148
1161
  """
1149
1162
  # TD function will add extra column_other in onehotEncoding, so
1150
1163
  # initailizing this list to remove those extra columns
1151
1164
  drop_lst = [ele + "_other" for ele in one_hot_columns]
1165
+
1166
+ # Setting volatile and persist parameters for performing encoding
1167
+ volatile, persist = self._set_generic_parameters(func_indicator="CategoricalEncodingIndicator",
1168
+ param_name="CategoricalEncodingParam")
1169
+
1152
1170
  # Adding fit parameters for performing encoding
1153
1171
  fit_params = {
1154
1172
  "data" : self.data,
@@ -1156,7 +1174,9 @@ class _FeatureEngineering:
1156
1174
  "is_input_dense" : True,
1157
1175
  "target_column" : one_hot_columns,
1158
1176
  "category_counts" : unique_counts,
1159
- "other_column" : "other"
1177
+ "other_column" : "other",
1178
+ "volatile" : volatile,
1179
+ "persist" : persist
1160
1180
  }
1161
1181
  # Performing one hot encoding fit on target columns
1162
1182
  fit_obj = OneHotEncodingFit(**fit_params)
@@ -1172,9 +1192,22 @@ class _FeatureEngineering:
1172
1192
  "is_input_dense" : True,
1173
1193
  "persist" : True
1174
1194
  }
1195
+ # Disabling display table name if persist is True by default
1196
+ if not volatile and not persist:
1197
+ transform_params["display_table_name"] = False
1198
+
1199
+ # Setting persist to False if volatile is True
1200
+ if volatile:
1201
+ transform_params["volatile"] = True
1202
+ transform_params["persist"] = False
1203
+
1175
1204
  # Performing one hot encoding transformation
1176
- transform_obj = OneHotEncodingTransform(**transform_params)
1177
- self.data = transform_obj.result.drop(drop_lst, axis=1)
1205
+ transform_output = OneHotEncodingTransform(**transform_params).result
1206
+
1207
+ if not volatile and not persist:
1208
+ # Adding transformed data containing table to garbage collector
1209
+ GarbageCollector._add_to_garbagecollector(transform_output._table_name)
1210
+ self.data = transform_output.drop(drop_lst, axis=1)
1178
1211
 
1179
1212
  def _ordinal_encoding(self,
1180
1213
  ordinal_columns):
@@ -1188,11 +1221,16 @@ class _FeatureEngineering:
1188
1221
  Specifies the categorical columns for which ordinal encoding will be performed.
1189
1222
  Types: str or list of strings (str)
1190
1223
  """
1224
+ # Setting volatile and persist parameters for performing encoding
1225
+ volatile, persist = self._set_generic_parameters(func_indicator="CategoricalEncodingIndicator",
1226
+ param_name="CategoricalEncodingParam")
1227
+
1191
1228
  # Adding fit parameters for performing encoding
1192
1229
  fit_params = {
1193
1230
  "data" : self.data,
1194
1231
  "target_column" : ordinal_columns,
1195
- "volatile" : True
1232
+ "volatile" : volatile,
1233
+ "persist" : persist
1196
1234
  }
1197
1235
  # Performing ordinal encoding fit on target columns
1198
1236
  ord_fit_obj = OrdinalEncodingFit(**fit_params)
@@ -1212,15 +1250,27 @@ class _FeatureEngineering:
1212
1250
  "accumulate" : accumulate_columns,
1213
1251
  "persist" : True
1214
1252
  }
1253
+ # Disabling display table name if persist is True by default
1254
+ if not volatile and not persist:
1255
+ transform_params["display_table_name"] = False
1256
+
1257
+ # Setting persist to False if volatile is True
1258
+ if volatile:
1259
+ transform_params["volatile"] = True
1260
+ transform_params["persist"] = False
1215
1261
  # Performing ordinal encoding transformation
1216
1262
  self.data = OrdinalEncodingTransform(**transform_params).result
1263
+
1264
+ if not volatile and not persist:
1265
+ # Adding transformed data containing table to garbage collector
1266
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
1217
1267
 
1218
1268
  if len(ordinal_columns) == 1 and ordinal_columns[0] == self.target_column:
1219
1269
  self.target_label = ord_fit_obj
1220
1270
 
1221
1271
 
1222
1272
  def _target_encoding(self,
1223
- target_encoding_list):
1273
+ target_encoding_list):
1224
1274
  """
1225
1275
  DESCRIPTION:
1226
1276
  Function performs the target encoding to categorcial columns/features in the dataset.
@@ -1245,6 +1295,11 @@ class _FeatureEngineering:
1245
1295
  # Storing indicator and fit object for target encoding in data transform dictionary
1246
1296
  self.data_transform_dict["custom_target_encoding_ind"] = True
1247
1297
  self.data_transform_dict["custom_target_encoding_fit_obj"] = {}
1298
+
1299
+ # Setting volatile and persist parameters for performing encoding
1300
+ volatile, persist = self._set_generic_parameters(func_indicator="CategoricalEncodingIndicator",
1301
+ param_name="CategoricalEncodingParam")
1302
+
1248
1303
  # Fetching required argument for performing target encoding
1249
1304
  for col,transform_val in target_encoding_list.items():
1250
1305
  encoder_method = transform_val["encoder_method"]
@@ -1255,7 +1310,9 @@ class _FeatureEngineering:
1255
1310
  "category_data" : category_data,
1256
1311
  "encoder_method" : encoder_method,
1257
1312
  "target_columns" : col,
1258
- "response_column" : response_column
1313
+ "response_column" : response_column,
1314
+ "volatile" : volatile,
1315
+ "persist" : persist
1259
1316
  }
1260
1317
  if encoder_method == "CBM_DIRICHLET":
1261
1318
  num_distinct_responses=transform_val["num_distinct_responses"]
@@ -1264,7 +1321,7 @@ class _FeatureEngineering:
1264
1321
  # Performing target encoding fit on target columns
1265
1322
  tar_fit_obj = TargetEncodingFit(**fit_params)
1266
1323
  # Storing each column fit object for target encoding in data transform dictionary
1267
- self.data_transform_dict["custom_target_encoding_fit_obj"].update({col : tar_fit_obj})
1324
+ self.data_transform_dict["custom_target_encoding_fit_obj"].update({col : tar_fit_obj.result})
1268
1325
  # Extracting accumulate columns
1269
1326
  accumulate_columns = self._extract_list(self.data.columns, [col])
1270
1327
  # Adding transform parameters for performing encoding
@@ -1272,10 +1329,21 @@ class _FeatureEngineering:
1272
1329
  "data" : self.data,
1273
1330
  "object" : tar_fit_obj,
1274
1331
  "accumulate" : accumulate_columns,
1275
- "persist" : True
1332
+ "persist" : True
1276
1333
  }
1334
+
1335
+ # Disabling display table name if persist is True by default
1336
+ if not volatile and not persist:
1337
+ transform_params["display_table_name"] = False
1338
+
1339
+ if volatile:
1340
+ transform_params["volatile"] = True
1341
+ transform_params["persist"] = False
1277
1342
  # Performing ordinal encoding transformation
1278
1343
  self.data = TargetEncodingTransform(**transform_params).result
1344
+ if not volatile and not persist:
1345
+ # Adding transformed data containing table to garbage collector
1346
+ GarbageCollector._add_to_garbagecollector(self.data._table_name)
1279
1347
 
1280
1348
  def _encoding_categorical_columns(self):
1281
1349
  """
@@ -1308,8 +1376,11 @@ class _FeatureEngineering:
1308
1376
  self._display_msg(msg="ONE HOT Encoding these Columns:",
1309
1377
  col_lst=ohe_col,
1310
1378
  progress_bar=self.progress_bar)
1379
+ self._display_msg(msg="Sample of dataset after performing one hot encoding:",
1380
+ data=self.data,
1381
+ progress_bar=self.progress_bar)
1311
1382
  else:
1312
- self._display_msg(inline_msg="Encoding not required.",
1383
+ self._display_msg(inline_msg="Analysis completed. No categorical columns were found.",
1313
1384
  progress_bar=self.progress_bar)
1314
1385
 
1315
1386
  # List of columns after one hot
@@ -1337,8 +1408,10 @@ class _FeatureEngineering:
1337
1408
  # Storing custom categorical encoding indicator in data transform dictionary
1338
1409
  self.data_transform_dict["custom_categorical_encoding_ind"] = True
1339
1410
  # Fetching user input list for performing
1340
- encoding_list = self.custom_data.get("CategoricalEncodingParam", None)
1411
+ encoding_list = self.custom_data.get("CategoricalEncodingParam", None).copy()
1341
1412
  if encoding_list:
1413
+ volatile = encoding_list.pop("volatile", False)
1414
+ persist = encoding_list.pop("persist", False)
1342
1415
  onehot_encode_ind = encoding_list.get("OneHotEncodingIndicator", False)
1343
1416
  ordinal_encode_ind = encoding_list.get("OrdinalEncodingIndicator", False)
1344
1417
  target_encode_ind = encoding_list.get("TargetEncodingIndicator", False)
@@ -1415,11 +1488,25 @@ class _FeatureEngineering:
1415
1488
  """
1416
1489
  DESCRIPTION:
1417
1490
  Function to perform different numerical transformations using NumApply on numerical features based on user input.
1418
-
1491
+
1492
+ PARAMETERS:
1493
+ target_col:
1494
+ Required Argument.
1495
+ Specifies the numerical column for which transformation will be performed.
1496
+ Types: str
1497
+
1498
+ transform_val:
1499
+ Required Argument.
1500
+ Specifies different parameter require for applying numerical transformation.
1501
+ Types: dict
1419
1502
  """
1420
1503
  # Fetching columns for accumulation
1421
1504
  accumulate_columns = self._extract_list(self.data.columns, [target_col])
1422
1505
  apply_method = transform_val["apply_method"]
1506
+
1507
+ # Setting volatile and persist parameters for performing transformation
1508
+ volatile, persist = self._set_generic_parameters(func_indicator="MathameticalTransformationIndicator",
1509
+ param_name="MathameticalTransformationParam")
1423
1510
  # Adding fit parameters for performing transformation
1424
1511
  fit_params={
1425
1512
  "data": self.data,
@@ -1429,14 +1516,25 @@ class _FeatureEngineering:
             "persist" :True,
             "accumulate" : accumulate_columns
         }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            fit_params["display_table_name"] = False
+
+        if volatile:
+            fit_params["volatile"] = True
+            fit_params["persist"] = False
         # Adding additional details for fit parameters in case of SIGMOID transformation
         if apply_method == "sigmoid":
             sigmoid_style=transform_val["sigmoid_style"]
             fit_params = {**fit_params, "sigmoid_style" : sigmoid_style}
         # Performing transformation on target columns
-        return NumApply(**fit_params).result
+        transform_output = NumApply(**fit_params).result
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(transform_output._table_name)
+        return transform_output
 
-    def _numerical_transformation(self, target_columns, num_transform_data):
+    def _numerical_transformation(self, target_columns, num_transform_data, volatile, persist):
         """
         DESCRIPTION:
             Function to perform different numerical transformations using Fit and Transform on numerical features based on user input.
@@ -1446,7 +1544,9 @@ class _FeatureEngineering:
         fit_params={
             "data" : self.data,
             "object" : num_transform_data,
-            "object_order_column" : "TargetColumn"
+            "object_order_column" : "TargetColumn",
+            "volatile" : volatile,
+            "persist" : persist
         }
         # Performing fit with all arguments.
         num_fit_obj = Fit(**fit_params)
@@ -1464,8 +1564,18 @@ class _FeatureEngineering:
             "id_columns" : id_columns,
             "persist" :True
         }
+        # Disabling display table name if persist is True by default
+        if not volatile and not persist:
+            transform_params["display_table_name"] = False
+
+        if volatile:
+            transform_params["volatile"] = True
+            transform_params["persist"] = False
         # Performing transformation on target columns
-        self.data = Transform(**transform_params).result
+        self.data = Transform(**transform_params).result
+        if not volatile and not persist:
+            # Adding transformed data containing table to garbage collector
+            GarbageCollector._add_to_garbagecollector(self.data._table_name)
         self._display_msg(msg="Updated dataset sample after applying numerical transformation:",
                           data=self.data,
                           progress_bar=self.progress_bar)
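
The hunk above repeats a pattern used throughout these feature-engineering changes: when the user asks for neither a volatile nor a persisted output, the transform still runs with persist=True, but table-name display is suppressed and the backing table is registered with the garbage collector; when volatile is requested, it overrides persist. A condensed, stand-alone sketch of that decision, with an assumed helper name that is not part of teradataml:

    def resolve_output_params(params, volatile, persist):
        # 'params' is the fit/transform keyword dict built in the hunks above.
        if not volatile and not persist:
            # The table is created persisted but treated as an intermediate:
            # hide its name and let the garbage collector drop it later.
            params["display_table_name"] = False
        if volatile:
            # A volatile table cannot also be persisted.
            params["volatile"] = True
            params["persist"] = False
        return params

After the transform runs, the same "not volatile and not persist" check decides whether the result's _table_name is handed to GarbageCollector._add_to_garbagecollector.
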
@@ -1484,8 +1594,11 @@ class _FeatureEngineering:
         # Checking user input for mathematical transformations
         if mat_transform_input:
             # Extracting list required for mathematical transformations
-            mat_transform_list = self.custom_data.get("MathameticalTransformationParam", None)
+            mat_transform_list = self.custom_data.get("MathameticalTransformationParam", None).copy()
+
             if mat_transform_list:
+                volatile = mat_transform_list.pop("volatile", False)
+                persist = mat_transform_list.pop("persist", False)
                 # Checking for column present in dataset or not
                 _Validators._validate_dataframe_has_argument_columns(list(mat_transform_list.keys()),
                                                                      "MathameticalTransformationParam", self.data, "df")
@@ -1529,7 +1642,7 @@ class _FeatureEngineering:
                 copy_to_sql(df=transform_data, table_name="automl_num_transform_data", temporary=True)
                 num_transform_data = DataFrame.from_table("automl_num_transform_data")
                 # Applying transformation using Fit/Transform functions
-                self._numerical_transformation(target_columns, num_transform_data)
+                self._numerical_transformation(target_columns, num_transform_data, volatile, persist)
                 # Storing custom numerical transformation parameters and column list in data transform dictionary
                 self.data_transform_dict['custom_numerical_transformation_col'] = target_columns
                 self.data_transform_dict['custom_numerical_transformation_params'] = num_transform_data
@@ -1555,6 +1668,8 @@ class _FeatureEngineering:
             nl_transform_list = self.custom_data.get("NonLinearTransformationParam", None)
             # Extracting list required for non-linear transformation
             if nl_transform_list:
+                volatile = nl_transform_list.pop("volatile", False)
+                persist = nl_transform_list.pop("persist", False)
                 total_combination = len(nl_transform_list)
                 # Generating all possible combination names
                 possible_combination = ["Combination_"+str(counter) for counter in range(1,total_combination+1)]
@@ -1581,12 +1696,14 @@ class _FeatureEngineering:
                         "data" : self.data,
                         "target_columns" : target_columns,
                         "formula" : formula,
-                        "result_column" : result_column
+                        "result_column" : result_column,
+                        "volatile" : volatile,
+                        "persist" : persist
                     }
                     # Performing fit on dataset
                     fit_obj = NonLinearCombineFit(**fit_param)
                     # Updating it for each non-linear combination
-                    self.data_transform_dict['custom_non_linear_transformation_fit_object'].update({comb:fit_obj})
+                    self.data_transform_dict['custom_non_linear_transformation_fit_object'].update({comb:fit_obj.result})
                     # Adding transform params for transformation
                     transform_params = {
                         "data" : self.data,
@@ -1594,7 +1711,18 @@ class _FeatureEngineering:
                         "accumulate" : self.data.columns,
                         "persist" : True
                     }
+                    # Disabling display table name if persist is True by default
+                    if not volatile and not persist:
+                        transform_params["display_table_name"] = False
+
+                    if volatile:
+                        transform_params["volatile"] = True
+                        transform_params["persist"] = False
                     self.data = NonLinearCombineTransform(**transform_params).result
+
+                    if not volatile and not persist:
+                        # Adding transformed data containing table to garbage collector
+                        GarbageCollector._add_to_garbagecollector(self.data._table_name)
                 else:
                     self._display_msg(inline_msg="Combinations are not as per expectation.",
                                       progress_bar=self.progress_bar)
@@ -1620,29 +1748,64 @@ class _FeatureEngineering:
         anti_select_input = self.custom_data.get("AntiselectIndicator", False)
         # Checking user input for anti-select columns
         if anti_select_input:
-            # Extracting list required for anti-select columns
-            anti_select_list = self.custom_data.get("AntiselectParam", None)
-            if(anti_select_list):
-                if all(item in self.data.columns for item in anti_select_list):
-                    # Storing custom anti-select columns indicator and column list in data transform dictionary
-                    self.data_transform_dict['custom_anti_select_columns_ind'] = True
-                    self.data_transform_dict['custom_anti_select_columns'] = anti_select_list
-                    fit_params = {
-                        "data" : self.data,
-                        "exclude" : anti_select_list
-                    }
-                    # Performing transformation for given user input
-                    self.data = Antiselect(**fit_params).result
-                    self._display_msg(msg="Updated dataset sample after performing anti-select columns:",
-                                      data=self.data,
-                                      progress_bar=self.progress_bar)
-                else:
-                    self._display_msg(msg="Columns provided in list are not present in dataset:",
-                                      col_lst=anti_select_list,
-                                      progress_bar=self.progress_bar)
+            anti_select_params = self.custom_data.get("AntiselectParam", None)
+            if anti_select_params:
+                # Extracting list required for anti-select columns
+                anti_select_list = anti_select_params.get("excluded_columns", None)
+                volatile = anti_select_params.get("volatile", False)
+                persist = anti_select_params.get("persist", False)
+                if(anti_select_list):
+                    if all(item in self.data.columns for item in anti_select_list):
+                        # Storing custom anti-select columns indicator and column list in data transform dictionary
+                        self.data_transform_dict['custom_anti_select_columns_ind'] = True
+                        self.data_transform_dict['custom_anti_select_columns'] = anti_select_list
+                        fit_params = {
+                            "data" : self.data,
+                            "exclude" : anti_select_list,
+                            "volatile" : volatile,
+                            "persist" : persist
+                        }
+                        # Performing transformation for given user input
+                        self.data = Antiselect(**fit_params).result
+                        self._display_msg(msg="Updated dataset sample after performing anti-select columns:",
+                                          data=self.data,
+                                          progress_bar=self.progress_bar)
+                    else:
+                        self._display_msg(msg="Columns provided in list are not present in dataset:",
+                                          col_lst=anti_select_list,
+                                          progress_bar=self.progress_bar)
             else:
                 self._display_msg(inline_msg="No information provided for performing anti-select columns operation.",
                                   progress_bar=self.progress_bar)
         else:
             self._display_msg(inline_msg="Skipping customized anti-select columns.",
-                              progress_bar=self.progress_bar)
+                              progress_bar=self.progress_bar)
+
+    def _set_generic_parameters(self,
+                                func_indicator=None,
+                                param_name=None):
+        """
+        DESCRIPTION:
+            Function to set the generic volatile and persist parameters.
+
+        PARAMETERS:
+            func_indicator:
+                Optional Argument.
+                Specifies the name of the function indicator.
+                Types: str
+
+            param_name:
+                Optional Argument.
+                Specifies the name of the param which contains the generic parameters.
+                Types: str
+
+        RETURNS:
+            Tuple containing the volatile and persist parameters.
+        """
+        volatile = self.volatile
+        persist = self.persist
+        if self.custom_data is not None and self.custom_data.get(func_indicator, False):
+            volatile = self.custom_data[param_name].get("volatile", False)
+            persist = self.custom_data[param_name].get("persist", False)
+
+        return (volatile, persist)
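
Taken together, this final hunk changes AntiselectParam in the custom config from a bare column list to a dictionary so the output flags can ride along (read here with .get() rather than popped), and adds _set_generic_parameters so the other steps fall back to the AutoML-level self.volatile / self.persist when no step-specific flags are supplied. A before/after sketch of the config entry, with hypothetical column names:

    # Implied by the removed lines (20.0.0.0): a plain list of columns to exclude.
    custom_data_old = {
        "AntiselectIndicator": True,
        "AntiselectParam": ["id", "row_num"],
    }

    # Implied by the added lines (20.0.0.2): a dict carrying the list plus output flags.
    custom_data_new = {
        "AntiselectIndicator": True,
        "AntiselectParam": {
            "excluded_columns": ["id", "row_num"],
            "volatile": False,
            "persist": False,
        },
    }
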