teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of teradataml has been flagged as potentially problematic.

Files changed (126)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +315 -2
  3. teradataml/__init__.py +4 -0
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +95 -8
  6. teradataml/analytics/byom/__init__.py +1 -1
  7. teradataml/analytics/json_parser/metadata.py +12 -3
  8. teradataml/analytics/json_parser/utils.py +7 -2
  9. teradataml/analytics/sqle/__init__.py +5 -1
  10. teradataml/analytics/table_operator/__init__.py +1 -1
  11. teradataml/analytics/uaf/__init__.py +1 -1
  12. teradataml/analytics/utils.py +4 -0
  13. teradataml/analytics/valib.py +18 -4
  14. teradataml/automl/__init__.py +51 -6
  15. teradataml/automl/data_preparation.py +59 -35
  16. teradataml/automl/data_transformation.py +58 -33
  17. teradataml/automl/feature_engineering.py +27 -12
  18. teradataml/automl/model_training.py +73 -46
  19. teradataml/common/constants.py +88 -29
  20. teradataml/common/garbagecollector.py +2 -1
  21. teradataml/common/messagecodes.py +19 -3
  22. teradataml/common/messages.py +6 -1
  23. teradataml/common/sqlbundle.py +64 -12
  24. teradataml/common/utils.py +246 -47
  25. teradataml/common/warnings.py +11 -0
  26. teradataml/context/context.py +161 -27
  27. teradataml/data/amazon_reviews_25.csv +26 -0
  28. teradataml/data/byom_example.json +11 -0
  29. teradataml/data/dataframe_example.json +18 -2
  30. teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
  31. teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
  32. teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
  33. teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
  34. teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
  35. teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
  36. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
  37. teradataml/data/docs/sqle/docs_17_20/Shap.py +34 -6
  38. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
  39. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  40. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  41. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  42. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
  43. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  44. teradataml/data/hnsw_alter_data.csv +5 -0
  45. teradataml/data/hnsw_data.csv +10 -0
  46. teradataml/data/jsons/byom/h2opredict.json +1 -1
  47. teradataml/data/jsons/byom/onnxembeddings.json +266 -0
  48. teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
  49. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  50. teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
  51. teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
  52. teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
  53. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  54. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  55. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  56. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  57. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  58. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  59. teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
  60. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
  61. teradataml/data/medical_readings.csv +101 -0
  62. teradataml/data/patient_profile.csv +101 -0
  63. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  64. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  65. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  66. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  67. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
  68. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  69. teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
  70. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  71. teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
  72. teradataml/data/target_udt_data.csv +8 -0
  73. teradataml/data/templates/open_source_ml.json +3 -2
  74. teradataml/data/teradataml_example.json +8 -0
  75. teradataml/data/vectordistance_example.json +4 -0
  76. teradataml/dataframe/copy_to.py +8 -3
  77. teradataml/dataframe/data_transfer.py +11 -1
  78. teradataml/dataframe/dataframe.py +1049 -285
  79. teradataml/dataframe/dataframe_utils.py +152 -20
  80. teradataml/dataframe/functions.py +578 -35
  81. teradataml/dataframe/setop.py +11 -6
  82. teradataml/dataframe/sql.py +185 -16
  83. teradataml/dbutils/dbutils.py +1049 -115
  84. teradataml/dbutils/filemgr.py +48 -1
  85. teradataml/hyperparameter_tuner/optimizer.py +12 -1
  86. teradataml/lib/aed_0_1.dll +0 -0
  87. teradataml/opensource/__init__.py +1 -1
  88. teradataml/opensource/_base.py +1466 -0
  89. teradataml/opensource/_class.py +464 -0
  90. teradataml/opensource/{sklearn/constants.py → _constants.py} +21 -14
  91. teradataml/opensource/_lightgbm.py +949 -0
  92. teradataml/opensource/_sklearn.py +1008 -0
  93. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +5 -6
  94. teradataml/options/__init__.py +54 -38
  95. teradataml/options/configure.py +131 -27
  96. teradataml/options/display.py +13 -2
  97. teradataml/plot/axis.py +47 -8
  98. teradataml/plot/figure.py +33 -0
  99. teradataml/plot/plot.py +63 -13
  100. teradataml/scriptmgmt/UserEnv.py +5 -5
  101. teradataml/scriptmgmt/lls_utils.py +130 -40
  102. teradataml/store/__init__.py +12 -0
  103. teradataml/store/feature_store/__init__.py +0 -0
  104. teradataml/store/feature_store/constants.py +291 -0
  105. teradataml/store/feature_store/feature_store.py +2318 -0
  106. teradataml/store/feature_store/models.py +1505 -0
  107. teradataml/table_operators/Apply.py +32 -18
  108. teradataml/table_operators/Script.py +3 -1
  109. teradataml/table_operators/TableOperator.py +3 -1
  110. teradataml/table_operators/query_generator.py +3 -0
  111. teradataml/table_operators/table_operator_query_generator.py +3 -1
  112. teradataml/table_operators/table_operator_util.py +37 -38
  113. teradataml/table_operators/templates/dataframe_register.template +69 -0
  114. teradataml/utils/dtypes.py +51 -2
  115. teradataml/utils/internal_buffer.py +18 -0
  116. teradataml/utils/validators.py +99 -8
  117. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +321 -5
  118. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +121 -94
  119. teradataml/libaed_0_1.dylib +0 -0
  120. teradataml/libaed_0_1.so +0 -0
  121. teradataml/opensource/sklearn/__init__.py +0 -1
  122. teradataml/opensource/sklearn/_class.py +0 -255
  123. teradataml/opensource/sklearn/_sklearn_wrapper.py +0 -1800
  124. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
  125. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
  126. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
@@ -73,7 +73,7 @@ for func in _uaf_functions:
                            "__doc__": _AnalyticFunction.__doc__,
                            "__dir__": _common_dir})
 
-_stored_procedure = ['FilterFactory1d']
+_stored_procedure = ['CopyArt', 'FilterFactory1d']
 
 for func in _stored_procedure:
     globals()[func] = type("{}".format(func), (_AnalyticFunction,),
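The hunk above registers CopyArt next to FilterFactory1d as a stored-procedure-backed analytic function, and the next hunk teaches the special-case handler to single-quote its string arguments. A hedged usage sketch, assuming CopyArt is generated like the other dynamically created functions and that its keyword names match the handler entry; connection details and object names are placeholders:

    # Hypothetical call; argument names mirror the special-case handler entry below.
    from teradataml import create_context, CopyArt

    create_context(host="<host>", username="<user>", password="<password>")

    # Copy an analytic result table (ART) into a permanent table.
    CopyArt(database_name="target_db",       # placeholder database
            table_name="sales_art_copy",     # placeholder table
            map_name="TD_MAP1",
            permanent_table=True)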
@@ -441,6 +441,10 @@ class FuncSpecialCaseHandler():
                     "filter_type": self._single_quote_arg,
                     "window_type": self._single_quote_arg,
                     "filter_description": self._single_quote_arg},
+        "CopyArt": {"database_name": self._single_quote_arg,
+                    "table_name": self._single_quote_arg,
+                    "map_name": self._single_quote_arg,
+                    "permanent_table": self._single_quote_arg},
         "DWT": {"wavelet": self._single_quote_arg},
         "IDWT": {"part": self._single_quote_arg,
                  "wavelet": self._single_quote_arg,
@@ -26,6 +26,8 @@ from teradataml.dataframe.dataframe import DataFrame, in_schema
 from teradataml.utils.validators import _Validators
 from teradataml.analytics.Transformations import Binning, Derive, OneHotEncoder, FillNa, \
     LabelEncoder, MinMaxScalar, Retain, Sigmoid, ZScore
+from teradataml.common.constants import TeradataReservedKeywords, TeradataConstants
+
 
 class _VALIB():
     """ An internal class for executing VALIB analytic functions. """
@@ -370,9 +372,16 @@ class _VALIB():
            self.__get_temp_table_name()
        """
        prefix = "valib_{}".format(self.__tdml_valib_name.lower())
-       return UtilFuncs._generate_temp_table_name(prefix=prefix, use_default_database=True,
-                                                  gc_on_quit=True, quote=False,
-                                                  table_type=TeradataConstants.TERADATA_TABLE)
+       tbl_name = UtilFuncs._generate_temp_table_name(prefix=prefix, use_default_database=True,
+                                                      gc_on_quit=True, quote=False,
+                                                      table_type=TeradataConstants.TERADATA_TABLE)
+       # With VT option, table name is getting generated with 'vt_'.
+       # But its not getting created as Volatile table. Hence
+       # explicitly garbage collecting.
+       if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+           GarbageCollector._add_to_garbagecollector(tbl_name,
+                                                     TeradataConstants.TERADATA_TABLE)
+       return tbl_name
 
    def __process_dyn_cls_output_member(self, arg_name, out_tablename, out_var=None):
        """
@@ -447,6 +456,7 @@ class _VALIB():
            # Add extension to the table name.
            generated_table_name = "{}{}".format(table_name, extension)
 
+
            # Register new output table to the GC.
            gc_tabname = "\"{}\".\"{}\"".format(self.__db_name, generated_table_name)
            GarbageCollector._add_to_garbagecollector(gc_tabname, TeradataConstants.TERADATA_TABLE)
@@ -1463,7 +1473,7 @@ class _VALIB():
        if gen_sql_only:
            valib_inst.__generate_valib_sql_argument_syntax(arg=str(gen_sql_only),
                                                            arg_name="gensqlonly")
-
+       charset = kwargs.pop("charset", None)
        # Raise error if there are additional arguments.
        if len(kwargs) != 0:
            err_ = "The keyword arguments for Overlap() should have data1, data2, ..., dataN " \
@@ -1478,6 +1488,10 @@ class _VALIB():
                                                            arg_name="tablename")
        valib_inst.__generate_valib_sql_argument_syntax(arg=",".join(column_names_df),
                                                        arg_name="columns")
+       # Generate clause of charset.
+       if charset:
+           valib_inst.__generate_valib_sql_argument_syntax(arg=charset,
+                                                           arg_name="charset")
 
        return valib_inst._execute_valib_function(skip_data_arg_processing=True,
                                                  skip_other_arg_processing=True)
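The two Overlap hunks above pop an optional charset keyword and, when present, emit a charset clause into the generated VALIB SQL. A hedged sketch of passing it through; the table and column names are placeholders, and the set of accepted charset values is not shown in this diff:

    # Illustrative only; df1/df2 are teradataml DataFrames over existing tables.
    from teradataml import DataFrame, valib

    df1 = DataFrame("customers_2023")    # placeholder table
    df2 = DataFrame("customers_2024")    # placeholder table

    overlap = valib.Overlap(data1=df1, data2=df2,
                            columns="customer_id",
                            charset="UNICODE")   # new optional keyword in this release
    print(overlap.result)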
@@ -30,7 +30,7 @@ from teradataml import ColumnExpression
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml.utils.utils import execute_sql
 from teradataml.utils.validators import _Validators
-from teradataml import ROC, BLOB
+from teradataml import ROC, BLOB, VARCHAR
 from teradataml.utils.dtypes import _Dtypes
 from teradataml.common.utils import UtilFuncs
 from teradataml import TeradataMlException
@@ -94,6 +94,9 @@ class AutoML:
        the processes by passing the JSON file path in case of custom run. It also
        supports early stopping of model training based on stopping metrics,
        maximum running time and maximum models to be trained.
+       Note:
+           * configure.temp_object_type="VT" follows sequential execution.
+
 
    PARAMETERS:
        task_type:
@@ -187,6 +190,12 @@ class AutoML:
                session.
            Default Value: False
            Types: bool
+
+       seed:
+           Optional Argument.
+           Specifies the random seed for reproducibility.
+           Default Value: 42
+           Types: int
 
    RETURNS:
        Instance of AutoML.
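The AutoML docstring gains a seed argument (default 42) alongside volatile and persist, plus a note that configure.temp_object_type="VT" forces sequential execution. A short illustrative sketch of the new keyword in use; the table name and target column below are placeholders:

    # Illustrative only; assumes an active teradataml connection.
    from teradataml import AutoML, DataFrame

    train = DataFrame("titanic")                        # placeholder table
    aml = AutoML(task_type="Classification", seed=42)   # seed is new in 20.0.0.4
    aml.fit(train, "survived")                          # placeholder target column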
@@ -417,9 +426,11 @@ class AutoML:
 
        volatile = kwargs.get('volatile', False)
        persist = kwargs.get('persist', False)
+       seed = kwargs.get('seed', 42)
 
        arg_info_matrix.append(["volatile", volatile, True, (bool)])
        arg_info_matrix.append(["persist", persist, True, (bool)])
+       arg_info_matrix.append(["seed", seed, True, (int)])
 
        # Validate argument types
        _Validators._validate_function_arguments(arg_info_matrix)
@@ -517,7 +528,7 @@ class AutoML:
 
        # Validate argument types
        _Validators._validate_function_arguments(arg_info_fit_matrix)
-
+
        # Initializing class variables
        self.data = data
        self.target_column = target_column
@@ -758,11 +769,12 @@ class AutoML:
        if self.target_column_ind:
            prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
            probability_column = 'prob_1'
+           pred_target_count = pred.result.drop_duplicate(self.target_column).size
            # Displaying confusion matrix and ROC-AUC for classification problem
            if self.is_classification_type():
                print_data = lambda data: print(data) if _is_terminal() else display(data)
                # Displaying ROC-AUC for binary classification
-               if self.target_count == 2:
+               if self.target_count == 2 and pred_target_count == 2:
                    fit_params = {
                        "probability_column" : probability_column,
                        "observation_column" : self.target_column,
@@ -886,8 +898,8 @@ class AutoML:
        # as it is required for evaluation.
        if self.target_column not in data.columns:
            raise TeradataMlException(
-               Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
-               MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
+               Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
+               MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
 
        # Checking if data is already transformed before or not
        data_node_id = data._nodeid
@@ -1234,6 +1246,8 @@ class AutoML:
            pca.n_components_ = load_pca_info['n_components']
            pca.noise_variance_ = load_pca_info['noise_variance']
            pca.singular_values_ = np.array(load_pca_info['singular_values'])
+           pca.feature_names_in_ = data_params['pca_fit_columns']
+           pca.n_features_in_ = len(data_params['pca_fit_columns'])
 
            data_params['pca_fit_instance'] = pca
@@ -886,8 +898,8 @@ class AutoML:
886
898
  # as it is required for evaluation.
887
899
  if self.target_column not in data.columns:
888
900
  raise TeradataMlException(
889
- Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
890
- MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
901
+ Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
902
+ MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
891
903
 
892
904
  # Checking if data is already transformed before or not
893
905
  data_node_id = data._nodeid
@@ -1234,6 +1246,8 @@ class AutoML:
1234
1246
  pca.n_components_ = load_pca_info['n_components']
1235
1247
  pca.noise_variance_ = load_pca_info['noise_variance']
1236
1248
  pca.singular_values_ = np.array(load_pca_info['singular_values'])
1249
+ pca.feature_names_in_ = data_params['pca_fit_columns']
1250
+ pca.n_features_in_ = len(data_params['pca_fit_columns'])
1237
1251
 
1238
1252
  data_params['pca_fit_instance'] = pca
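When a deployed pipeline is loaded back, a fitted scikit-learn PCA is rebuilt attribute by attribute; this release additionally restores feature_names_in_ and n_features_in_ so the rehydrated object keeps its column metadata. A self-contained sketch of the same pattern, with an invented payload standing in for what AutoML actually persists:

    import numpy as np
    from sklearn.decomposition import PCA

    # Invented stand-in for the persisted PCA attributes.
    load_pca_info = {"n_components": 1,
                     "components": [[0.8, 0.6]],
                     "mean": [1.0, 2.0],
                     "singular_values": [5.0]}
    pca_fit_columns = ["height", "weight"]

    pca = PCA(n_components=load_pca_info["n_components"])
    pca.components_ = np.array(load_pca_info["components"])
    pca.mean_ = np.array(load_pca_info["mean"])
    pca.n_components_ = load_pca_info["n_components"]
    pca.singular_values_ = np.array(load_pca_info["singular_values"])
    # Restore column metadata on the rehydrated object, as the hunk above does.
    pca.feature_names_in_ = np.array(pca_fit_columns, dtype=object)
    pca.n_features_in_ = len(pca_fit_columns)

    print(pca.transform(np.array([[1.2, 2.3]])))   # usable without re-fitting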
 
@@ -1442,7 +1456,8 @@ class AutoML:
        # Saving data transformation parameters to the specified table
        sv_models = pd.concat([sv_models, df], ignore_index=True, sort=False)
 
-       copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB})
+       copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB,
+                                                                                      'PARAMETERS':VARCHAR(length=32000, charset='UNICODE')})
 
        print('Model Deployment Completed Successfully.')
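Model deployment now pins the PARAMETERS column to a 32000-character UNICODE VARCHAR instead of letting copy_to_sql infer a type, alongside the existing BLOB mapping for DATA_PARAMS. The same pattern in isolation; the frame contents and table name are placeholders:

    # Illustrative only; assumes an active teradataml connection.
    import pandas as pd
    from teradataml import copy_to_sql, BLOB, VARCHAR

    sv_models = pd.DataFrame({"MODEL_ID": [1],
                              "PARAMETERS": ['{"max_depth": 6}'],
                              "DATA_PARAMS": [b"\x01\x02"]})

    copy_to_sql(df=sv_models, table_name="automl_model_params", if_exists="replace",
                types={"DATA_PARAMS": BLOB,
                       "PARAMETERS": VARCHAR(length=32000, charset="UNICODE")})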
 
@@ -1945,6 +1960,12 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
                Default Value: False
                Types: bool
 
+           seed:
+               Optional Argument.
+               Specifies the random seed for reproducibility.
+               Default Value: 42
+               Types: int
+
    RETURNS:
        a tuple containing, model information and leaderboard.
    """
@@ -2103,6 +2124,12 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
                    session.
                Default Value: False
                Types: bool
+
+           seed:
+               Optional Argument.
+               Specifies the random seed for reproducibility.
+               Default Value: 42
+               Types: int
 
    RETURNS:
        a tuple containing, model information and leaderboard.
@@ -2324,6 +2351,9 @@ class AutoRegressor(AutoML):
    """
    DESCRIPTION:
        AutoRegressor is a special purpose AutoML feature to run regression specific tasks.
+       Note:
+           * configure.temp_object_type="VT" follows sequential execution.
+
 
    PARAMETERS:
        include:
@@ -2407,6 +2437,12 @@ class AutoRegressor(AutoML):
                session.
            Default Value: False
            Types: bool
+
+       seed:
+           Optional Argument.
+           Specifies the random seed for reproducibility.
+           Default Value: 42
+           Types: int
 
    RETURNS:
        Instance of AutoRegressor.
@@ -2555,6 +2591,9 @@ class AutoClassifier(AutoML):
    """
    DESCRIPTION:
        AutoClassifier is a special purpose AutoML feature to run classification specific tasks.
+       Note:
+           * configure.temp_object_type="VT" follows sequential execution.
+
 
    PARAMETERS:
        include:
@@ -2638,6 +2677,12 @@ class AutoClassifier(AutoML):
                session.
            Default Value: False
            Types: bool
+
+       seed:
+           Optional Argument.
+           Specifies the random seed for reproducibility.
+           Default Value: 42
+           Types: int
 
    RETURNS:
        Instance of AutoClassifier.
@@ -16,7 +16,6 @@
 # Python libraries
 import numpy as np
 import pandas as pd
-import random
 import time
 import warnings
 
@@ -30,11 +29,9 @@ from teradataml import UtilFuncs, TeradataConstants
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.messages import Messages, MessageCodes
 from teradataml.utils.validators import _Validators
-from teradataml import INTEGER
+from teradataml import configure, INTEGER
+from teradataml.common.constants import TeradataConstants
 
-# Control Randomnes
-random.seed(42)
-np.random.seed(42)
 
 class _DataPreparation:
 
@@ -117,6 +114,12 @@ class _DataPreparation:
                    session.
                Default Value: False
                Types: bool
+
+           seed:
+               Optional Argument.
+               Specifies the random seed for reproducibility.
+               Default Value: 42
+               Types: int
        """
        self.data = data
        self.target_column = target_column
@@ -135,7 +138,13 @@ class _DataPreparation:
        self.table_name_mapping = {}
 
        self.data_types = {key: value for key, value in self.data._column_names_and_types}
-
+       self.seed = kwargs.get("seed", 42)
+       # np.random.seed() affects the random number generation in numpy and sklearn
+       # setting this changes the global state of the random number generator
+       # hence, setting the seed only if it is not None
+       if kwargs.get("seed") is not None:
+           np.random.seed(self.seed)
+
 
    def data_preparation(self,
                         auto = True):
@@ -262,25 +271,24 @@ class _DataPreparation:
        outlier_method = "Tukey"
 
        # List of columns for outlier processing.
-       outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns]
+       # Excluding target column and excluded columns from outlier processing
+       outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns + ['id', self.target_column]]
 
-       # Detecting outlier percentage in each columns
-       outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
-
-       # Outlier Handling techniques
-       for i in outlier_percentage_df.itertuples():
-           # Column Name
-           col = i[0]
-           # Outlier value
-           value = i[1]
-
-           if col == self.target_column:
-               if value < 5.0 and value > 0.0:
+       if len(outlier_columns) != 0:
+           # Detecting outlier percentage in each columns
+           outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
+
+           # Outlier Handling techniques
+           for i in outlier_percentage_df.itertuples():
+               # Column Name
+               col = i[0]
+               # Outlier value
+               value = i[1]
+               # Dropping rows
+               if value > 0.0 and value <= 8.0 :
                    columns_to_drop_rows.append(col)
-           elif value > 0.0 and value <= 8.0 :
-               columns_to_drop_rows.append(col)
-           elif value> 8.0 and value <= 25.0:
-               columns_to_impute.append(col)
+               elif value> 8.0 and value <= 25.0:
+                   columns_to_impute.append(col)
 
        return columns_to_drop_rows, columns_to_impute
 
@@ -465,7 +473,7 @@
        RETURNS:
            int, number of folds to be used for cross-validation.
        """
-       num_of_folds = lambda rows: 1 if rows > 20000 else (3 if 1000 < rows <= 20000 else 10)
+       num_of_folds = lambda rows: 2 if rows > 20000 else (4 if 1000 < rows <= 20000 else 10)
        return num_of_folds(rows)
 
    def _feature_selection_PCA(self):
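The fold-count heuristic above changes so that very large datasets get 2 cross-validation folds instead of 1 (a single fold is not a usable split) and mid-sized datasets get 4 instead of 3. A quick standalone check of the new lambda:

    # Same heuristic as in the hunk above, lifted out for a quick check.
    num_of_folds = lambda rows: 2 if rows > 20000 else (4 if 1000 < rows <= 20000 else 10)

    for rows in (500, 5000, 50000):
        print(rows, "rows ->", num_of_folds(rows), "folds")
    # 500 rows -> 10 folds, 5000 rows -> 4 folds, 50000 rows -> 2 folds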
@@ -489,7 +497,7 @@
        train_data = pca_train.drop(columns=['id', self.target_column], axis=1)
 
        # Initialize and fit PCA
-       pca = PCA()
+       pca = PCA(random_state=self.seed)
        pca.fit(train_data)
 
        # Find the number of components for PCA
@@ -497,7 +505,7 @@
        n = np.argmax(np.cumsum(variance) >= 0.95) + 1
 
        # Create a new instance of PCA with the optimal number of components
-       pca = PCA(n_components=n, random_state=42)
+       pca = PCA(n_components=n, random_state=self.seed)
 
        # Apply PCA on dataset
        X_train_pca = pca.fit_transform(train_data)
@@ -571,7 +579,7 @@
 
        # Random forest for RFE model
        RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
-       rf = RFModel(n_estimators=100, random_state=42)
+       rf = RFModel(n_estimators=100, random_state=self.seed)
 
        # Determine the scoring metric based on the number of unique classes
        score = 'r2' if not self.is_classification_type() \
@@ -665,10 +673,10 @@
                scoring_metric = 'roc_auc'
            else:
                scoring_metric = 'f1_macro'
-           estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=42)
+           estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=self.seed)
            parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
        else:
-           estimator = Lasso(random_state=42)
+           estimator = Lasso(random_state=self.seed)
            parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
            scoring_metric = "r2"
 
@@ -679,7 +687,7 @@
 
        # Applying hyperparameter tuning and optimizing score
        hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
-                                            scoring=scoring_metric, verbose=0)
+                                           scoring=scoring_metric, verbose=0)
 
        # Fitting the best result from hyperparameter
        hyperparameter_search.fit(train_features, train_target)
@@ -746,14 +754,20 @@
        train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
                                                               table_type = TeradataConstants.TERADATA_TABLE,
                                                               gc_on_quit=not persist)
+       # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+       # table name in fully qualified format.
+       train_table_name = UtilFuncs._extract_table_name(train_table_name)
+
        # Storing the table names in the table name mapping dictionary
        self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
 
+       # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+       is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
        # Pushing data into database
        if self.is_classification_type():
-           copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+           copy_to_sql(df=data, table_name=train_table_name, temporary=is_temporary, if_exists="replace", types={f'{self.target_column}': INTEGER})
        else:
-           copy_to_sql(df=data, table_name=train_table_name, if_exists="replace")
+           copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", temporary=is_temporary)
 
    def _scaling_features_helper(self,
                                 data=None,
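The staging hunks above all follow the same recipe for the new configure.temp_object_type="VT" mode: strip the database qualifier from the name that _generate_temp_table_name() returns, then pass temporary=is_temporary to copy_to_sql so the staging table really is created as a volatile table. A hedged sketch of the user-facing pieces of that mode; the table name and data are placeholders:

    # Illustrative only; assumes an active teradataml connection.
    import pandas as pd
    from teradataml import DataFrame, configure, copy_to_sql

    # Ask teradataml to create its intermediate objects as volatile tables.
    configure.temp_object_type = "VT"

    staging = pd.DataFrame({"id": [1, 2, 3], "amount": [10.5, 20.0, 7.25]})

    # temporary=True creates a volatile table that disappears when the session ends.
    copy_to_sql(df=staging, table_name="stg_amounts", if_exists="replace", temporary=True)
    print(DataFrame("stg_amounts"))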
@@ -783,7 +797,8 @@
        for col in data.columns:
            # Selecting columns that will be scaled
            # Exculding target_col and columns with single value
-           if col not in ['id', self.target_column] and data.drop_duplicate(col).size > 1:
+           if col not in ['id', self.target_column] and \
+               data.drop_duplicate(col).size > 1:
                columns_to_scale.append(col)
 
        if feature_selection_mtd == "lasso":
@@ -855,6 +870,7 @@
 
        # List of columns to copy to the output generated by scale transform
        accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
+
 
        # Scaling dataset
        transform_obj = ScaleTransform(data=data_to_scale,
@@ -866,6 +882,8 @@
                                      data=scaled_df,
                                      progress_bar=self.progress_bar)
        else:
+           # No columns to scale, Original data will be used
+           scaled_df = data_to_scale
            self._display_msg(msg="No columns to scale.",
                              progress_bar=self.progress_bar)
 
@@ -914,10 +932,16 @@
        # Assigning data to target dataframe
        target_df = self.data
        # Detecting list of float columns on target dataset
-       float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]
+       float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float", "decimal.Decimal"]]
 
        if len(float_columns) == 0:
-           return target_df.to_pandas()
+           cols = target_df.columns
+           # Doing reset index to get index column
+           df = target_df.to_pandas().reset_index()
+
+           # Returning the dataframe with cols
+           # to avoid extra columns generated by reset_index()
+           return df[cols]
 
        # storing the column details for round up in data transformation dictionary
        self.data_transform_dict["round_columns"] = float_columns
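to_pandas() moves the teradataml index column out of the regular columns, so the no-float-columns path above now calls reset_index() and reselects the original column list to drop anything extra the reset adds. The pandas side of that pattern, shown standalone:

    import pandas as pd

    # Stand-in for what DataFrame.to_pandas() returns: 'id' has become the index.
    pdf = pd.DataFrame({"amount": [10.5, 20.0]}, index=pd.Index([1, 2], name="id"))

    cols = ["id", "amount"]      # original teradataml column order
    df = pdf.reset_index()       # bring 'id' back as a regular column
    print(df[cols])              # reselect so nothing extra from reset_index() leaks through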
@@ -31,8 +31,11 @@ from teradataml import ScaleTransform
 from teradataml import SimpleImputeTransform
 from teradataml import TargetEncodingTransform
 from teradataml import Transform, UtilFuncs, TeradataConstants
+from teradataml import execute_sql
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.hyperparameter_tuner.utils import _ProgressBar
+from teradataml.options.configure import configure
+from teradataml.common.constants import TeradataConstants
 
 # AutoML Internal libraries
 from teradataml.automl.feature_exploration import _FeatureExplore
@@ -219,11 +222,11 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
        DESCRIPTION:
            Function drops irrelevent columns and adds id column.
        """
-       # Extracting irrelevent column list
+       # Extracting irrelevant column list
        columns_to_be_removed = self.data_transformation_params.get("drop_irrelevent_columns", None)
        if columns_to_be_removed:
            self.data = self.data.drop(columns_to_be_removed, axis=1)
-           self._display_msg(msg="\nUpdated dataset after dropping irrelevent columns :",
+           self._display_msg(msg="\nUpdated dataset after dropping irrelevant columns :",
                              data=self.data,
                              progress_bar=self.progress_bar)
 
@@ -693,22 +696,28 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
        lasso_scale_fit_obj = self.data_transformation_params.get("lasso_scale_fit_obj", None)
        lasso_scale_col = self.data_transformation_params.get("lasso_scale_col", None)
        # Extracting accumulate columns
-       accumulate_cols = self._extract_list(lasso_df.columns, lasso_scale_col)
-       # Scaling dataset
-       lasso_df = ScaleTransform(data=lasso_df,
-                                 object=lasso_scale_fit_obj,
-                                 accumulate=accumulate_cols).result
-       # Displaying scaled dataset
-       self._display_msg(msg="\nUpdated dataset after performing scaling on Lasso selected features :",
-                         data=lasso_df,
-                         progress_bar=self.progress_bar)
+       if lasso_scale_fit_obj is not None:
+           accumulate_cols = self._extract_list(lasso_df.columns, lasso_scale_col)
+           # Scaling dataset
+           lasso_df = ScaleTransform(data=lasso_df,
+                                     object=lasso_scale_fit_obj,
+                                     accumulate=accumulate_cols).result
+           # Displaying scaled dataset
+           self._display_msg(msg="\nUpdated dataset after performing scaling on Lasso selected features :",
+                             data=lasso_df,
+                             progress_bar=self.progress_bar)
 
        # Uploading lasso dataset to table for further use
        table_name = UtilFuncs._generate_temp_table_name(prefix="lasso_new_test",
                                                         table_type = TeradataConstants.TERADATA_TABLE)
+       # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+       # table name in fully qualified format.
+       table_name = UtilFuncs._extract_table_name(table_name)
        # Storing table name mapping for lasso dataset
        self.table_name_mapping[self.data_node_id]["lasso_new_test"] = table_name
-       copy_to_sql(df = lasso_df, table_name= table_name, if_exists="replace")
+       # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+       is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+       copy_to_sql(df = lasso_df, table_name= table_name, if_exists="replace", temporary=is_temporary)
 
    def _feature_selection_rfe_transformation(self):
        """
@@ -730,23 +739,30 @@
        # Extracting fit object and columns for scaling
        rfe_scale_fit_obj = self.data_transformation_params.get("rfe_scale_fit_obj", None)
        rfe_scale_col = self.data_transformation_params.get("rfe_scale_col", None)
-       # Extracting accumulate columns
-       accumulate_cols = self._extract_list(rfe_df.columns, rfe_scale_col)
-       # Scaling on rfe dataset
-       rfe_df = ScaleTransform(data=rfe_df,
-                               object=rfe_scale_fit_obj,
-                               accumulate=accumulate_cols).result
-       # Displaying scaled dataset
-       self._display_msg(msg="\nUpdated dataset after performing scaling on RFE selected features :",
-                         data=rfe_df,
-                         progress_bar=self.progress_bar)
+
+       if rfe_scale_fit_obj is not None:
+           # Extracting accumulate columns
+           accumulate_cols = self._extract_list(rfe_df.columns, rfe_scale_col)
+           # Scaling on rfe dataset
+           rfe_df = ScaleTransform(data=rfe_df,
+                                   object=rfe_scale_fit_obj,
+                                   accumulate=accumulate_cols).result
+           # Displaying scaled dataset
+           self._display_msg(msg="\nUpdated dataset after performing scaling on RFE selected features :",
+                             data=rfe_df,
+                             progress_bar=self.progress_bar)
 
        # Uploading rfe dataset to table for further use
        table_name = UtilFuncs._generate_temp_table_name(prefix="rfe_new_test",
                                                         table_type = TeradataConstants.TERADATA_TABLE)
+       # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+       # table name in fully qualified format.
+       table_name = UtilFuncs._extract_table_name(table_name)
        # Storing table name mapping for rfe dataset
        self.table_name_mapping[self.data_node_id]["rfe_new_test"] = table_name
-       copy_to_sql(df = rfe_df, table_name= table_name, if_exists="replace")
+       # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+       is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+       copy_to_sql(df = rfe_df, table_name= table_name, if_exists="replace", temporary=is_temporary)
 
    def _feature_selection_pca_transformation(self):
        """
@@ -758,17 +774,20 @@
        pca_scale_col = self.data_transformation_params.get("pca_scale_col", None)
        # Extracting accumulate columns
        accumulate_cols = self._extract_list(self.data.columns, pca_scale_col)
-       # Scaling on pca dataset
-       pca_scaled_df = ScaleTransform(data=self.data,
-                                      object=pca_scale_fit_obj,
-                                      accumulate=accumulate_cols).result
-       # Displaying scaled dataset
-       self._display_msg(msg="\nUpdated dataset after performing scaling for PCA feature selection :",
-                         data=pca_scaled_df,
-                         progress_bar=self.progress_bar)
+
+       pca_scaled_df = self.data
+       if pca_scale_fit_obj is not None:
+           # Scaling on pca dataset
+           pca_scaled_df = ScaleTransform(data=self.data,
+                                          object=pca_scale_fit_obj,
+                                          accumulate=accumulate_cols).result
+           # Displaying scaled dataset
+           self._display_msg(msg="\nUpdated dataset after performing scaling for PCA feature selection :",
+                             data=pca_scaled_df,
+                             progress_bar=self.progress_bar)
 
        # Convert to pandas dataframe for applying pca
-       pca_scaled_pd = pca_scaled_df.to_pandas()
+       pca_scaled_pd = pca_scaled_df.to_pandas().reset_index()
        # Extracting pca fit instance for applying pca
        pca_fit_instance = self.data_transformation_params.get("pca_fit_instance", None)
        # Extracting columns for applying pca
@@ -804,6 +823,12 @@
        # Uploading pca dataset to table for further use
        table_name = UtilFuncs._generate_temp_table_name(prefix="pca_new_test",
                                                         table_type = TeradataConstants.TERADATA_TABLE)
+       # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+       # table name in fully qualified format.
+       table_name = UtilFuncs._extract_table_name(table_name)
        # Storing table name mapping for pca dataset
        self.table_name_mapping[self.data_node_id]["pca_new_test"] = table_name
-       copy_to_sql(df = pca_df, table_name=table_name, if_exists="replace")
+       # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+       is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+       copy_to_sql(df = pca_df, table_name=table_name, if_exists="replace", temporary=is_temporary)
+