teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (151)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +193 -1
  3. teradataml/__init__.py +2 -1
  4. teradataml/_version.py +2 -2
  5. teradataml/analytics/analytic_function_executor.py +25 -18
  6. teradataml/analytics/byom/__init__.py +1 -1
  7. teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
  8. teradataml/analytics/sqle/__init__.py +20 -2
  9. teradataml/analytics/utils.py +15 -1
  10. teradataml/analytics/valib.py +18 -4
  11. teradataml/automl/__init__.py +341 -112
  12. teradataml/automl/autodataprep/__init__.py +471 -0
  13. teradataml/automl/data_preparation.py +84 -42
  14. teradataml/automl/data_transformation.py +69 -33
  15. teradataml/automl/feature_engineering.py +76 -9
  16. teradataml/automl/feature_exploration.py +639 -25
  17. teradataml/automl/model_training.py +35 -14
  18. teradataml/clients/auth_client.py +2 -2
  19. teradataml/common/__init__.py +1 -2
  20. teradataml/common/constants.py +122 -63
  21. teradataml/common/messagecodes.py +14 -3
  22. teradataml/common/messages.py +8 -4
  23. teradataml/common/sqlbundle.py +40 -10
  24. teradataml/common/utils.py +366 -74
  25. teradataml/common/warnings.py +11 -0
  26. teradataml/context/context.py +348 -86
  27. teradataml/data/amazon_reviews_25.csv +26 -0
  28. teradataml/data/apriori_example.json +22 -0
  29. teradataml/data/byom_example.json +11 -0
  30. teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
  31. teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
  32. teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
  33. teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
  34. teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
  35. teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
  36. teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
  37. teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
  38. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
  39. teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
  40. teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
  41. teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
  42. teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
  43. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
  44. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
  45. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
  46. teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
  47. teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
  48. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
  49. teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
  50. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
  51. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
  52. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
  53. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
  54. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
  55. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  56. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
  57. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
  58. teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
  59. teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
  60. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  61. teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
  62. teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
  63. teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
  64. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  65. teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
  66. teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
  67. teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
  68. teradataml/data/hnsw_alter_data.csv +5 -0
  69. teradataml/data/hnsw_data.csv +10 -0
  70. teradataml/data/jsons/byom/h2opredict.json +1 -1
  71. teradataml/data/jsons/byom/onnxembeddings.json +266 -0
  72. teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
  73. teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
  74. teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
  75. teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
  76. teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
  77. teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
  78. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
  79. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
  80. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
  81. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
  82. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
  83. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
  84. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
  85. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
  86. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
  87. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
  88. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
  89. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
  90. teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
  91. teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
  92. teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
  93. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
  94. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +3 -3
  95. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +6 -6
  96. teradataml/data/ner_dict.csv +8 -0
  97. teradataml/data/ner_input_eng.csv +7 -0
  98. teradataml/data/ner_rule.csv +5 -0
  99. teradataml/data/pos_input.csv +40 -0
  100. teradataml/data/tdnerextractor_example.json +14 -0
  101. teradataml/data/teradataml_example.json +21 -0
  102. teradataml/data/textmorph_example.json +5 -0
  103. teradataml/data/to_num_data.csv +4 -0
  104. teradataml/data/tochar_data.csv +5 -0
  105. teradataml/data/trans_dense.csv +16 -0
  106. teradataml/data/trans_sparse.csv +55 -0
  107. teradataml/data/vectordistance_example.json +1 -1
  108. teradataml/dataframe/copy_to.py +45 -29
  109. teradataml/dataframe/data_transfer.py +72 -46
  110. teradataml/dataframe/dataframe.py +642 -166
  111. teradataml/dataframe/dataframe_utils.py +167 -22
  112. teradataml/dataframe/functions.py +135 -20
  113. teradataml/dataframe/setop.py +11 -6
  114. teradataml/dataframe/sql.py +330 -78
  115. teradataml/dbutils/dbutils.py +556 -140
  116. teradataml/dbutils/filemgr.py +14 -10
  117. teradataml/hyperparameter_tuner/optimizer.py +12 -1
  118. teradataml/lib/aed_0_1.dll +0 -0
  119. teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +168 -1013
  120. teradataml/opensource/_class.py +141 -17
  121. teradataml/opensource/{constants.py → _constants.py} +7 -3
  122. teradataml/opensource/_lightgbm.py +52 -53
  123. teradataml/opensource/_sklearn.py +1008 -0
  124. teradataml/opensource/_wrapper_utils.py +5 -5
  125. teradataml/options/__init__.py +47 -15
  126. teradataml/options/configure.py +103 -26
  127. teradataml/options/display.py +13 -2
  128. teradataml/plot/axis.py +47 -8
  129. teradataml/plot/figure.py +33 -0
  130. teradataml/plot/plot.py +63 -13
  131. teradataml/scriptmgmt/UserEnv.py +307 -40
  132. teradataml/scriptmgmt/lls_utils.py +428 -145
  133. teradataml/store/__init__.py +2 -3
  134. teradataml/store/feature_store/feature_store.py +102 -7
  135. teradataml/table_operators/Apply.py +48 -19
  136. teradataml/table_operators/Script.py +23 -2
  137. teradataml/table_operators/TableOperator.py +3 -1
  138. teradataml/table_operators/table_operator_util.py +58 -9
  139. teradataml/utils/dtypes.py +49 -1
  140. teradataml/utils/internal_buffer.py +38 -0
  141. teradataml/utils/validators.py +377 -62
  142. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +200 -4
  143. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +146 -112
  144. teradataml/data/SQL_Fundamentals.pdf +0 -0
  145. teradataml/libaed_0_1.dylib +0 -0
  146. teradataml/libaed_0_1.so +0 -0
  147. teradataml/opensource/sklearn/__init__.py +0 -0
  148. teradataml/store/vector_store/__init__.py +0 -1586
  149. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
  150. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
  151. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
teradataml/automl/data_preparation.py

@@ -16,7 +16,6 @@
  # Python libraries
  import numpy as np
  import pandas as pd
- import random
  import time
  import warnings

@@ -30,11 +29,9 @@ from teradataml import UtilFuncs, TeradataConstants
  from teradataml.common.garbagecollector import GarbageCollector
  from teradataml.common.messages import Messages, MessageCodes
  from teradataml.utils.validators import _Validators
- from teradataml import INTEGER
+ from teradataml import configure, INTEGER
+ from teradataml.common.constants import TeradataConstants

- # Control Randomnes
- random.seed(42)
- np.random.seed(42)

  class _DataPreparation:

@@ -117,6 +114,12 @@ class _DataPreparation:
  session.
  Default Value: False
  Types: bool
+
+ seed:
+ Optional Argument.
+ Specifies the random seed for reproducibility.
+ Default Value: 42
+ Types: int
  """
  self.data = data
  self.target_column = target_column
@@ -127,14 +130,22 @@ class _DataPreparation:
  self.task_type = task_type
  self.volatile = kwargs.get("volatile", False)
  self.persist = kwargs.get("persist", False)
+ self.aml_phases = kwargs.get("automl_phases", None)

  # Setting default value for auto run mode
  self._data_sampling_method = "SMOTE"
  self._scale_method_reg = "STD"
  self._scale_method_cls = "RANGE"
- self.table_name_mapping = {}

  self.data_types = {key: value for key, value in self.data._column_names_and_types}
+ self.seed = kwargs.get("seed", 42)
+ # np.random.seed() affects the random number generation in numpy and sklearn
+ # setting this changes the global state of the random number generator
+ # hence, setting the seed only if it is not None
+ if kwargs.get("seed") is not None:
+ np.random.seed(self.seed)
+
+ self.data_mapping = kwargs.get("data_mapping", {})


  def data_preparation(self,
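The new optional `seed` keyword only touches global random state when the caller actually supplies a value. A minimal standalone sketch of that pattern (the class name here is illustrative, not the actual teradataml internals):

    import numpy as np

    class SeededStep:
        def __init__(self, **kwargs):
            # Documented default is 42 ...
            self.seed = kwargs.get("seed", 42)
            # ... but np.random.seed() mutates the global NumPy RNG (which
            # sklearn also draws from), so it is only called when a seed was
            # passed explicitly.
            if kwargs.get("seed") is not None:
                np.random.seed(self.seed)

    SeededStep()        # leaves the global RNG untouched
    SeededStep(seed=7)  # seeds NumPy (and hence sklearn defaults) with 7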
@@ -158,7 +169,8 @@ class _DataPreparation:
  list of lists containing, feature selected by rfe, pca and lasso.
  """
  self._display_heading(phase=2,
- progress_bar=self.progress_bar)
+ progress_bar=self.progress_bar,
+ automl_phases=self.aml_phases)
  self._display_msg(msg='Data preparation started ...',
  progress_bar=self.progress_bar)
  # Setting user value in case of custom running mode
@@ -201,7 +213,7 @@ class _DataPreparation:
  self._feature_selection_PCA()
  self.progress_bar.update()

- return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict
+ return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict, self.data_mapping

  def _handle_outliers(self,
  auto):
@@ -262,25 +274,24 @@ class _DataPreparation:
  outlier_method = "Tukey"

  # List of columns for outlier processing.
- outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns]
+ # Excluding target column and excluded columns from outlier processing
+ outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns + ['id', self.target_column]]

- # Detecting outlier percentage in each columns
- outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
-
- # Outlier Handling techniques
- for i in outlier_percentage_df.itertuples():
- # Column Name
- col = i[0]
- # Outlier value
- value = i[1]
-
- if col == self.target_column:
- if value < 5.0 and value > 0.0:
+ if len(outlier_columns) != 0:
+ # Detecting outlier percentage in each columns
+ outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
+
+ # Outlier Handling techniques
+ for i in outlier_percentage_df.itertuples():
+ # Column Name
+ col = i[0]
+ # Outlier value
+ value = i[1]
+ # Dropping rows
+ if value > 0.0 and value <= 8.0 :
  columns_to_drop_rows.append(col)
- elif value > 0.0 and value <= 8.0 :
- columns_to_drop_rows.append(col)
- elif value> 8.0 and value <= 25.0:
- columns_to_impute.append(col)
+ elif value> 8.0 and value <= 25.0:
+ columns_to_impute.append(col)

  return columns_to_drop_rows, columns_to_impute

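The reworked `_handle_outliers` logic buckets each column by its outlier percentage: up to 8% of flagged rows leads to row deletion, more than 8% and up to 25% leads to median imputation, and anything higher is left alone; the old special-casing of the target column is gone because the target is now excluded upfront. A standalone restatement of the rule, with made-up column names and percentages:

    def bucket_outlier_columns(outlier_pct):
        # outlier_pct: mapping of column name -> % of rows flagged as outliers.
        columns_to_drop_rows, columns_to_impute = [], []
        for col, value in outlier_pct.items():
            if 0.0 < value <= 8.0:        # few outliers: drop the offending rows
                columns_to_drop_rows.append(col)
            elif 8.0 < value <= 25.0:     # moderate outliers: impute with MEDIAN
                columns_to_impute.append(col)
            # above 25%: column left untouched by this step
        return columns_to_drop_rows, columns_to_impute

    print(bucket_outlier_columns({"age": 2.5, "income": 12.0, "score": 40.0}))
    # (['age'], ['income'])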
@@ -347,6 +358,9 @@ class _DataPreparation:
  # Adding transformed data containing table to garbage collector
  GarbageCollector._add_to_garbagecollector(self.data._table_name)

+ # Returning outlier fit object to store in data mapping dictionary
+ return outlier_fit_out
+
  def _outlier_processing(self):
  """
  DESCRIPTION:
@@ -370,7 +384,10 @@ class _DataPreparation:
  progress_bar=self.progress_bar)
  target_columns=columns_to_drop_rows
  replacement_strategy = "DELETE"
- self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+ fit_obj = self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+ self.data_mapping['fit_outlier_delete_output'] = fit_obj.output_data._table_name
+ self.data_mapping['fit_outlier_delete_result'] = self.data._table_name
+ self.data_mapping['outlier_filtered_data'] = self.data._table_name
  self._display_msg(msg="Sample of dataset after removing outlier rows:",
  data=self.data,
  progress_bar=self.progress_bar)
@@ -382,7 +399,10 @@ class _DataPreparation:
  progress_bar=self.progress_bar)
  target_columns=columns_to_impute
  replacement_strategy = "MEDIAN"
- self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+ fit_obj = self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+ self.data_mapping['fit_outlier_impute_output'] = fit_obj.output_data._table_name
+ self.data_mapping['fit_outlier_impute_result'] = fit_obj.result._table_name
+ self.data_mapping['outlier_imputed_data'] = self.data._table_name
  self._display_msg(msg="Sample of dataset after performing MEDIAN inplace:",
  data=self.data,
  progress_bar=self.progress_bar)
@@ -438,7 +458,10 @@ class _DataPreparation:
  # Fetching replacement value
  replacement_value = transform_val["replacement_value"]
  # Performing outlier handling
- self._outlier_handling(target_col, outlier_method, replacement_value)
+ fit_obj = self._outlier_handling(target_col, outlier_method, replacement_value)
+ self.data_mapping[f'fit_{target_col}_outlier_output'] = fit_obj.output_data._table_name
+ self.data_mapping[f'fit_{target_col}_outlier_result'] = fit_obj.result._table_name
+ self.data_mapping[f'{target_col}_outlier_treated_data'] = self.data._table_name
  else:
  self._display_msg(inline_msg="No information provided for feature transformation in outlier handling.",
  progress_bar=self.progress_bar)
@@ -483,13 +506,13 @@ class _DataPreparation:
  start_time = time.time()

  # Temporary Pulling data for feature selection
- pca_train = DataFrame.from_table(self.table_name_mapping['pca_train']).to_pandas()
+ pca_train = DataFrame.from_table(self.data_mapping['pca_train']).to_pandas()

  # Drop unnecessary columns and store the result
  train_data = pca_train.drop(columns=['id', self.target_column], axis=1)

  # Initialize and fit PCA
- pca = PCA()
+ pca = PCA(random_state=self.seed)
  pca.fit(train_data)

  # Find the number of components for PCA
@@ -497,7 +520,7 @@ class _DataPreparation:
  n = np.argmax(np.cumsum(variance) >= 0.95) + 1

  # Create a new instance of PCA with the optimal number of components
- pca = PCA(n_components=n, random_state=42)
+ pca = PCA(n_components=n, random_state=self.seed)

  # Apply PCA on dataset
  X_train_pca = pca.fit_transform(train_data)
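PCA-based selection keeps the smallest number of components that explains at least 95% of the variance, and `random_state` is now threaded from `self.seed` instead of a hard-coded 42. A self-contained sketch of that computation on synthetic data (plain scikit-learn, names not tied to the teradataml code):

    import numpy as np
    from sklearn.decomposition import PCA

    seed = 42
    rng = np.random.default_rng(seed)
    train_data = rng.normal(size=(200, 10))

    # First fit: inspect the explained-variance profile.
    pca = PCA(random_state=seed)
    pca.fit(train_data)
    variance = pca.explained_variance_ratio_

    # Smallest component count whose cumulative variance reaches 95%.
    n = np.argmax(np.cumsum(variance) >= 0.95) + 1

    # Refit with exactly that many components and project the data.
    pca = PCA(n_components=n, random_state=seed)
    X_train_pca = pca.fit_transform(train_data)
    print(n, X_train_pca.shape)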
@@ -571,7 +594,7 @@ class _DataPreparation:

  # Random forest for RFE model
  RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
- rf = RFModel(n_estimators=100, random_state=42)
+ rf = RFModel(n_estimators=100, random_state=self.seed)

  # Determine the scoring metric based on the number of unique classes
  score = 'r2' if not self.is_classification_type() \
@@ -665,10 +688,10 @@ class _DataPreparation:
  scoring_metric = 'roc_auc'
  else:
  scoring_metric = 'f1_macro'
- estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=42)
+ estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=self.seed)
  parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
  else:
- estimator = Lasso(random_state=42)
+ estimator = Lasso(random_state=self.seed)
  parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
  scoring_metric = "r2"

@@ -679,7 +702,7 @@ class _DataPreparation:

  # Applying hyperparameter tuning and optimizing score
  hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
- scoring=scoring_metric, verbose=0)
+ scoring=scoring_metric, verbose=0)

  # Fitting the best result from hyperparameter
  hyperparameter_search.fit(train_features, train_target)
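Lasso-based feature selection picks the estimator and parameter grid by task type and tunes it with GridSearchCV, with the seed now flowing into the estimator as well. A compressed, self-contained sketch of the regression branch on synthetic data (the coefficient-based selection at the end is our own illustration, not code quoted from the diff):

    import numpy as np
    from sklearn.linear_model import Lasso
    from sklearn.model_selection import GridSearchCV

    seed = 42
    rng = np.random.default_rng(seed)
    train_features = rng.normal(size=(150, 8))
    train_target = train_features @ rng.normal(size=8) + rng.normal(scale=0.1, size=150)

    estimator = Lasso(random_state=seed)
    parameters = {'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 10, 100, 1000],
                  'max_iter': [100, 500]}

    hyperparameter_search = GridSearchCV(estimator, parameters, cv=5, refit=True,
                                         scoring='r2', verbose=0)
    hyperparameter_search.fit(train_features, train_target)

    # Features with a non-zero coefficient in the best model are the ones kept.
    selected = np.flatnonzero(hyperparameter_search.best_estimator_.coef_ != 0)
    print(hyperparameter_search.best_params_, selected)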
@@ -746,14 +769,20 @@ class _DataPreparation:
  train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
  table_type = TeradataConstants.TERADATA_TABLE,
  gc_on_quit=not persist)
+ # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+ # table name in fully qualified format.
+ train_table_name = UtilFuncs._extract_table_name(train_table_name)
+
  # Storing the table names in the table name mapping dictionary
- self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
+ self.data_mapping['{}_train'.format(prefix)] = train_table_name

+ # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+ is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
  # Pushing data into database
  if self.is_classification_type():
- copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+ copy_to_sql(df=data, table_name=train_table_name, temporary=is_temporary, if_exists="replace", types={f'{self.target_column}': INTEGER})
  else:
- copy_to_sql(df=data, table_name=train_table_name, if_exists="replace")
+ copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", temporary=is_temporary)

  def _scaling_features_helper(self,
  data=None,
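The same volatile-table handling recurs wherever AutoML persists an intermediate result: when `configure.temp_object_type` selects volatile tables, the generated name comes back fully qualified and `copy_to_sql()` must be told the target is temporary. A hedged sketch of the pattern (assumes an active teradataml connection; the `UtilFuncs` helpers are the internal utilities referenced in the diff, not public API):

    import pandas as pd
    from teradataml import copy_to_sql, configure, UtilFuncs
    from teradataml.common.constants import TeradataConstants

    data = pd.DataFrame({"id": [1, 2, 3], "target": [0, 1, 0]})

    table_name = UtilFuncs._generate_temp_table_name(prefix="demo_train",
                                                     table_type=TeradataConstants.TERADATA_TABLE)
    # With configure.temp_object_type = "VT" the generated name is fully
    # qualified, so only the bare table name is kept.
    table_name = UtilFuncs._extract_table_name(table_name)

    # Volatile tables must be created through copy_to_sql(temporary=True).
    is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
    copy_to_sql(df=data, table_name=table_name, if_exists="replace", temporary=is_temporary)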
@@ -825,9 +854,9 @@ class _DataPreparation:

  # Loading data for feature scaling based of feature selection method
  if feature_selection_mtd == 'rfe':
- data_to_scale = DataFrame(self.table_name_mapping['rfe_train'])
+ data_to_scale = DataFrame(self.data_mapping['rfe_train'])
  elif feature_selection_mtd == 'lasso':
- data_to_scale = DataFrame(self.table_name_mapping['lasso_train'])
+ data_to_scale = DataFrame(self.data_mapping['lasso_train'])
  else:
  data_to_scale = self.data

@@ -850,12 +879,16 @@ class _DataPreparation:
  volatile=volatile,
  persist=persist)

+ self.data_mapping[f'fit_scale_{feature_selection_mtd}_output'] = fit_obj.output_data._table_name
+ self.data_mapping[f'fit_scale_{feature_selection_mtd}_result'] = fit_obj.output._table_name
+
  # storing the scale fit object and columns in data transformation dictionary
  self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj.output
  self.data_transform_dict['{}_scale_col'.format(feature_selection_mtd)] = scale_col

  # List of columns to copy to the output generated by scale transform
  accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
+

  # Scaling dataset
  transform_obj = ScaleTransform(data=data_to_scale,
@@ -867,6 +900,8 @@ class _DataPreparation:
  data=scaled_df,
  progress_bar=self.progress_bar)
  else:
+ # No columns to scale, Original data will be used
+ scaled_df = data_to_scale
  self._display_msg(msg="No columns to scale.",
  progress_bar=self.progress_bar)

@@ -915,10 +950,16 @@ class _DataPreparation:
  # Assigning data to target dataframe
  target_df = self.data
  # Detecting list of float columns on target dataset
- float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]
+ float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float", "decimal.Decimal"]]

  if len(float_columns) == 0:
- return target_df.to_pandas()
+ cols = target_df.columns
+ # Doing reset index to get index column
+ df = target_df.to_pandas().reset_index()
+
+ # Returning the dataframe with cols
+ # to avoid extra columns generated by reset_index()
+ return df[cols]

  # storing the column details for round up in data transformation dictionary
  self.data_transform_dict["round_columns"] = float_columns
@@ -942,6 +983,7 @@ class _DataPreparation:
  fit_params["persist"] = False

  transform_output = RoundColumns(**fit_params).result
+ self.data_mapping['round_columns_data'] = transform_output._table_name
  if not self.volatile and not self.persist:
  # Adding transformed data containing table to garbage collector
  GarbageCollector._add_to_garbagecollector(transform_output._table_name)
teradataml/automl/data_transformation.py

@@ -15,6 +15,7 @@

  # Python libraries
  import pandas as pd
+ import warnings

  # Teradata libraries
  from teradataml.dataframe.dataframe import DataFrame
@@ -31,8 +32,11 @@ from teradataml import ScaleTransform
  from teradataml import SimpleImputeTransform
  from teradataml import TargetEncodingTransform
  from teradataml import Transform, UtilFuncs, TeradataConstants
+ from teradataml import execute_sql
  from teradataml.common.garbagecollector import GarbageCollector
  from teradataml.hyperparameter_tuner.utils import _ProgressBar
+ from teradataml.options.configure import configure
+ from teradataml.common.constants import TeradataConstants

  # AutoML Internal libraries
  from teradataml.automl.feature_exploration import _FeatureExplore
@@ -219,11 +223,11 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
  DESCRIPTION:
  Function drops irrelevent columns and adds id column.
  """
- # Extracting irrelevent column list
+ # Extracting irrelevant column list
  columns_to_be_removed = self.data_transformation_params.get("drop_irrelevent_columns", None)
  if columns_to_be_removed:
  self.data = self.data.drop(columns_to_be_removed, axis=1)
- self._display_msg(msg="\nUpdated dataset after dropping irrelevent columns :",
+ self._display_msg(msg="\nUpdated dataset after dropping irrelevant columns :",
  data=self.data,
  progress_bar=self.progress_bar)

@@ -465,6 +469,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
  custom_target_encoding_ind = self.data_transformation_params.get("custom_target_encoding_ind", False)
  custom_target_encoding_fit_obj = self.data_transformation_params.get("custom_target_encoding_fit_obj", None)
  if custom_target_encoding_ind:
+ warn_cols = []
  for col, tar_fit_obj in custom_target_encoding_fit_obj.items():
  # Extracting accumulate columns
  accumulate_columns = self._extract_list(self.data.columns, [col])
@@ -480,6 +485,15 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
  self.data = TargetEncodingTransform(**transform_params).result
  # Adding transformed data containing table to garbage collector
  GarbageCollector._add_to_garbagecollector(self.data._table_name)
+ if self.data[self.data[col] == -1].shape[0] > 0:
+ warn_cols.append(col)
+
+ # Checking for unseen values in target encoding columns
+ if len(warn_cols) > 0:
+ warnings.warn(message=f"Unseen categorical values found in test data column(s): {warn_cols}. \
+ This may cause inaccurate predictions. Consider retraining the model with updated data.",
+ stacklevel=0)
+
  self._display_msg(msg="\nUpdated dataset after performing customized categorical encoding :",
  data=self.data,
  progress_bar=self.progress_bar)
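The new warning treats an encoded value of -1 in a target-encoded column as a marker for a category that was not seen when the encoder was fitted. A standalone restatement with plain pandas stand-ins (fabricated data; in the real code the check runs against a teradataml DataFrame produced by TargetEncodingTransform):

    import warnings
    import pandas as pd

    encoded = pd.DataFrame({
        "city": [0.42, 0.17, -1, 0.42],   # -1 flags a category unseen at fit time
        "plan": [0.33, 0.33, 0.10, 0.10],
    })

    warn_cols = [col for col in encoded.columns if (encoded[col] == -1).sum() > 0]

    if len(warn_cols) > 0:
        warnings.warn(f"Unseen categorical values found in test data column(s): {warn_cols}. "
                      "This may cause inaccurate predictions. Consider retraining the model "
                      "with updated data.")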
@@ -693,22 +707,28 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
  lasso_scale_fit_obj = self.data_transformation_params.get("lasso_scale_fit_obj", None)
  lasso_scale_col = self.data_transformation_params.get("lasso_scale_col", None)
  # Extracting accumulate columns
- accumulate_cols = self._extract_list(lasso_df.columns, lasso_scale_col)
- # Scaling dataset
- lasso_df = ScaleTransform(data=lasso_df,
- object=lasso_scale_fit_obj,
- accumulate=accumulate_cols).result
- # Displaying scaled dataset
- self._display_msg(msg="\nUpdated dataset after performing scaling on Lasso selected features :",
- data=lasso_df,
- progress_bar=self.progress_bar)
+ if lasso_scale_fit_obj is not None:
+ accumulate_cols = self._extract_list(lasso_df.columns, lasso_scale_col)
+ # Scaling dataset
+ lasso_df = ScaleTransform(data=lasso_df,
+ object=lasso_scale_fit_obj,
+ accumulate=accumulate_cols).result
+ # Displaying scaled dataset
+ self._display_msg(msg="\nUpdated dataset after performing scaling on Lasso selected features :",
+ data=lasso_df,
+ progress_bar=self.progress_bar)

  # Uploading lasso dataset to table for further use
  table_name = UtilFuncs._generate_temp_table_name(prefix="lasso_new_test",
  table_type = TeradataConstants.TERADATA_TABLE)
+ # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+ # table name in fully qualified format.
+ table_name = UtilFuncs._extract_table_name(table_name)
  # Storing table name mapping for lasso dataset
  self.table_name_mapping[self.data_node_id]["lasso_new_test"] = table_name
- copy_to_sql(df = lasso_df, table_name= table_name, if_exists="replace")
+ # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+ is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+ copy_to_sql(df = lasso_df, table_name= table_name, if_exists="replace", temporary=is_temporary)

  def _feature_selection_rfe_transformation(self):
  """
@@ -730,23 +750,30 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
  # Extracting fit object and columns for scaling
  rfe_scale_fit_obj = self.data_transformation_params.get("rfe_scale_fit_obj", None)
  rfe_scale_col = self.data_transformation_params.get("rfe_scale_col", None)
- # Extracting accumulate columns
- accumulate_cols = self._extract_list(rfe_df.columns, rfe_scale_col)
- # Scaling on rfe dataset
- rfe_df = ScaleTransform(data=rfe_df,
- object=rfe_scale_fit_obj,
- accumulate=accumulate_cols).result
- # Displaying scaled dataset
- self._display_msg(msg="\nUpdated dataset after performing scaling on RFE selected features :",
- data=rfe_df,
- progress_bar=self.progress_bar)
+
+ if rfe_scale_fit_obj is not None:
+ # Extracting accumulate columns
+ accumulate_cols = self._extract_list(rfe_df.columns, rfe_scale_col)
+ # Scaling on rfe dataset
+ rfe_df = ScaleTransform(data=rfe_df,
+ object=rfe_scale_fit_obj,
+ accumulate=accumulate_cols).result
+ # Displaying scaled dataset
+ self._display_msg(msg="\nUpdated dataset after performing scaling on RFE selected features :",
+ data=rfe_df,
+ progress_bar=self.progress_bar)

  # Uploading rfe dataset to table for further use
  table_name = UtilFuncs._generate_temp_table_name(prefix="rfe_new_test",
  table_type = TeradataConstants.TERADATA_TABLE)
+ # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+ # table name in fully qualified format.
+ table_name = UtilFuncs._extract_table_name(table_name)
  # Storing table name mapping for rfe dataset
  self.table_name_mapping[self.data_node_id]["rfe_new_test"] = table_name
- copy_to_sql(df = rfe_df, table_name= table_name, if_exists="replace")
+ # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+ is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+ copy_to_sql(df = rfe_df, table_name= table_name, if_exists="replace", temporary=is_temporary)

  def _feature_selection_pca_transformation(self):
  """
@@ -758,17 +785,20 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
  pca_scale_col = self.data_transformation_params.get("pca_scale_col", None)
  # Extracting accumulate columns
  accumulate_cols = self._extract_list(self.data.columns, pca_scale_col)
- # Scaling on pca dataset
- pca_scaled_df = ScaleTransform(data=self.data,
- object=pca_scale_fit_obj,
- accumulate=accumulate_cols).result
- # Displaying scaled dataset
- self._display_msg(msg="\nUpdated dataset after performing scaling for PCA feature selection :",
- data=pca_scaled_df,
- progress_bar=self.progress_bar)
+
+ pca_scaled_df = self.data
+ if pca_scale_fit_obj is not None:
+ # Scaling on pca dataset
+ pca_scaled_df = ScaleTransform(data=self.data,
+ object=pca_scale_fit_obj,
+ accumulate=accumulate_cols).result
+ # Displaying scaled dataset
+ self._display_msg(msg="\nUpdated dataset after performing scaling for PCA feature selection :",
+ data=pca_scaled_df,
+ progress_bar=self.progress_bar)

  # Convert to pandas dataframe for applying pca
- pca_scaled_pd = pca_scaled_df.to_pandas()
+ pca_scaled_pd = pca_scaled_df.to_pandas().reset_index()
  # Extracting pca fit instance for applying pca
  pca_fit_instance = self.data_transformation_params.get("pca_fit_instance", None)
  # Extracting columns for applying pca
@@ -804,6 +834,12 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
  # Uploading pca dataset to table for further use
  table_name = UtilFuncs._generate_temp_table_name(prefix="pca_new_test",
  table_type = TeradataConstants.TERADATA_TABLE)
+ # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+ # table name in fully qualified format.
+ table_name = UtilFuncs._extract_table_name(table_name)
  # Storing table name mapping for pca dataset
  self.table_name_mapping[self.data_node_id]["pca_new_test"] = table_name
- copy_to_sql(df = pca_df, table_name=table_name, if_exists="replace")
+ # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+ is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
+ copy_to_sql(df = pca_df, table_name=table_name, if_exists="replace", temporary=is_temporary)
+