teradataml 20.0.0.4__py3-none-any.whl → 20.0.0.5__py3-none-any.whl

This diff compares the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of teradataml might be problematic.
Files changed (107)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +86 -13
  3. teradataml/__init__.py +2 -1
  4. teradataml/_version.py +2 -2
  5. teradataml/analytics/analytic_function_executor.py +7 -12
  6. teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
  7. teradataml/analytics/sqle/__init__.py +16 -1
  8. teradataml/analytics/utils.py +15 -1
  9. teradataml/automl/__init__.py +290 -106
  10. teradataml/automl/autodataprep/__init__.py +471 -0
  11. teradataml/automl/data_preparation.py +29 -10
  12. teradataml/automl/data_transformation.py +11 -0
  13. teradataml/automl/feature_engineering.py +64 -4
  14. teradataml/automl/feature_exploration.py +639 -25
  15. teradataml/automl/model_training.py +1 -1
  16. teradataml/clients/auth_client.py +2 -2
  17. teradataml/common/constants.py +61 -26
  18. teradataml/common/messagecodes.py +2 -1
  19. teradataml/common/messages.py +5 -4
  20. teradataml/common/utils.py +255 -37
  21. teradataml/context/context.py +225 -87
  22. teradataml/data/apriori_example.json +22 -0
  23. teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
  24. teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
  25. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
  26. teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
  27. teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
  28. teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
  29. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
  30. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
  31. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
  32. teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
  33. teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
  34. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
  35. teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
  36. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
  37. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
  38. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
  39. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
  40. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  41. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
  42. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
  43. teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
  44. teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
  45. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  46. teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
  47. teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
  48. teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
  49. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  50. teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
  51. teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
  52. teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
  53. teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
  54. teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
  55. teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
  56. teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
  57. teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
  58. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
  59. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
  60. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
  61. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
  62. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
  63. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
  64. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
  65. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
  66. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
  67. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
  68. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
  69. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
  70. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +2 -2
  71. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +1 -1
  72. teradataml/data/ner_dict.csv +8 -0
  73. teradataml/data/ner_input_eng.csv +7 -0
  74. teradataml/data/ner_rule.csv +5 -0
  75. teradataml/data/pos_input.csv +40 -0
  76. teradataml/data/tdnerextractor_example.json +14 -0
  77. teradataml/data/teradataml_example.json +13 -0
  78. teradataml/data/textmorph_example.json +5 -0
  79. teradataml/data/to_num_data.csv +4 -0
  80. teradataml/data/tochar_data.csv +5 -0
  81. teradataml/data/trans_dense.csv +16 -0
  82. teradataml/data/trans_sparse.csv +55 -0
  83. teradataml/dataframe/copy_to.py +37 -26
  84. teradataml/dataframe/data_transfer.py +61 -45
  85. teradataml/dataframe/dataframe.py +130 -50
  86. teradataml/dataframe/dataframe_utils.py +15 -2
  87. teradataml/dataframe/functions.py +109 -9
  88. teradataml/dataframe/sql.py +328 -76
  89. teradataml/dbutils/dbutils.py +33 -13
  90. teradataml/dbutils/filemgr.py +14 -10
  91. teradataml/lib/aed_0_1.dll +0 -0
  92. teradataml/opensource/_base.py +6 -157
  93. teradataml/options/configure.py +4 -5
  94. teradataml/scriptmgmt/UserEnv.py +305 -38
  95. teradataml/scriptmgmt/lls_utils.py +376 -130
  96. teradataml/store/__init__.py +1 -1
  97. teradataml/table_operators/Apply.py +16 -1
  98. teradataml/table_operators/Script.py +20 -1
  99. teradataml/table_operators/table_operator_util.py +58 -9
  100. teradataml/utils/dtypes.py +2 -1
  101. teradataml/utils/internal_buffer.py +22 -2
  102. teradataml/utils/validators.py +313 -57
  103. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +89 -14
  104. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +107 -77
  105. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
  106. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
  107. {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
teradataml/automl/autodataprep/__init__.py (new file)
@@ -0,0 +1,471 @@
+# External libraries
+import pandas as pd
+
+# Teradata libraries
+from teradataml import db_drop_table
+from teradataml.common.constants import AutoMLConstants as aml_const
+from teradataml.common.messages import Messages, MessageCodes
+from teradataml.dataframe.dataframe import DataFrame
+from teradataml.dataframe.copy_to import copy_to_sql
+from teradataml.utils.validators import _Validators
+
+# AutoML internal libraries
+from teradataml import AutoML, TeradataMlException
+
+
+class AutoDataPrep(AutoML):
+
+    def __init__(self,
+                 task_type="Default",
+                 verbose=0,
+                 **kwargs):
+        """
+        DESCRIPTION:
+            AutoDataPrep simplifies the data preparation process by automating
+            the different aspects of data cleaning and transformation, enabling
+            seamless exploration, transformation, and optimization of datasets.
+
+        PARAMETERS:
+            task_type:
+                Optional Argument.
+                Specifies the task type for AutoDataPrep, i.e., whether to apply
+                regression or classification on the provided dataset. If the
+                user wants AutoDataPrep() to decide the task type automatically,
+                then it should be set to "Default".
+                Default Value: "Default"
+                Permitted Values: "Regression", "Classification", "Default"
+                Types: str
+
+            verbose:
+                Optional Argument.
+                Specifies the level of detail printed during execution.
+                Default Value: 0
+                Permitted Values:
+                    * 0: prints the progress bar.
+                    * 1: prints the execution steps.
+                    * 2: prints the intermediate data between the execution of
+                         each step.
+                Types: int
+
+            **kwargs:
+                Specifies the additional arguments for AutoDataPrep. Below are
+                the additional arguments:
+
+                custom_config_file:
+                    Optional Argument.
+                    Specifies the path of the JSON file in case of a custom run.
+                    Types: str
+
+                volatile:
+                    Optional Argument.
+                    Specifies whether to put the interim results of the
+                    functions in a volatile table or not. When set to True,
+                    results are stored in a volatile table, otherwise not.
+                    Default Value: False
+                    Types: bool
+
+                persist:
+                    Optional Argument.
+                    Specifies whether to persist the interim results of the
+                    functions in a table or not. When set to True, results are
+                    persisted in a table; otherwise, results are garbage
+                    collected at the end of the session.
+                    Default Value: False
+                    Types: bool
+
+        RETURNS:
+            Instance of AutoDataPrep.
+
+        RAISES:
+            TeradataMlException, TypeError, ValueError
+
+        EXAMPLES:
+            # Notes:
+            #     1. Get the connection to Vantage to execute the function.
+            #     2. One must import the required functions mentioned in
+            #        the example from teradataml.
+            #     3. The function raises an error if it is not supported on the
+            #        Vantage system the user is connected to.
+
+            # Load the example data.
+            >>> load_example_data("teradataml", "titanic")
+
+            # Create a teradataml DataFrame.
+            >>> titanic = DataFrame.from_table("titanic")
+
+            # Example 1: Run AutoDataPrep for a classification problem.
+            # Scenario: The titanic dataset is used to predict the survival
+            #           of passengers.
+
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)
+
+            # Fit the data.
+            >>> aprep_obj.fit(titanic, titanic.survived)
+
+            # Retrieve the data after Auto Data Preparation.
+            >>> datas = aprep_obj.get_data()
+        """
+        # Initialize the AutoML object.
+        super().__init__(task_type=task_type,
+                         verbose=verbose,
+                         **kwargs)
+
+        # Setting the attributes for AutoDataPrep.
+        super().__setattr__("_auto_dataprep", True)
+        super().__setattr__("model_list", [])
+        super().__setattr__("_phases", ["1. Feature Exploration ->",
+                                        "2. Feature Engineering ->",
+                                        "3. Data Preparation"])
+        super().__setattr__("_progressbar_prefix", 'Auto Data Prep:')
+
+    def fit(self,
+            data,
+            target_column):
+        """
+        DESCRIPTION:
+            Function to fit the data for Auto Data Preparation.
+
+        PARAMETERS:
+            data:
+                Required Argument.
+                Specifies the input data to be used for Auto Data Preparation.
+                Types: DataFrame
+
+            target_column:
+                Required Argument.
+                Specifies the target column to be used for Auto Data Preparation.
+                Types: str
+
+        RETURNS:
+            None
+
+        RAISES:
+            TeradataMlException, ValueError
+
+        EXAMPLES:
+            # Notes:
+            #     1. Get the connection to Vantage to execute the function.
+            #     2. One must import the required functions mentioned in
+            #        the example from teradataml.
+            #     3. The function raises an error if it is not supported on the
+            #        Vantage system the user is connected to.
+
+            # Load the example data.
+            >>> load_example_data("teradataml", "titanic")
+
+            # Create a teradataml DataFrame.
+            >>> titanic = DataFrame.from_table("titanic")
+
+            # Example 1: Run AutoDataPrep for a classification problem.
+            # Scenario: The titanic dataset is used to predict the survival
+            #           of passengers.
+
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)
+
+            # Fit the data.
+            >>> aprep_obj.fit(titanic, titanic.survived)
+        """
+        # Fit the data using the AutoML object.
+        super().fit(data, target_column)
+
+    def get_data(self):
+        """
+        DESCRIPTION:
+            Function to retrieve the data after Auto Data Preparation.
+
+        RETURNS:
+            Dictionary of DataFrames containing the data after Auto Data
+            Preparation.
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            # Notes:
+            #     1. Get the connection to Vantage to execute the function.
+            #     2. One must import the required functions mentioned in
+            #        the example from teradataml.
+            #     3. The function raises an error if it is not supported on the
+            #        Vantage system the user is connected to.
+
+            # Load the example data.
+            >>> load_example_data("teradataml", "titanic")
+
+            # Create a teradataml DataFrame.
+            >>> titanic = DataFrame.from_table("titanic")
+
+            # Example 1: Run AutoDataPrep for a classification problem.
+            # Scenario: The titanic dataset is used to predict the survival
+            #           of passengers.
+
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)
+
+            # Fit the data.
+            >>> aprep_obj.fit(titanic, titanic.survived)
+
+            # Retrieve the data after Auto Data Preparation.
+            >>> datas = aprep_obj.get_data()
+        """
+        # Raise an error if fit() is not called before get_data().
+        _Validators._validate_dependent_method("get_data", "fit", self._is_fit_called)
+
+        datas = {}
+        for key, val in self.table_name_mapping.items():
+            datas[key] = DataFrame(val)
+
+        return datas
+
+    def deploy(self, table_name):
+        """
+        DESCRIPTION:
+            Deploys the AutoDataPrep generated data to the database,
+            i.e., saves the data in the database.
+
+        PARAMETERS:
+            table_name:
+                Required Argument.
+                Specifies the name of the table to store the information
+                of deployed datasets in the database.
+                Types: str
+
+        RETURNS:
+            None
+
+        RAISES:
+            TeradataMlException, ValueError
+
+        EXAMPLES:
+            # Create an instance of the AutoDataPrep.
+            # Perform the fit() operation on the AutoDataPrep object.
+            # Deploy the data to the table.
+
+            >>> from teradataml import AutoDataPrep
+            # Load the example data.
+            >>> load_example_data("teradataml", "titanic")
+            >>> titanic = DataFrame.from_table("titanic")
+
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)
+
+            # Fit the data.
+            >>> aprep_obj.fit(titanic, titanic.survived)
+
+            # Deploy the data to the table.
+            >>> aprep_obj.deploy("table_name")
+        """
+        # Appending arguments to the list for validation.
+        arg_info_matrix = []
+        arg_info_matrix.append(["table_name", table_name, False, (str), True])
+
+        # Validating the arguments.
+        _Validators._validate_function_arguments(arg_info_matrix)
+
+        # Raise an error if fit() is not called before deploy().
+        _Validators._validate_dependent_method("deploy", "fit", self._is_fit_called)
+
+        if self.table_name_mapping is not None and \
+                isinstance(self.table_name_mapping, dict):
+
+            tab_map = {}
+            # If persist is False, then generate a permanent table.
+            if not self.kwargs.get("persist", False):
+                for key, val in self.table_name_mapping.items():
+                    # Persist the data.
+                    per_name = self._create_per_result_table(prefix='{}_'.format(self.target_column),
+                                                             persist_result_table=val)
+                    # Store the table name mapping.
+                    tab_map[key] = per_name
+            else:
+                # Tables are already persisted.
+                tab_map = self.table_name_mapping
+            data = pd.DataFrame(list(tab_map.items()), columns=['Feature_Selection_Method', 'Table_Name'])
+
+            # Save the data to the database.
+            copy_to_sql(df=data, table_name=table_name, if_exists="replace")
+            print("Data deployed successfully to the table: ", table_name)
+            return
+
+        # Raise an error if data is not found or
+        # table_name_mapping is not a dictionary/None.
+        err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
+                                   "'deploy' method",
+                                   "Data not found to deploy.")
+        raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)
+
+    def load(self, table_name):
+        """
+        DESCRIPTION:
+            Loads the AutoDataPrep generated data from the database
+            into the session to use it for model training or scoring.
+
+        PARAMETERS:
+            table_name:
+                Required Argument.
+                Specifies the name of the table containing the information
+                of deployed datasets in the database.
+                Types: str
+
+        RETURNS:
+            Dictionary of DataFrames containing the datasets generated by
+            AutoDataPrep.
+
+        RAISES:
+            TeradataMlException, ValueError
+
+        EXAMPLES:
+            # Create an instance of the AutoDataPrep.
+            # Load the data from the table.
+
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj = AutoDataPrep()
+
+            # Load the data from the table.
+            >>> data = aprep_obj.load("table_name")
+
+            # Retrieve the data.
+            >>> print(data)
+        """
+        # Appending arguments to the list for validation.
+        arg_info_matrix = []
+        arg_info_matrix.append(["table_name", table_name, False, (str), True])
+
+        # Validating the arguments.
+        _Validators._validate_function_arguments(arg_info_matrix)
+
+        # Load the data from the table.
+        load_df = DataFrame(table_name)
+
+        data = {}
+        # Load the data into a dictionary.
+        for mtd, tab_name in load_df.get_values():
+            try:
+                data[mtd] = DataFrame(tab_name)
+            except Exception as e:
+                print(f"Error while loading {mtd} table: ", e)
+                data[mtd] = None
+                continue
+
+        return data
+
+    def delete_data(self,
+                    table_name,
+                    fs_method=None):
+        """
+        DESCRIPTION:
+            Deletes the deployed datasets from the database.
+
+        PARAMETERS:
+            table_name:
+                Required Argument.
+                Specifies the name of the table containing the deployed datasets.
+                Types: str
+
+            fs_method:
+                Optional Argument.
+                Specifies the feature selection method(s) whose datasets are
+                deleted from the deployed datasets.
+                Default Value: None
+                Permitted Values: "lasso", "rfe", "pca"
+                Note:
+                    * If "fs_method" is None, then the method deletes all the
+                      deployed datasets.
+                Types: str or list of str
+
+        RETURNS:
+            None
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            # Create an instance of the AutoDataPrep.
+            # Fit the data.
+            # Deploy the data to the table.
+            # Remove the deployed data from the table.
+
+            # Example 1: Remove the deployed data from the table within the
+            #            AutoDataPrep object.
+
+            >>> from teradataml import AutoDataPrep
+            # Load the example data.
+            >>> load_example_data("teradataml", "titanic")
+            >>> titanic = DataFrame.from_table("titanic")
+
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)
+
+            # Fit the data.
+            >>> aprep_obj.fit(titanic, titanic.survived)
+
+            # Deploy the datasets to the database.
+            >>> aprep_obj.deploy("table_name")
+
+            # Remove the lasso deployed data from the table.
+            >>> aprep_obj.delete_data("table_name", fs_method="lasso")
+
+            # Example 2: Remove the deployed data from the table using a
+            #            different instance of the AutoDataPrep object.
+            # Create an instance of AutoDataPrep.
+            >>> aprep_obj2 = AutoDataPrep()
+
+            # Remove the lasso and pca deployed data from the table.
+            >>> aprep_obj2.delete_data("table_name", fs_method=["lasso", "pca"])
+        """
+        # Appending arguments to the list for validation.
+        arg_info_matrix = []
+        arg_info_matrix.append(["table_name", table_name, False, (str), True])
+        arg_info_matrix.append(["fs_method", fs_method, True, (str, list), True, aml_const.FEATURE_SELECTION_MTDS.value])
+
+        # Validating the arguments.
+        _Validators._validate_function_arguments(arg_info_matrix)
+
+        # Load the data from the table.
+        df = DataFrame(table_name)
+        # Get the values from the loaded DataFrame.
+        values = df.get_values()
+
+        if fs_method is None:
+            # If fs_method is None, then delete all the tables.
+            methods = aml_const.FEATURE_SELECTION_MTDS.value
+        elif isinstance(fs_method, str):
+            # If fs_method is a str, then convert it to a list.
+            methods = [fs_method]
+        else:
+            # If fs_method is a list, then use it as is.
+            methods = fs_method
+        # Convert the methods to lower case.
+        methods = [method.lower() for method in methods]
+
+        filtered_data = []
+        remaining_data = []
+        # Filter the values based on the fs_method.
+        for row in values:
+            if any(cond in row[0] for cond in methods):
+                filtered_data.append(row)
+            else:
+                remaining_data.append(row)
+
+        # Drop the tables.
+        err_flag = False
+        for row in filtered_data:
+            tab_name = row[1]
+            mtd = row[0]
+            try:
+                db_drop_table(tab_name)
+                print(f"Removed {mtd} table successfully.")
+            except Exception as e:
+                print(f"Error while removing {mtd} table: ", e)
+                remaining_data.append(row)
+                err_flag = True
+                continue
+
+        if err_flag:
+            # Print a message if an error occurred while removing deployed data.
+            print("Error occurred while removing deployed data.")
+
+        if len(remaining_data) > 0:
+            rem_data = pd.DataFrame(remaining_data, columns=['Feature_Selection_Method', 'Table_Name'])
+            # Save the data to the database.
+            copy_to_sql(df=rem_data, table_name=table_name, if_exists="replace")
+        elif not err_flag:
+            # Drop the whole table if no data remains.
+            db_drop_table(table_name)
+        print("Deployed data removed successfully.")
teradataml/automl/data_preparation.py
@@ -130,12 +130,12 @@ class _DataPreparation:
         self.task_type = task_type
         self.volatile = kwargs.get("volatile", False)
         self.persist = kwargs.get("persist", False)
+        self.aml_phases = kwargs.get("automl_phases", None)
 
         # Setting default value for auto run mode
         self._data_sampling_method = "SMOTE"
         self._scale_method_reg = "STD"
         self._scale_method_cls = "RANGE"
-        self.table_name_mapping = {}
 
         self.data_types = {key: value for key, value in self.data._column_names_and_types}
         self.seed = kwargs.get("seed", 42)
@@ -145,6 +145,8 @@ class _DataPreparation:
         if kwargs.get("seed") is not None:
             np.random.seed(self.seed)
 
+        self.data_mapping = kwargs.get("data_mapping", {})
+
 
     def data_preparation(self,
                          auto=True):
@@ -167,7 +169,8 @@ class _DataPreparation:
             list of lists containing features selected by rfe, pca and lasso.
         """
         self._display_heading(phase=2,
-                              progress_bar=self.progress_bar)
+                              progress_bar=self.progress_bar,
+                              automl_phases=self.aml_phases)
         self._display_msg(msg='Data preparation started ...',
                           progress_bar=self.progress_bar)
         # Setting user value in case of custom running mode
@@ -210,7 +213,7 @@ class _DataPreparation:
             self._feature_selection_PCA()
             self.progress_bar.update()
 
-        return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict
+        return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict, self.data_mapping
 
     def _handle_outliers(self,
                          auto):
@@ -355,6 +358,9 @@ class _DataPreparation:
             # Adding transformed data containing table to garbage collector
             GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
+        # Returning the outlier fit object to store in the data mapping dictionary
+        return outlier_fit_out
+
     def _outlier_processing(self):
         """
         DESCRIPTION:
@@ -378,7 +384,10 @@ class _DataPreparation:
                               progress_bar=self.progress_bar)
             target_columns = columns_to_drop_rows
             replacement_strategy = "DELETE"
-            self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            fit_obj = self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            self.data_mapping['fit_outlier_delete_output'] = fit_obj.output_data._table_name
+            self.data_mapping['fit_outlier_delete_result'] = self.data._table_name
+            self.data_mapping['outlier_filtered_data'] = self.data._table_name
             self._display_msg(msg="Sample of dataset after removing outlier rows:",
                               data=self.data,
                               progress_bar=self.progress_bar)
@@ -390,7 +399,10 @@ class _DataPreparation:
                               progress_bar=self.progress_bar)
             target_columns = columns_to_impute
             replacement_strategy = "MEDIAN"
-            self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            fit_obj = self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            self.data_mapping['fit_outlier_impute_output'] = fit_obj.output_data._table_name
+            self.data_mapping['fit_outlier_impute_result'] = fit_obj.result._table_name
+            self.data_mapping['outlier_imputed_data'] = self.data._table_name
             self._display_msg(msg="Sample of dataset after performing MEDIAN inplace:",
                               data=self.data,
                               progress_bar=self.progress_bar)
@@ -446,7 +458,10 @@ class _DataPreparation:
                 # Fetching replacement value
                 replacement_value = transform_val["replacement_value"]
                 # Performing outlier handling
-                self._outlier_handling(target_col, outlier_method, replacement_value)
+                fit_obj = self._outlier_handling(target_col, outlier_method, replacement_value)
+                self.data_mapping[f'fit_{target_col}_outlier_output'] = fit_obj.output_data._table_name
+                self.data_mapping[f'fit_{target_col}_outlier_result'] = fit_obj.result._table_name
+                self.data_mapping[f'{target_col}_outlier_treated_data'] = self.data._table_name
             else:
                 self._display_msg(inline_msg="No information provided for feature transformation in outlier handling.",
                                   progress_bar=self.progress_bar)
@@ -491,7 +506,7 @@ class _DataPreparation:
         start_time = time.time()
 
         # Temporarily pulling data for feature selection
-        pca_train = DataFrame.from_table(self.table_name_mapping['pca_train']).to_pandas()
+        pca_train = DataFrame.from_table(self.data_mapping['pca_train']).to_pandas()
 
         # Drop unnecessary columns and store the result
         train_data = pca_train.drop(columns=['id', self.target_column], axis=1)
@@ -759,7 +774,7 @@ class _DataPreparation:
         train_table_name = UtilFuncs._extract_table_name(train_table_name)
 
         # Storing the table names in the table name mapping dictionary
-        self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
+        self.data_mapping['{}_train'.format(prefix)] = train_table_name
 
         # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
         is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
@@ -839,9 +854,9 @@ class _DataPreparation:
 
         # Loading data for feature scaling based on feature selection method
         if feature_selection_mtd == 'rfe':
-            data_to_scale = DataFrame(self.table_name_mapping['rfe_train'])
+            data_to_scale = DataFrame(self.data_mapping['rfe_train'])
         elif feature_selection_mtd == 'lasso':
-            data_to_scale = DataFrame(self.table_name_mapping['lasso_train'])
+            data_to_scale = DataFrame(self.data_mapping['lasso_train'])
         else:
             data_to_scale = self.data
 
@@ -864,6 +879,9 @@ class _DataPreparation:
                             volatile=volatile,
                             persist=persist)
 
+        self.data_mapping[f'fit_scale_{feature_selection_mtd}_output'] = fit_obj.output_data._table_name
+        self.data_mapping[f'fit_scale_{feature_selection_mtd}_result'] = fit_obj.output._table_name
+
         # Storing the scale fit object and columns in the data transformation dictionary
         self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj.output
         self.data_transform_dict['{}_scale_col'.format(feature_selection_mtd)] = scale_col
@@ -965,6 +983,7 @@ class _DataPreparation:
         fit_params["persist"] = False
 
         transform_output = RoundColumns(**fit_params).result
+        self.data_mapping['round_columns_data'] = transform_output._table_name
         if not self.volatile and not self.persist:
             # Adding transformed data containing table to garbage collector
             GarbageCollector._add_to_garbagecollector(transform_output._table_name)
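
The thrust of these hunks: _DataPreparation drops its bare table_name_mapping and instead threads a data_mapping dictionary through every stage, recording the fit output/result tables and the transformed data table each step produces, then returns it alongside the selected features and the transform dictionary. An illustrative sketch of its shape after a run (every table name below is an invented placeholder, not output from the diff):

    data_mapping = {
        # Outlier handling: fit tables plus the treated dataset.
        'fit_outlier_impute_output': 'ml__td_sqlmr_out_1',
        'fit_outlier_impute_result': 'ml__td_sqlmr_out_2',
        'outlier_imputed_data': 'ml__survived_data_3',
        # Per-method train tables, written as '{}_train'.format(prefix).
        'rfe_train': 'ml__survived_rfe_4',
        'lasso_train': 'ml__survived_lasso_5',
        'pca_train': 'ml__survived_pca_6',
        # Scale fit artifacts per feature selection method.
        'fit_scale_rfe_output': 'ml__td_sqlmr_out_7',
        'fit_scale_rfe_result': 'ml__td_sqlmr_out_8',
        # Final rounding step.
        'round_columns_data': 'ml__td_sqlmr_out_9',
    }

    # Any recorded stage can then be rehydrated as a teradataml DataFrame:
    # DataFrame(data_mapping['pca_train'])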
teradataml/automl/data_transformation.py
@@ -15,6 +15,7 @@
 
 # Python libraries
 import pandas as pd
+import warnings
 
 # Teradata libraries
 from teradataml.dataframe.dataframe import DataFrame
@@ -468,6 +469,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         custom_target_encoding_ind = self.data_transformation_params.get("custom_target_encoding_ind", False)
         custom_target_encoding_fit_obj = self.data_transformation_params.get("custom_target_encoding_fit_obj", None)
         if custom_target_encoding_ind:
+            warn_cols = []
             for col, tar_fit_obj in custom_target_encoding_fit_obj.items():
                 # Extracting accumulate columns
                 accumulate_columns = self._extract_list(self.data.columns, [col])
@@ -483,6 +485,15 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
                 self.data = TargetEncodingTransform(**transform_params).result
                 # Adding transformed data containing table to garbage collector
                 GarbageCollector._add_to_garbagecollector(self.data._table_name)
+                if self.data[self.data[col] == -1].shape[0] > 0:
+                    warn_cols.append(col)
+
+            # Checking for unseen values in target encoding columns
+            if len(warn_cols) > 0:
+                warnings.warn(message=f"Unseen categorical values found in test data column(s): {warn_cols}. "
+                                      "This may cause inaccurate predictions. "
+                                      "Consider retraining the model with updated data.",
+                              stacklevel=0)
+
             self._display_msg(msg="\nUpdated dataset after performing customized categorical encoding :",
                               data=self.data,
                               progress_bar=self.progress_bar)
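
The second hunk is the substantive change here: after each TargetEncodingTransform call, the encoded column is scanned for the -1 sentinel that marks category values absent from the training data, and one consolidated warning is emitted for all affected columns. A standalone pandas sketch of the same detect-and-warn pattern, under the diff's assumption that -1 flags unseen values (the data is invented):

    import warnings
    import pandas as pd

    # Hypothetical encoded test data: -1 marks a category unseen at training time.
    encoded = pd.DataFrame({'cabin': [0.31, -1.0, 0.55],
                            'embarked': [0.42, 0.40, 0.38]})

    # Collect every column that contains the unseen-value sentinel.
    warn_cols = [col for col in encoded.columns if (encoded[col] == -1).any()]

    # Warn once for all affected columns rather than once per column.
    if warn_cols:
        warnings.warn(f"Unseen categorical values found in test data column(s): {warn_cols}. "
                      "This may cause inaccurate predictions. "
                      "Consider retraining the model with updated data.")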