teradataml 20.0.0.4__py3-none-any.whl → 20.0.0.6__py3-none-any.whl
This diff compares publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of teradataml has been flagged as possibly problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +182 -13
- teradataml/__init__.py +2 -1
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +8 -13
- teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
- teradataml/analytics/sqle/__init__.py +16 -1
- teradataml/analytics/utils.py +60 -1
- teradataml/automl/__init__.py +290 -106
- teradataml/automl/autodataprep/__init__.py +471 -0
- teradataml/automl/data_preparation.py +29 -10
- teradataml/automl/data_transformation.py +11 -0
- teradataml/automl/feature_engineering.py +64 -4
- teradataml/automl/feature_exploration.py +639 -25
- teradataml/automl/model_training.py +1 -1
- teradataml/clients/auth_client.py +12 -8
- teradataml/clients/keycloak_client.py +165 -0
- teradataml/common/constants.py +71 -26
- teradataml/common/exceptions.py +32 -0
- teradataml/common/messagecodes.py +28 -0
- teradataml/common/messages.py +13 -4
- teradataml/common/sqlbundle.py +3 -2
- teradataml/common/utils.py +345 -45
- teradataml/context/context.py +259 -93
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
- teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +1 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +1 -1
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/pattern_matching_data.csv +11 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/sdk/modelops/modelops_spec.json +101737 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/teradataml_example.json +21 -1
- teradataml/data/textmorph_example.json +5 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/url_data.csv +10 -9
- teradataml/dataframe/copy_to.py +38 -27
- teradataml/dataframe/data_transfer.py +61 -45
- teradataml/dataframe/dataframe.py +1110 -132
- teradataml/dataframe/dataframe_utils.py +73 -27
- teradataml/dataframe/functions.py +1070 -9
- teradataml/dataframe/sql.py +750 -959
- teradataml/dbutils/dbutils.py +33 -13
- teradataml/dbutils/filemgr.py +14 -10
- teradataml/hyperparameter_tuner/utils.py +4 -2
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/_base.py +12 -157
- teradataml/options/configure.py +24 -9
- teradataml/scriptmgmt/UserEnv.py +317 -39
- teradataml/scriptmgmt/lls_utils.py +456 -135
- teradataml/sdk/README.md +79 -0
- teradataml/sdk/__init__.py +4 -0
- teradataml/sdk/_auth_modes.py +422 -0
- teradataml/sdk/_func_params.py +487 -0
- teradataml/sdk/_json_parser.py +453 -0
- teradataml/sdk/_openapi_spec_constants.py +249 -0
- teradataml/sdk/_utils.py +236 -0
- teradataml/sdk/api_client.py +897 -0
- teradataml/sdk/constants.py +62 -0
- teradataml/sdk/modelops/__init__.py +98 -0
- teradataml/sdk/modelops/_client.py +406 -0
- teradataml/sdk/modelops/_constants.py +304 -0
- teradataml/sdk/modelops/models.py +2308 -0
- teradataml/sdk/spinner.py +107 -0
- teradataml/store/__init__.py +1 -1
- teradataml/table_operators/Apply.py +16 -1
- teradataml/table_operators/Script.py +20 -1
- teradataml/table_operators/query_generator.py +4 -21
- teradataml/table_operators/table_operator_util.py +58 -9
- teradataml/utils/dtypes.py +4 -2
- teradataml/utils/internal_buffer.py +22 -2
- teradataml/utils/utils.py +0 -1
- teradataml/utils/validators.py +318 -58
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/METADATA +188 -14
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/RECORD +131 -84
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.6.dist-info}/zip-safe +0 -0
teradataml/automl/autodataprep/__init__.py (new file, +471 lines)

# External libraries
import pandas as pd

# Teradata libraries
from teradataml import db_drop_table
from teradataml.common.constants import AutoMLConstants as aml_const
from teradataml.common.messages import Messages, MessageCodes
from teradataml.dataframe.dataframe import DataFrame
from teradataml.dataframe.copy_to import copy_to_sql
from teradataml.utils.validators import _Validators

# AutoML Internal libraries
from teradataml import AutoML, TeradataMlException

class AutoDataPrep(AutoML):
    def __init__(self,
                 task_type="Default",
                 verbose=0,
                 **kwargs):
        """
        DESCRIPTION:
            AutoDataPrep simplifies the data preparation process by automating the
            different aspects of data cleaning and transformation, enabling seamless
            exploration, transformation, and optimization of datasets.

        PARAMETERS:
            task_type:
                Optional Argument.
                Specifies the task type for AutoDataPrep, that is, whether to apply
                regression or classification on the provided dataset. If the user wants
                AutoDataPrep() to decide the task type automatically, then it should be
                set to "Default".
                Default Value: "Default"
                Permitted Values: "Regression", "Classification", "Default"
                Types: str

            verbose:
                Optional Argument.
                Specifies the level of detail printed during execution, based on the
                verbose level.
                Default Value: 0
                Permitted Values:
                    * 0: prints the progress bar.
                    * 1: prints the execution steps.
                    * 2: prints the intermediate data between the execution of each step.
                Types: int

            **kwargs:
                Specifies the additional arguments for AutoDataPrep. Below
                are the additional arguments:
                    custom_config_file:
                        Optional Argument.
                        Specifies the path of the JSON file in case of a custom run.
                        Types: str

                    volatile:
                        Optional Argument.
                        Specifies whether to put the interim results of the
                        functions in a volatile table or not. When set to
                        True, results are stored in a volatile table,
                        otherwise not.
                        Default Value: False
                        Types: bool

                    persist:
                        Optional Argument.
                        Specifies whether to persist the interim results of the
                        functions in a table or not. When set to True,
                        results are persisted in a table; otherwise,
                        results are garbage collected at the end of the
                        session.
                        Default Value: False
                        Types: bool

        RETURNS:
            Instance of AutoDataPrep.

        RAISES:
            TeradataMlException, TypeError, ValueError

        EXAMPLES:
            # Notes:
            # 1. Get the connection to Vantage to execute the function.
            # 2. One must import the required functions mentioned in
            #    the example from teradataml.
            # 3. Function raises an error if it is not supported on the
            #    Vantage the user is connected to.

            # Load the example data.
            >>> load_example_data("teradataml", "titanic")

            # Create teradataml DataFrames.
            >>> titanic = DataFrame.from_table("titanic")

            # Example 1: Run AutoDataPrep for a classification problem.
            # Scenario: The titanic dataset is used to predict the survival of passengers.

            # Create an instance of AutoDataPrep.
            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)

            # Fit the data.
            >>> aprep_obj.fit(titanic, titanic.survived)

            # Retrieve the data after Auto Data Preparation.
            >>> datas = aprep_obj.get_data()

        """
        # Initialize the AutoML object
        super().__init__(task_type=task_type,
                         verbose=verbose,
                         **kwargs)

        # Setting the attributes for AutoDataPrep
        super().__setattr__("_auto_dataprep", True)
        super().__setattr__("model_list", [])
        super().__setattr__("_phases", ["1. Feature Exploration ->",
                                        "2. Feature Engineering ->",
                                        "3. Data Preparation"])
        super().__setattr__("_progressbar_prefix", 'Auto Data Prep:')

    def fit(self,
            data,
            target_column):
        """
        DESCRIPTION:
            Function to fit the data for Auto Data Preparation.

        PARAMETERS:
            data:
                Required Argument.
                Specifies the input data to be used for Auto Data Preparation.
                Types: DataFrame

            target_column:
                Required Argument.
                Specifies the target column to be used for Auto Data Preparation.
                Types: str

        RETURNS:
            None

        RAISES:
            TeradataMlException, ValueError

        EXAMPLES:
            # Notes:
            # 1. Get the connection to Vantage to execute the function.
            # 2. One must import the required functions mentioned in
            #    the example from teradataml.
            # 3. Function raises an error if it is not supported on the
            #    Vantage the user is connected to.

            # Load the example data.
            >>> load_example_data("teradataml", "titanic")

            # Create teradataml DataFrames.
            >>> titanic = DataFrame.from_table("titanic")

            # Example 1: Run AutoDataPrep for a classification problem.
            # Scenario: The titanic dataset is used to predict the survival of passengers.

            # Create an instance of AutoDataPrep.
            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)

            # Fit the data.
            >>> aprep_obj.fit(titanic, titanic.survived)

        """
        # Fit the data using the AutoML object
        super().fit(data, target_column)


    def get_data(self):
        """
        DESCRIPTION:
            Function to retrieve the data after Auto Data Preparation.

        RETURNS:
            Dictionary of DataFrames containing the data after Auto Data Preparation.

        RAISES:
            TeradataMlException

        EXAMPLES:
            # Notes:
            # 1. Get the connection to Vantage to execute the function.
            # 2. One must import the required functions mentioned in
            #    the example from teradataml.
            # 3. Function raises an error if it is not supported on the
            #    Vantage the user is connected to.

            # Load the example data.
            >>> load_example_data("teradataml", "titanic")

            # Create teradataml DataFrames.
            >>> titanic = DataFrame.from_table("titanic")

            # Example 1: Run AutoDataPrep for a classification problem.
            # Scenario: The titanic dataset is used to predict the survival of passengers.

            # Create an instance of AutoDataPrep.
            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)

            # Fit the data.
            >>> aprep_obj.fit(titanic, titanic.survived)

            # Retrieve the data after Auto Data Preparation.
            >>> datas = aprep_obj.get_data()
        """
        # Raise an error if fit is not called before get_data
        _Validators._validate_dependent_method("get_data", "fit", self._is_fit_called)

        datas = {}
        for key, val in self.table_name_mapping.items():
            datas[key] = DataFrame(val)

        return datas

    def deploy(self, table_name):
        """
        DESCRIPTION:
            Deploys the AutoDataPrep generated data to the database,
            i.e., saves the data in the database.

        PARAMETERS:
            table_name:
                Required Argument.
                Specifies the name of the table to store the information
                of deployed datasets in the database.
                Types: str

        RETURNS:
            None

        RAISES:
            TeradataMlException, ValueError

        EXAMPLES:
            # Create an instance of the AutoDataPrep.
            # Perform the fit() operation on the AutoDataPrep object.
            # Deploy the data to the table.

            >>> from teradataml import AutoDataPrep
            # Load the example data.
            >>> load_example_data("teradataml", "titanic")
            >>> titanic = DataFrame.from_table("titanic")

            # Create an instance of AutoDataPrep.
            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)

            # Fit the data.
            >>> aprep_obj.fit(titanic, titanic.survived)

            # Deploy the data to the table.
            >>> aprep_obj.deploy("table_name")
        """

        # Appending arguments to a list for validation
        arg_info_matrix = []
        arg_info_matrix.append(["table_name", table_name, True, (str), True])

        # Validating the arguments
        _Validators._validate_function_arguments(arg_info_matrix)

        # Raise an error if fit is not called before deploy
        _Validators._validate_dependent_method("deploy", "fit", self._is_fit_called)

        if self.table_name_mapping is not None and \
           isinstance(self.table_name_mapping, dict):

            tab_map = {}
            # If persist is False, then generate a permanent table
            if not self.kwargs.get("persist", False):
                for key, val in self.table_name_mapping.items():
                    # Persist the data
                    per_name = self._create_per_result_table(prefix='{}_'.format(self.target_column),
                                                             persist_result_table=val)
                    # Store the table name mapping
                    tab_map[key] = per_name
            else:
                # Tables are already persisted
                tab_map = self.table_name_mapping
            data = pd.DataFrame(list(tab_map.items()), columns=['Feature_Selection_Method', 'Table_Name'])

            # Save the data to the database
            copy_to_sql(df=data, table_name=table_name, if_exists="replace")
            print("Data deployed successfully to the table: ", table_name)
            return

        # Raise an error if data is not found or
        # table_name_mapping is not a dictionary or is None
        err = Messages.get_message(MessageCodes.FUNC_EXECUTION_FAILED,
                                   "'deploy' method",
                                   "Data not found to deploy.")
        raise TeradataMlException(err, MessageCodes.EXECUTION_FAILED)

    def load(self, table_name):
        """
        DESCRIPTION:
            Loads the AutoDataPrep generated data from the database
            into the session to use it for model training or scoring.

        PARAMETERS:
            table_name:
                Required Argument.
                Specifies the name of the table containing the information
                of deployed datasets in the database.
                Types: str

        RETURNS:
            Dictionary of DataFrames containing the datasets generated by AutoDataPrep.

        RAISES:
            TeradataMlException, ValueError

        EXAMPLES:
            # Create an instance of the AutoDataPrep.
            # Load the data from the table.

            # Create an instance of AutoDataPrep.
            >>> aprep_obj = AutoDataPrep()

            # Load the data from the table.
            >>> data = aprep_obj.load("table_name")

            # Retrieve the data.
            >>> print(data)
        """

        # Appending arguments to a list for validation
        arg_info_matrix = []
        arg_info_matrix.append(["table_name", table_name, True, (str), True])

        # Validating the arguments
        _Validators._validate_function_arguments(arg_info_matrix)

        # Load the data from the table
        load_df = DataFrame(table_name)

        data = {}
        # Load the data into a dictionary
        for mtd, tab_name in load_df.get_values():
            try:
                data[mtd] = DataFrame(tab_name)
            except Exception as e:
                print(f"Error while loading {mtd} table: ", e)
                data[mtd] = None
                continue

        return data


    def delete_data(self,
                    table_name,
                    fs_method=None):
        """
        DESCRIPTION:
            Deletes the deployed datasets from the database.

        PARAMETERS:
            table_name:
                Required Argument.
                Specifies the name of the table containing the deployed datasets.
                Types: str

            fs_method:
                Optional Argument.
                Specifies the name of the feature selection method to delete from the
                deployed datasets.
                Default Value: None
                Permitted Values: "lasso", "rfe", "pca"
                Note:
                    * If "fs_method" is None, then the method deletes all the deployed datasets.
                Types: str or list of str

        RETURNS:
            None

        RAISES:
            TeradataMlException

        EXAMPLES:
            # Create an instance of the AutoDataPrep.
            # Fit the data.
            # Deploy the data to the table.
            # Remove the deployed data from the table.

            # Example 1: Remove the deployed data from the table within the AutoDataPrep object.

            >>> from teradataml import AutoDataPrep
            # Load the example data.
            >>> load_example_data("teradataml", "titanic")
            >>> titanic = DataFrame.from_table("titanic")

            # Create an instance of AutoDataPrep.
            >>> aprep_obj = AutoDataPrep(task_type="Classification", verbose=2)

            # Fit the data.
            >>> aprep_obj.fit(titanic, titanic.survived)

            # Deploy the data to the database.
            >>> aprep_obj.deploy("table_name")

            # Remove the lasso deployed data from the table.
            >>> aprep_obj.delete_data("table_name", fs_method="lasso")

            # Example 2: Remove the deployed data from the table using a different instance of the AutoDataPrep object.
            # Create an instance of AutoDataPrep.
            >>> aprep_obj2 = AutoDataPrep()

            # Remove the lasso and pca deployed data from the table.
            >>> aprep_obj2.delete_data("table_name", fs_method=["lasso", "pca"])

        """
        # Appending arguments to a list for validation
        arg_info_matrix = []
        arg_info_matrix.append(["table_name", table_name, False, (str), True])
        arg_info_matrix.append(["fs_method", fs_method, True, (str, list), True, aml_const.FEATURE_SELECTION_MTDS.value])

        # Validating the arguments
        _Validators._validate_function_arguments(arg_info_matrix)

        # Load the data from the table
        df = DataFrame(table_name)
        # Get the values from the loaded DataFrame
        values = df.get_values()

        if fs_method is None:
            # If fs_method is None, then delete all the tables
            methods = aml_const.FEATURE_SELECTION_MTDS.value
        elif isinstance(fs_method, str):
            # If fs_method is a str, then convert it to a list
            methods = [fs_method]
        else:
            # If fs_method is a list, then use it as is
            methods = fs_method
        # Convert the methods to lower case
        methods = [method.lower() for method in methods]

        filtered_data = []
        remaining_data = []
        # Filter the values based on the fs_method
        for row in values:
            if any(cond in row[0] for cond in methods):
                filtered_data.append(row)
            else:
                remaining_data.append(row)

        # Drop the tables
        err_flag = False
        for row in filtered_data:
            tab_name = row[1]
            mtd = row[0]
            try:
                db_drop_table(tab_name)
                print(f"Removed {mtd} table successfully.")
            except Exception as e:
                print(f"Error while removing {mtd} table: ", e)
                remaining_data.append(row)
                err_flag = True
                continue

        if err_flag:
            # Print a message if an error occurred while removing deployed data
            print("Error occurred while removing deployed data.")

        if len(remaining_data) > 0:
            rem_data = pd.DataFrame(remaining_data, columns=['Feature_Selection_Method', 'Table_Name'])
            # Save the data to the database
            copy_to_sql(df=rem_data, table_name=table_name, if_exists="replace")
        elif not err_flag:
            # Drop the whole table if no data is remaining
            db_drop_table(table_name)
            print("Deployed data removed successfully.")
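Taken together, the new class supports a deploy-once, load-later workflow across sessions. A minimal sketch assembled from the docstring examples above (the table name "adp_titanic" is illustrative):

    >>> from teradataml import AutoDataPrep, DataFrame, load_example_data
    >>> load_example_data("teradataml", "titanic")
    >>> titanic = DataFrame.from_table("titanic")
    >>> # Session 1: prepare the data, then persist one table per feature selection method.
    >>> aprep = AutoDataPrep(task_type="Classification")
    >>> aprep.fit(titanic, titanic.survived)
    >>> aprep.deploy("adp_titanic")
    >>> # Later, from a fresh object: reload the prepared datasets and drop the pca one.
    >>> aprep2 = AutoDataPrep()
    >>> datasets = aprep2.load("adp_titanic")
    >>> aprep2.delete_data("adp_titanic", fs_method="pca")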
teradataml/automl/data_preparation.py

@@ -130,12 +130,12 @@ class _DataPreparation:
         self.task_type = task_type
         self.volatile = kwargs.get("volatile", False)
         self.persist = kwargs.get("persist", False)
+        self.aml_phases = kwargs.get("automl_phases", None)
 
         # Setting default value for auto run mode
         self._data_sampling_method = "SMOTE"
         self._scale_method_reg = "STD"
         self._scale_method_cls = "RANGE"
-        self.table_name_mapping = {}
 
         self.data_types = {key: value for key, value in self.data._column_names_and_types}
         self.seed = kwargs.get("seed", 42)
@@ -145,6 +145,8 @@
         if kwargs.get("seed") is not None:
             np.random.seed(self.seed)
 
+        self.data_mapping = kwargs.get("data_mapping", {})
+
 
     def data_preparation(self,
                          auto = True):
@@ -167,7 +169,8 @@
             list of lists containing, feature selected by rfe, pca and lasso.
         """
         self._display_heading(phase=2,
-                              progress_bar=self.progress_bar)
+                              progress_bar=self.progress_bar,
+                              automl_phases=self.aml_phases)
         self._display_msg(msg='Data preparation started ...',
                           progress_bar=self.progress_bar)
         # Setting user value in case of custom running mode
@@ -210,7 +213,7 @@
         self._feature_selection_PCA()
         self.progress_bar.update()
 
-        return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict
+        return [self.rfe_feature, self.lasso_feature, self.pca_feature], self.data_transform_dict, self.data_mapping
 
     def _handle_outliers(self,
                          auto):
@@ -355,6 +358,9 @@
         # Adding transformed data containing table to garbage collector
         GarbageCollector._add_to_garbagecollector(self.data._table_name)
 
+        # Returning outlier fit object to store in data mapping dictionary
+        return outlier_fit_out
+
     def _outlier_processing(self):
         """
         DESCRIPTION:
@@ -378,7 +384,10 @@
                               progress_bar=self.progress_bar)
             target_columns=columns_to_drop_rows
             replacement_strategy = "DELETE"
-            self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            fit_obj = self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            self.data_mapping['fit_outlier_delete_output'] = fit_obj.output_data._table_name
+            self.data_mapping['fit_outlier_delete_result'] = self.data._table_name
+            self.data_mapping['outlier_filtered_data'] = self.data._table_name
             self._display_msg(msg="Sample of dataset after removing outlier rows:",
                               data=self.data,
                               progress_bar=self.progress_bar)
@@ -390,7 +399,10 @@
                               progress_bar=self.progress_bar)
             target_columns=columns_to_impute
             replacement_strategy = "MEDIAN"
-            self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            fit_obj = self._outlier_handling(target_columns, outlier_handling_method, replacement_strategy)
+            self.data_mapping['fit_outlier_impute_output'] = fit_obj.output_data._table_name
+            self.data_mapping['fit_outlier_impute_result'] = fit_obj.result._table_name
+            self.data_mapping['outlier_imputed_data'] = self.data._table_name
             self._display_msg(msg="Sample of dataset after performing MEDIAN inplace:",
                               data=self.data,
                               progress_bar=self.progress_bar)
@@ -446,7 +458,10 @@
                 # Fetching replacement value
                 replacement_value = transform_val["replacement_value"]
                 # Performing outlier handling
-                self._outlier_handling(target_col, outlier_method, replacement_value)
+                fit_obj = self._outlier_handling(target_col, outlier_method, replacement_value)
+                self.data_mapping[f'fit_{target_col}_outlier_output'] = fit_obj.output_data._table_name
+                self.data_mapping[f'fit_{target_col}_outlier_result'] = fit_obj.result._table_name
+                self.data_mapping[f'{target_col}_outlier_treated_data'] = self.data._table_name
             else:
                 self._display_msg(inline_msg="No information provided for feature transformation in outlier handling.",
                                   progress_bar=self.progress_bar)
@@ -491,7 +506,7 @@
         start_time = time.time()
 
         # Temporary Pulling data for feature selection
-        pca_train = DataFrame.from_table(self.table_name_mapping['pca_train']).to_pandas()
+        pca_train = DataFrame.from_table(self.data_mapping['pca_train']).to_pandas()
 
         # Drop unnecessary columns and store the result
         train_data = pca_train.drop(columns=['id', self.target_column], axis=1)
@@ -759,7 +774,7 @@
         train_table_name = UtilFuncs._extract_table_name(train_table_name)
 
         # Storing the table names in the table name mapping dictionary
-        self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
+        self.data_mapping['{}_train'.format(prefix)] = train_table_name
 
         # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
         is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
@@ -839,9 +854,9 @@
 
         # Loading data for feature scaling based of feature selection method
         if feature_selection_mtd == 'rfe':
-            data_to_scale = DataFrame(self.table_name_mapping['rfe_train'])
+            data_to_scale = DataFrame(self.data_mapping['rfe_train'])
         elif feature_selection_mtd == 'lasso':
-            data_to_scale = DataFrame(self.table_name_mapping['lasso_train'])
+            data_to_scale = DataFrame(self.data_mapping['lasso_train'])
         else:
             data_to_scale = self.data
 
@@ -864,6 +879,9 @@
                               volatile=volatile,
                               persist=persist)
 
+        self.data_mapping[f'fit_scale_{feature_selection_mtd}_output'] = fit_obj.output_data._table_name
+        self.data_mapping[f'fit_scale_{feature_selection_mtd}_result'] = fit_obj.output._table_name
+
         # storing the scale fit object and columns in data transformation dictionary
         self.data_transform_dict['{}_scale_fit_obj'.format(feature_selection_mtd)] = fit_obj.output
         self.data_transform_dict['{}_scale_col'.format(feature_selection_mtd)] = scale_col
@@ -965,6 +983,7 @@
         fit_params["persist"] = False
 
         transform_output = RoundColumns(**fit_params).result
+        self.data_mapping['round_columns_data'] = transform_output._table_name
         if not self.volatile and not self.persist:
             # Adding transformed data containing table to garbage collector
             GarbageCollector._add_to_garbagecollector(transform_output._table_name)
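The net effect of these hunks is that _DataPreparation now threads a data_mapping dictionary through each step, recording the database table behind every intermediate artifact (outlier fit outputs, per-method train tables, scaling fits, rounded columns) and returning it from data_preparation(). A sketch of its shape, using only keys visible in the hunks above and placeholders for the internally generated table names:

    # Shape of the mapping only; actual values are generated table names.
    data_mapping = {
        "fit_outlier_delete_output": "<table>",
        "outlier_filtered_data": "<table>",
        "fit_outlier_impute_output": "<table>",
        "outlier_imputed_data": "<table>",
        "rfe_train": "<table>",
        "lasso_train": "<table>",
        "pca_train": "<table>",
        "fit_scale_rfe_output": "<table>",
        "round_columns_data": "<table>",
    }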
teradataml/automl/data_transformation.py

@@ -15,6 +15,7 @@
 
 # Python libraries
 import pandas as pd
+import warnings
 
 # Teradata libraries
 from teradataml.dataframe.dataframe import DataFrame
@@ -468,6 +469,7 @@ class _DataTransformation(_FeatureExplore, _FeatureEngineering):
         custom_target_encoding_ind = self.data_transformation_params.get("custom_target_encoding_ind", False)
         custom_target_encoding_fit_obj = self.data_transformation_params.get("custom_target_encoding_fit_obj", None)
         if custom_target_encoding_ind:
+            warn_cols = []
             for col, tar_fit_obj in custom_target_encoding_fit_obj.items():
                 # Extracting accumulate columns
                 accumulate_columns = self._extract_list(self.data.columns, [col])
@@ -483,6 +485,15 @@
                 self.data = TargetEncodingTransform(**transform_params).result
                 # Adding transformed data containing table to garbage collector
                 GarbageCollector._add_to_garbagecollector(self.data._table_name)
+                if self.data[self.data[col] == -1].shape[0] > 0:
+                    warn_cols.append(col)
+
+            # Checking for unseen values in target encoding columns
+            if len(warn_cols) > 0:
+                warnings.warn(message=f"Unseen categorical values found in test data column(s): {warn_cols}. \
+                              This may cause inaccurate predictions. Consider retraining the model with updated data.",
+                              stacklevel=0)
+
             self._display_msg(msg="\nUpdated dataset after performing customized categorical encoding :",
                               data=self.data,
                               progress_bar=self.progress_bar)