teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +119 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +18 -6
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/sqle/__init__.py +4 -1
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +51 -6
- teradataml/automl/data_preparation.py +56 -33
- teradataml/automl/data_transformation.py +58 -33
- teradataml/automl/feature_engineering.py +12 -5
- teradataml/automl/model_training.py +34 -13
- teradataml/common/__init__.py +1 -2
- teradataml/common/constants.py +64 -40
- teradataml/common/messagecodes.py +13 -3
- teradataml/common/messages.py +4 -1
- teradataml/common/sqlbundle.py +40 -10
- teradataml/common/utils.py +113 -39
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +141 -17
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +5 -5
- teradataml/data/teradataml_example.json +8 -0
- teradataml/data/vectordistance_example.json +1 -1
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +11 -1
- teradataml/dataframe/dataframe.py +517 -121
- teradataml/dataframe/dataframe_utils.py +152 -20
- teradataml/dataframe/functions.py +26 -11
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +2 -2
- teradataml/dbutils/dbutils.py +525 -129
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +317 -1011
- teradataml/opensource/_class.py +141 -17
- teradataml/opensource/{constants.py → _constants.py} +7 -3
- teradataml/opensource/_lightgbm.py +52 -53
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +5 -5
- teradataml/options/__init__.py +47 -15
- teradataml/options/configure.py +103 -25
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +2 -2
- teradataml/scriptmgmt/lls_utils.py +63 -26
- teradataml/store/__init__.py +1 -2
- teradataml/store/feature_store/feature_store.py +102 -7
- teradataml/table_operators/Apply.py +32 -18
- teradataml/table_operators/Script.py +3 -1
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/utils/dtypes.py +47 -0
- teradataml/utils/internal_buffer.py +18 -0
- teradataml/utils/validators.py +68 -9
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +123 -2
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +79 -75
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -0
- teradataml/store/vector_store/__init__.py +0 -1586
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
teradataml/LICENSE-3RD-PARTY.pdf CHANGED
Binary file
teradataml/README.md CHANGED

@@ -17,6 +17,125 @@ Copyright 2024, Teradata. All Rights Reserved.

  ## Release Notes:

+ #### teradataml 20.00.00.04
+ * ##### New Features/Functionality
+   * ###### teradataml OTF Support:
+     * This release enables support for accessing OTF data from teradataml.
+     * Users can now create a teradataml DataFrame on an OTF table, allowing them to use teradataml functions on it.
+     * Example usage below:
+       * Creation of views on OTF/datalake tables is not supported. Hence, users must set `configure.temp_object_type` to `VT` using the statement below.
+         ```configure.temp_object_type = "VT"```
+       * Users need to provide additional information about the datalake while creating the DataFrame. There are two approaches to provide the datalake information:
+         * Approach 1: Using `in_schema()`
+           ```
+           >>> from teradataml.dataframe.dataframe import in_schema
+           # Create an in_schema object to provide additional information about the datalake.
+           >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+           ...                           table_name="datalake_table_name",
+           ...                           datalake_name="datalake")
+           >>> otf_df = DataFrame(in_schema_tbl)
+           ```
+         * Approach 2: Using `DataFrame.from_table()`
+           ```
+           >>> otf_df = DataFrame.from_table(table_name="datalake_table_name",
+           ...                               schema_name="datalake_db",
+           ...                               datalake_name="datalake")
+           ```
+     * Once this DataFrame is created, users can use any DataFrame method or analytics feature/functionality from teradataml with it. Visit the Limitations and Considerations section in the _Teradata Python Package User Guide_ to check supportability.
+       * Note: All further operations create volatile tables in the local database.
+         ```
+         >>> new_df = otf_df.assign(new_col=otf_df.existing_col*2)
+         ```
+   * ###### teradataml: DataFrame
+     * Introduced a new feature, 'Exploratory Data Analysis UI' (EDA-UI), which enhances the teradataml user experience in Jupyter notebooks. The EDA-UI is displayed by default when a teradataml DataFrame is printed in a Jupyter notebook.
+     * Users can control the EDA-UI using the new configuration option `display.enable_ui`; it can be disabled by setting `display.enable_ui` to False (see the sketch below).
+     * New Function
+       * `get_output()` is added to get the result of an Analytic function when executed from the EDA UI.
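A minimal sketch of toggling the new option; it assumes an active teradataml session and an existing teradataml DataFrame `df` (both placeholders here):

```
>>> from teradataml import display
>>> display.enable_ui = False   # suppress the EDA UI; DataFrames print as plain tables
>>> df
>>> display.enable_ui = True    # restore the default interactive view
```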
+
+   * ###### OpensourceML
+     * `td_lightgbm` - A teradataml OpenSourceML module
+       * `deploy()` - Users can now deploy the models created by the lightgbm `Booster` and `sklearn` modules. Deploying a model stores it in Vantage for future use with `td_lightgbm`.
+         * `td_lightgbm.deploy()` - Deploys a lightgbm `Booster` or any `scikit-learn` model trained outside Vantage.
+         * `td_lightgbm.train().deploy()` - Deploys a lightgbm `Booster` object trained within Vantage.
+         * `td_lightgbm.<sklearn_class>().deploy()` - Deploys a lightgbm sklearn class object created/trained within Vantage.
+       * `load()` - Users can load the deployed models back in the current session. This allows users to use the lightgbm functions with the `td_lightgbm` module (see the sketch below).
+         * `td_lightgbm.load()` - Loads a deployed model in the current session.
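A rough sketch of the deploy/load round trip. The parameter names and the model name are assumptions for illustration, not taken from this diff; a connected Vantage session and a previously trained lightgbm model object `lgbm_model` are presumed:

```
>>> from teradataml import td_lightgbm
>>> # Persist a model trained outside Vantage (parameter names are assumed).
>>> td_lightgbm.deploy(model_name="my_lgbm_model", model=lgbm_model)
>>> # Later, or in a new session, bring the deployed model back.
>>> restored = td_lightgbm.load(model_name="my_lgbm_model")
```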
+
+   * ###### FeatureStore
+     * New function `FeatureStore.delete()` is added to drop the Feature Store and the corresponding repo from Vantage.
+
+   * ###### Database Utility
+     * `db_python_version_diff()` - Identifies the difference in Python interpreter major version between the interpreter installed on Vantage and the one in the local user environment.
+     * `db_python_package_version_diff()` - Identifies Python package version differences between the packages installed on Vantage and those in the local user environment (see the sketch below).
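A minimal sketch of the two new utilities; it assumes an active Vantage connection and that they are exported from the top-level teradataml namespace like the other database utilities. The format of the reported differences is not covered by this diff:

```
>>> from teradataml import db_python_version_diff, db_python_package_version_diff
>>> db_python_version_diff()            # compare local vs. in-database Python major version
>>> db_python_package_version_diff()    # compare local vs. in-database package versions
```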
+
+   * ###### BYOM Function
+     * `ONNXEmbeddings()` - Calculates embedding values in Vantage using an embeddings model that was created outside Vantage and stored in ONNX format.
+
+   * ###### teradataml Options
+     * Configuration Options
+       * `configure.temp_object_type` - Allows users to choose between creating volatile tables or views for teradataml internal use. By default, teradataml internally creates views for some of the operations. With the new configuration option, users can opt to create volatile tables instead of views. This provides greater flexibility for users who lack the permission to create views or who need to create views on tables without WITH GRANT permission.
+     * Display Options
+       * `display.enable_ui` - Specifies whether to display the exploratory data analysis UI when a DataFrame is printed. By default, this option is enabled (True), allowing the exploratory data analysis UI to be displayed. When set to False, the exploratory data analysis UI is hidden.
+
+ * ##### Updates
+   * ###### teradataml: DataFrame function
+     * `describe()`
+       * New argument added: `pivot`.
+       * When argument `pivot` is set to False, non-numeric columns are no longer supported for generating statistics. Use `CategoricalSummary` and `ColumnSummary` instead (see the sketch below).
+     * `fillna()` - Accepts a new argument `partition_column` to partition the data and impute null values accordingly.
+     * Optimised performance for `DataFrame.plot()`.
+       * `DataFrame.plot()` will not regenerate the image when run more than once with the same arguments.
+     * `DataFrame.from_table()`: New argument `datalake_name` added to accept a datalake name while creating a DataFrame on a datalake table.
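A short sketch of the updated `describe()`; `df` stands for an existing teradataml DataFrame on a connected system, and the exact shape of the pivoted versus non-pivoted output is not shown in this diff:

```
>>> stats_default = df.describe()          # default behaviour
>>> stats_long = df.describe(pivot=False)  # new argument; numeric columns only in this mode
>>> print(stats_long)
```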
+
+   * ###### teradataml: DataFrame Utilities
+     * `in_schema()`: New argument `datalake_name` added to accept a datalake name.
+
+   * ###### Table Operator
+     * `Apply()` no longer looks for an authentication token by default. An authentication token is now required only if the user wants to update the backend Open Analytics Framework service.
+
+   * ###### Hyper Parameter Tuner
+     * `GridSearch()` and `RandomSearch()` now display a message referring to the `get_error_log()` API when model training fails in HPT.
+
+   * ###### teradataml Options
+     * Configuration Options
+       * `configure.indb_install_location` - Determines the installation location of the In-DB Python package based on the installed RPM version.
+
+   * ###### teradataml Context Creation
+     * `create_context()` - Enables users to create a connection using parameters set in the environment or in a config file, in addition to the previous method. The newly added options help users hide sensitive data from the script (see the sketch below).
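A tentative sketch of the environment-driven connection; the specific environment variable or config file names that `create_context()` reads are not spelled out in this diff, so treat the no-argument call as an assumption:

```
>>> from teradataml import create_context
>>> # With the connection parameters already provided through the environment or a
>>> # config file (names not listed here), no credentials appear in the script itself.
>>> con = create_context()
```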
+
+   * ###### Open Analytics Framework
+     * Enhanced `create_env()` to display a message when an invalid base_env is passed, informing users that the default base_env is being used.
+
+   * ###### OpensourceML
+     * Raises a TeradataMlException if the Python interpreter major version differs between the Vantage Python environment and the local user environment.
+     * Displays a warning if specific Python package versions differ between the Vantage Python environment and the local user environment.
+
+   * ###### Database Utility
+     * `db_list_tables()`: New argument `datalake_name` added to accept the datalake name to list tables from.
+     * `db_drop_table()`:
+       * New argument `datalake_name` added to accept the datalake name to drop tables from.
+       * New argument `purge` added to specify whether to use the `PURGE ALL` or `NO PURGE` clause while dropping a table (see the sketch below).
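A minimal sketch of the datalake-aware utilities; the schema, table, and datalake names are placeholders, and passing `purge` as a boolean is an assumption based on the PURGE ALL / NO PURGE wording above:

```
>>> from teradataml import db_list_tables, db_drop_table
>>> db_list_tables(schema_name="datalake_db", datalake_name="datalake")
>>> db_drop_table(table_name="datalake_table_name", schema_name="datalake_db",
...               datalake_name="datalake", purge=True)
```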
+
+ * ##### Bug Fixes
+   * `td_lightgbm` OpensourceML module: In the multi-model case, `td_lightgbm.Dataset().add_features_from()` should add the features of one partition in the first Dataset to the features of the same partition in the second Dataset. Previously this was not the case and the function failed; this is now fixed.
+   * Fixed a minor bug in `Shap()` and converted the argument `training_method` to a required argument.
+   * Fixed PCA-related warnings in `AutoML`.
+   * `AutoML` no longer fails when data with all categorical columns is provided.
+   * Fixed an `AutoML` issue with the upsampling method.
+   * Excluded the identifier column from outlier processing in `AutoML`.
+   * `DataFrame.set_index()` no longer modifies the original DataFrame's index when the argument `append` is used.
+   * The `concat()` function now supports DataFrames with column names that start with a digit, contain special characters, or contain reserved keywords.
+   * `create_env()` proceeds to install the remaining files even if the current file installation fails.
+   * Corrected the error message raised in `create_env()` when authentication is not set.
+   * Added the missing argument `charset` for Vantage Analytic Library functions.
+   * New argument `seed` is added to `AutoML`, `AutoRegressor` and `AutoClassifier` to ensure consistent results (see the sketch below).
+   * Analytic functions now work even if the column names of the underlying tables contain non-ASCII characters.
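A brief sketch of the new reproducibility argument; `train_df` and the target column name are placeholders and the remaining constructor arguments keep their defaults:

```
>>> from teradataml import AutoClassifier
>>> aml = AutoClassifier(verbose=0, seed=42)   # same seed -> repeatable pipeline decisions
>>> aml.fit(train_df, "target_col")
```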
+
  #### teradataml 20.00.00.03

  * teradataml no longer supports setting the `auth_token` using `set_config_params()`. Users should use `set_auth_token()` to set the token.
teradataml/_version.py CHANGED

teradataml/analytics/analytic_function_executor.py CHANGED

@@ -482,17 +482,20 @@ class _AnlyticFunctionExecutor:

          # Validate column is existed or not in the table.
          _Validators._validate_dataframe_has_argument_columns(
-             arg_value, arg_name, dataframe, target_table_argument_name)
+             arg_value, arg_name, dataframe, target_table_argument_name, case_insensitive=True)

          # Append square brackets for column range when function
          # does not require special case handler.
          arg_value = self._spl_func_obj._add_square_bracket(arg_value)

+         # Check if there are columns with non-ASCII characters.
+         if UtilFuncs._is_ascii(arg_value):
+             arg_value = UtilFuncs._teradata_quote_arg(arg_value, "\"", False)
          # Handling special case for Teradata reserved keywords or column names with spaces.
          # If argument is a string or list of strings, then add quotes to the string.
-
+         elif arg_name not in ["partition_columns"] and (\
              UtilFuncs._contains_space(arg_value) or list_td_reserved_keywords(arg_value)):
-             arg_value = UtilFuncs._teradata_quote_arg(arg_value, "\"", False)
+             arg_value = UtilFuncs._teradata_quote_arg(arg_value, "\"", False)

          # SequenceInputBy arguments require special processing.
          if 500 <= argument.get_r_order_number() <= 510:

@@ -717,10 +720,17 @@

          kwargs.update(kwargs.pop("generic_arguments", {}))

          # Add all arguments to dynamic class as data members.
+         global_volatile = False
+         if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+             global_volatile = True

          start_time = time.time()
          persist = kwargs.get("persist", False)
-         volatile
+         # Use global volatile only when persist argument is False. If persist argument
+         # is True, then volatile can't be used whether it is global volatile or normal
+         # volatile. If it is normal volatile, then it will raise
+         # `CANNOT_USE_TOGETHER_WITH` error below.
+         volatile = kwargs.get("volatile", global_volatile if not persist else False)
          display_table_name = kwargs.get("display_table_name", True)

          # Validate local_order_column argument type and values.

@@ -1039,7 +1049,8 @@ class _SQLEFunctionExecutor(_AnlyticFunctionExecutor):

          _Validators._validate_dataframe_has_argument_columns(arg_value,
                                                               arg,
                                                               input_table_arg_value,
-                                                              input_table_arg
+                                                              input_table_arg,
+                                                              case_insensitive=True
                                                               )

          order_column_arg_value = UtilFuncs._teradata_collapse_arglist(order_column_arg_value, "\"")

@@ -1491,7 +1502,8 @@ class _TableOperatorExecutor(_SQLEFunctionExecutor):

          _Validators._validate_dataframe_has_argument_columns(hash_column_value,
                                                               hash_column_arg,
                                                               input_table_arg_value,
-                                                              input_table_arg
+                                                              input_table_arg,
+                                                              case_insensitive=True
                                                               )

          # Hash and order by can be used together as long as is_local_order = True.
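The hunk above wires `configure.temp_object_type` into analytic-function execution: with the `VT` option, function results default to volatile output tables unless the caller asks for persistence. A rough illustration, assuming a connected session, an existing teradataml DataFrame `df`, and `Antiselect` as a stand-in for any SQLE function:

```
>>> from teradataml import configure, Antiselect
>>> configure.temp_object_type = "VT"
>>> out_vt = Antiselect(data=df, exclude=["row_id"])                   # volatile output table
>>> out_kept = Antiselect(data=df, exclude=["row_id"], persist=True)   # persist=True wins; output is kept
```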
teradataml/analytics/byom/__init__.py CHANGED

@@ -4,7 +4,7 @@ from teradataml.analytics.byom.PMMLPredict import PMMLPredict

  from teradataml.analytics.meta_class import _AnalyticFunction
  from teradataml.analytics.meta_class import _common_init, _common_dir

- _byom_functions = ['H2OPredict', 'PMMLPredict', 'ONNXPredict', 'DataikuPredict', 'DataRobotPredict']
+ _byom_functions = ['H2OPredict', 'PMMLPredict', 'ONNXPredict', 'DataikuPredict', 'DataRobotPredict', 'ONNXEmbeddings']

  for func in _byom_functions:
      globals()[func] = type("{}".format(func), (_AnalyticFunction,),
teradataml/analytics/valib.py CHANGED

@@ -26,6 +26,8 @@ from teradataml.dataframe.dataframe import DataFrame, in_schema

  from teradataml.utils.validators import _Validators
  from teradataml.analytics.Transformations import Binning, Derive, OneHotEncoder, FillNa, \
      LabelEncoder, MinMaxScalar, Retain, Sigmoid, ZScore
+ from teradataml.common.constants import TeradataReservedKeywords, TeradataConstants
+

  class _VALIB():
      """ An internal class for executing VALIB analytic functions. """

@@ -370,9 +372,16 @@

              self.__get_temp_table_name()
          """
          prefix = "valib_{}".format(self.__tdml_valib_name.lower())
-
-
-
+         tbl_name = UtilFuncs._generate_temp_table_name(prefix=prefix, use_default_database=True,
+                                                        gc_on_quit=True, quote=False,
+                                                        table_type=TeradataConstants.TERADATA_TABLE)
+         # With VT option, table name is getting generated with 'vt_'.
+         # But it's not getting created as a Volatile table. Hence
+         # explicitly garbage collecting.
+         if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+             GarbageCollector._add_to_garbagecollector(tbl_name,
+                                                       TeradataConstants.TERADATA_TABLE)
+         return tbl_name

      def __process_dyn_cls_output_member(self, arg_name, out_tablename, out_var=None):
          """

@@ -447,6 +456,7 @@

          # Add extension to the table name.
          generated_table_name = "{}{}".format(table_name, extension)

+
          # Register new output table to the GC.
          gc_tabname = "\"{}\".\"{}\"".format(self.__db_name, generated_table_name)
          GarbageCollector._add_to_garbagecollector(gc_tabname, TeradataConstants.TERADATA_TABLE)

@@ -1463,7 +1473,7 @@

          if gen_sql_only:
              valib_inst.__generate_valib_sql_argument_syntax(arg=str(gen_sql_only),
                                                              arg_name="gensqlonly")
-
+         charset = kwargs.pop("charset", None)
          # Raise error if there are additional arguments.
          if len(kwargs) != 0:
              err_ = "The keyword arguments for Overlap() should have data1, data2, ..., dataN " \

@@ -1478,6 +1488,10 @@

                                                              arg_name="tablename")
          valib_inst.__generate_valib_sql_argument_syntax(arg=",".join(column_names_df),
                                                          arg_name="columns")
+         # Generate clause of charset.
+         if charset:
+             valib_inst.__generate_valib_sql_argument_syntax(arg=charset,
+                                                             arg_name="charset")

          return valib_inst._execute_valib_function(skip_data_arg_processing=True,
                                                    skip_other_arg_processing=True)
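The Overlap path above is where the new `charset` keyword is consumed. A sketch of how it might be passed through the VALIB interface; `df1` and `df2` are placeholder DataFrames sharing a key column, and the keyword name is the only part confirmed by this diff:

```
>>> from teradataml import valib
>>> res = valib.Overlap(data1=df1, data2=df2, columns="cust_id", charset="UNICODE")
>>> res.result
```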
teradataml/automl/__init__.py CHANGED

@@ -30,7 +30,7 @@ from teradataml import ColumnExpression

  from teradataml.dataframe.dataframe import DataFrame
  from teradataml.utils.utils import execute_sql
  from teradataml.utils.validators import _Validators
- from teradataml import ROC, BLOB
+ from teradataml import ROC, BLOB, VARCHAR
  from teradataml.utils.dtypes import _Dtypes
  from teradataml.common.utils import UtilFuncs
  from teradataml import TeradataMlException

@@ -94,6 +94,9 @@ class AutoML:

          the processes by passing the JSON file path in case of custom run. It also
          supports early stopping of model training based on stopping metrics,
          maximum running time and maximum models to be trained.
+         Note:
+             * configure.temp_object_type="VT" follows sequential execution.
+

      PARAMETERS:
          task_type:

@@ -187,6 +190,12 @@

              session.
              Default Value: False
              Types: bool
+
+         seed:
+             Optional Argument.
+             Specifies the random seed for reproducibility.
+             Default Value: 42
+             Types: int

      RETURNS:
          Instance of AutoML.

@@ -417,9 +426,11 @@

          volatile = kwargs.get('volatile', False)
          persist = kwargs.get('persist', False)
+         seed = kwargs.get('seed', 42)

          arg_info_matrix.append(["volatile", volatile, True, (bool)])
          arg_info_matrix.append(["persist", persist, True, (bool)])
+         arg_info_matrix.append(["seed", seed, True, (int)])

          # Validate argument types
          _Validators._validate_function_arguments(arg_info_matrix)

@@ -517,7 +528,7 @@

          # Validate argument types
          _Validators._validate_function_arguments(arg_info_fit_matrix)
-
+
          # Initializing class variables
          self.data = data
          self.target_column = target_column

@@ -758,11 +769,12 @@

          if self.target_column_ind:
              prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
              probability_column = 'prob_1'
+             pred_target_count = pred.result.drop_duplicate(self.target_column).size
              # Displaying confusion matrix and ROC-AUC for classification problem
              if self.is_classification_type():
                  print_data = lambda data: print(data) if _is_terminal() else display(data)
                  # Displaying ROC-AUC for binary classification
-                 if self.target_count == 2:
+                 if self.target_count == 2 and pred_target_count == 2:
                      fit_params = {
                          "probability_column" : probability_column,
                          "observation_column" : self.target_column,

@@ -886,8 +898,8 @@

          # as it is required for evaluation.
          if self.target_column not in data.columns:
              raise TeradataMlException(
-
-
+                 Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
+                 MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)

          # Checking if data is already transformed before or not
          data_node_id = data._nodeid

@@ -1234,6 +1246,8 @@

          pca.n_components_ = load_pca_info['n_components']
          pca.noise_variance_ = load_pca_info['noise_variance']
          pca.singular_values_ = np.array(load_pca_info['singular_values'])
+         pca.feature_names_in_ = data_params['pca_fit_columns']
+         pca.n_features_in_ = len(data_params['pca_fit_columns'])

          data_params['pca_fit_instance'] = pca

@@ -1442,7 +1456,8 @@

          # Saving data transformation parameters to the specified table
          sv_models = pd.concat([sv_models, df], ignore_index=True, sort=False)

-         copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB
+         copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB,
+                                                                                        'PARAMETERS':VARCHAR(length=32000, charset='UNICODE')})

          print('Model Deployment Completed Successfully.')

@@ -1945,6 +1960,12 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model

              Default Value: False
              Types: bool

+         seed:
+             Optional Argument.
+             Specifies the random seed for reproducibility.
+             Default Value: 42
+             Types: int
+
      RETURNS:
          a tuple containing, model information and leaderboard.
      """

@@ -2103,6 +2124,12 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M

              session.
              Default Value: False
              Types: bool
+
+         seed:
+             Optional Argument.
+             Specifies the random seed for reproducibility.
+             Default Value: 42
+             Types: int

      RETURNS:
          a tuple containing, model information and leaderboard.

@@ -2324,6 +2351,9 @@ class AutoRegressor(AutoML):

      """
      DESCRIPTION:
          AutoRegressor is a special purpose AutoML feature to run regression specific tasks.
+         Note:
+             * configure.temp_object_type="VT" follows sequential execution.
+

      PARAMETERS:
          include:

@@ -2407,6 +2437,12 @@ class AutoRegressor(AutoML):

              session.
              Default Value: False
              Types: bool
+
+         seed:
+             Optional Argument.
+             Specifies the random seed for reproducibility.
+             Default Value: 42
+             Types: int

      RETURNS:
          Instance of AutoRegressor.

@@ -2555,6 +2591,9 @@ class AutoClassifier(AutoML):

      """
      DESCRIPTION:
          AutoClassifier is a special purpose AutoML feature to run classification specific tasks.
+         Note:
+             * configure.temp_object_type="VT" follows sequential execution.
+

      PARAMETERS:
          include:

@@ -2638,6 +2677,12 @@ class AutoClassifier(AutoML):

              session.
              Default Value: False
              Types: bool
+
+         seed:
+             Optional Argument.
+             Specifies the random seed for reproducibility.
+             Default Value: 42
+             Types: int

      RETURNS:
          Instance of AutoClassifier.
teradataml/automl/data_preparation.py CHANGED

@@ -16,7 +16,6 @@

  # Python libraries
  import numpy as np
  import pandas as pd
- import random
  import time
  import warnings

@@ -30,11 +29,9 @@ from teradataml import UtilFuncs, TeradataConstants

  from teradataml.common.garbagecollector import GarbageCollector
  from teradataml.common.messages import Messages, MessageCodes
  from teradataml.utils.validators import _Validators
- from teradataml import INTEGER
+ from teradataml import configure, INTEGER
+ from teradataml.common.constants import TeradataConstants

- # Control Randomnes
- random.seed(42)
- np.random.seed(42)

  class _DataPreparation:

@@ -117,6 +114,12 @@

              session.
              Default Value: False
              Types: bool
+
+         seed:
+             Optional Argument.
+             Specifies the random seed for reproducibility.
+             Default Value: 42
+             Types: int
          """
          self.data = data
          self.target_column = target_column

@@ -135,7 +138,13 @@

          self.table_name_mapping = {}

          self.data_types = {key: value for key, value in self.data._column_names_and_types}
-
+         self.seed = kwargs.get("seed", 42)
+         # np.random.seed() affects the random number generation in numpy and sklearn
+         # setting this changes the global state of the random number generator
+         # hence, setting the seed only if it is not None
+         if kwargs.get("seed") is not None:
+             np.random.seed(self.seed)
+

      def data_preparation(self,
                           auto = True):

@@ -262,25 +271,24 @@

          outlier_method = "Tukey"

          # List of columns for outlier processing.
-
+         # Excluding target column and excluded columns from outlier processing
+         outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns + ['id', self.target_column]]

-
-
-
-
-
-
-
-
-
-
-         if value
+         if len(outlier_columns) != 0:
+             # Detecting outlier percentage in each columns
+             outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
+
+             # Outlier Handling techniques
+             for i in outlier_percentage_df.itertuples():
+                 # Column Name
+                 col = i[0]
+                 # Outlier value
+                 value = i[1]
+                 # Dropping rows
+                 if value > 0.0 and value <= 8.0 :
                      columns_to_drop_rows.append(col)
-
-
-                 elif value> 8.0 and value <= 25.0:
-                     columns_to_impute.append(col)
+                 elif value > 8.0 and value <= 25.0:
+                     columns_to_impute.append(col)

          return columns_to_drop_rows, columns_to_impute

@@ -489,7 +497,7 @@

          train_data = pca_train.drop(columns=['id', self.target_column], axis=1)

          # Initialize and fit PCA
-         pca = PCA()
+         pca = PCA(random_state=self.seed)
          pca.fit(train_data)

@@ -497,7 +505,7 @@

          n = np.argmax(np.cumsum(variance) >= 0.95) + 1

          # Create a new instance of PCA with the optimal number of components
-         pca = PCA(n_components=n, random_state=
+         pca = PCA(n_components=n, random_state=self.seed)

          # Apply PCA on dataset
          X_train_pca = pca.fit_transform(train_data)

@@ -571,7 +579,7 @@

          # Random forest for RFE model
          RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
-         rf = RFModel(n_estimators=100, random_state=
+         rf = RFModel(n_estimators=100, random_state=self.seed)

          # Determine the scoring metric based on the number of unique classes
          score = 'r2' if not self.is_classification_type() \

@@ -665,10 +673,10 @@

                  scoring_metric = 'roc_auc'
              else:
                  scoring_metric = 'f1_macro'
-             estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=
+             estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=self.seed)
              parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
          else:
-             estimator = Lasso(random_state=
+             estimator = Lasso(random_state=self.seed)
              parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
              scoring_metric = "r2"

@@ -679,7 +687,7 @@

          # Applying hyperparameter tuning and optimizing score
          hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
-
+                                              scoring=scoring_metric, verbose=0)

          # Fitting the best result from hyperparameter
          hyperparameter_search.fit(train_features, train_target)

@@ -746,14 +754,20 @@

          train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
                                                                 table_type = TeradataConstants.TERADATA_TABLE,
                                                                 gc_on_quit=not persist)
+         # If configure.temp_object_type="VT", _generate_temp_table_name() returns the
+         # table name in fully qualified format.
+         train_table_name = UtilFuncs._extract_table_name(train_table_name)
+
          # Storing the table names in the table name mapping dictionary
          self.table_name_mapping['{}_train'.format(prefix)] = train_table_name

+         # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+         is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
          # Pushing data into database
          if self.is_classification_type():
-             copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+             copy_to_sql(df=data, table_name=train_table_name, temporary=is_temporary, if_exists="replace", types={f'{self.target_column}': INTEGER})
          else:
-             copy_to_sql(df=data, table_name=train_table_name, if_exists="replace")
+             copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", temporary=is_temporary)

      def _scaling_features_helper(self,
                                   data=None,

@@ -856,6 +870,7 @@

          # List of columns to copy to the output generated by scale transform
          accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
+

          # Scaling dataset
          transform_obj = ScaleTransform(data=data_to_scale,

@@ -867,6 +882,8 @@

                            data=scaled_df,
                            progress_bar=self.progress_bar)
          else:
+             # No columns to scale, Original data will be used
+             scaled_df = data_to_scale
              self._display_msg(msg="No columns to scale.",
                                progress_bar=self.progress_bar)

@@ -915,10 +932,16 @@

          # Assigning data to target dataframe
          target_df = self.data
          # Detecting list of float columns on target dataset
-         float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]
+         float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float", "decimal.Decimal"]]

          if len(float_columns) == 0:
-
+             cols = target_df.columns
+             # Doing reset index to get index column
+             df = target_df.to_pandas().reset_index()
+
+             # Returning the dataframe with cols
+             # to avoid extra columns generated by reset_index()
+             return df[cols]

          # storing the column details for round up in data transformation dictionary
          self.data_transform_dict["round_columns"] = float_columns