teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (84)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +119 -0
  3. teradataml/_version.py +1 -1
  4. teradataml/analytics/analytic_function_executor.py +18 -6
  5. teradataml/analytics/byom/__init__.py +1 -1
  6. teradataml/analytics/sqle/__init__.py +4 -1
  7. teradataml/analytics/valib.py +18 -4
  8. teradataml/automl/__init__.py +51 -6
  9. teradataml/automl/data_preparation.py +56 -33
  10. teradataml/automl/data_transformation.py +58 -33
  11. teradataml/automl/feature_engineering.py +12 -5
  12. teradataml/automl/model_training.py +34 -13
  13. teradataml/common/__init__.py +1 -2
  14. teradataml/common/constants.py +64 -40
  15. teradataml/common/messagecodes.py +13 -3
  16. teradataml/common/messages.py +4 -1
  17. teradataml/common/sqlbundle.py +40 -10
  18. teradataml/common/utils.py +113 -39
  19. teradataml/common/warnings.py +11 -0
  20. teradataml/context/context.py +141 -17
  21. teradataml/data/amazon_reviews_25.csv +26 -0
  22. teradataml/data/byom_example.json +11 -0
  23. teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
  24. teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
  25. teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
  26. teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
  27. teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
  28. teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
  29. teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
  30. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
  31. teradataml/data/hnsw_alter_data.csv +5 -0
  32. teradataml/data/hnsw_data.csv +10 -0
  33. teradataml/data/jsons/byom/h2opredict.json +1 -1
  34. teradataml/data/jsons/byom/onnxembeddings.json +266 -0
  35. teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
  36. teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
  37. teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
  38. teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
  39. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
  40. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +1 -1
  41. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +5 -5
  42. teradataml/data/teradataml_example.json +8 -0
  43. teradataml/data/vectordistance_example.json +1 -1
  44. teradataml/dataframe/copy_to.py +8 -3
  45. teradataml/dataframe/data_transfer.py +11 -1
  46. teradataml/dataframe/dataframe.py +517 -121
  47. teradataml/dataframe/dataframe_utils.py +152 -20
  48. teradataml/dataframe/functions.py +26 -11
  49. teradataml/dataframe/setop.py +11 -6
  50. teradataml/dataframe/sql.py +2 -2
  51. teradataml/dbutils/dbutils.py +525 -129
  52. teradataml/hyperparameter_tuner/optimizer.py +12 -1
  53. teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +317 -1011
  54. teradataml/opensource/_class.py +141 -17
  55. teradataml/opensource/{constants.py → _constants.py} +7 -3
  56. teradataml/opensource/_lightgbm.py +52 -53
  57. teradataml/opensource/_sklearn.py +1008 -0
  58. teradataml/opensource/_wrapper_utils.py +5 -5
  59. teradataml/options/__init__.py +47 -15
  60. teradataml/options/configure.py +103 -25
  61. teradataml/options/display.py +13 -2
  62. teradataml/plot/axis.py +47 -8
  63. teradataml/plot/figure.py +33 -0
  64. teradataml/plot/plot.py +63 -13
  65. teradataml/scriptmgmt/UserEnv.py +2 -2
  66. teradataml/scriptmgmt/lls_utils.py +63 -26
  67. teradataml/store/__init__.py +1 -2
  68. teradataml/store/feature_store/feature_store.py +102 -7
  69. teradataml/table_operators/Apply.py +32 -18
  70. teradataml/table_operators/Script.py +3 -1
  71. teradataml/table_operators/TableOperator.py +3 -1
  72. teradataml/utils/dtypes.py +47 -0
  73. teradataml/utils/internal_buffer.py +18 -0
  74. teradataml/utils/validators.py +68 -9
  75. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +123 -2
  76. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +79 -75
  77. teradataml/data/SQL_Fundamentals.pdf +0 -0
  78. teradataml/libaed_0_1.dylib +0 -0
  79. teradataml/libaed_0_1.so +0 -0
  80. teradataml/opensource/sklearn/__init__.py +0 -0
  81. teradataml/store/vector_store/__init__.py +0 -1586
  82. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
  83. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
  84. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
teradataml/README.md CHANGED
@@ -17,6 +17,125 @@ Copyright 2024, Teradata. All Rights Reserved.
 
 ## Release Notes:
 
+ #### teradataml 20.00.00.04
+ * ##### New Features/Functionality
+   * ###### teradataml OTF Support:
+     * This release enables support for accessing OTF data from teradataml.
+     * Users can now create a teradataml DataFrame on an OTF table, allowing them to use teradataml functions on it.
+     * Example usage below:
+       * Creating a view on an OTF/datalake table is not supported. Hence, users have to set `configure.temp_object_type` to `VT` using the statement below.
+         ```configure.temp_object_type = "VT"```
+       * Users need to provide additional information about the datalake while creating the DataFrame. There are two approaches to provide the datalake information:
+         * Approach 1: Using `in_schema()`
+           ```
+           >>> from teradataml.dataframe.dataframe import in_schema
+           # Create an in_schema object to provide additional information about the datalake.
+           >>> in_schema_tbl = in_schema(schema_name="datalake_db",
+           ...                           table_name="datalake_table_name",
+           ...                           datalake_name="datalake")
+           >>> otf_df = DataFrame(in_schema_tbl)
+           ```
+         * Approach 2: Using `DataFrame.from_table()`
+           ```
+           >>> otf_df = DataFrame.from_table(table_name="datalake_table_name",
+           ...                               schema_name="datalake_db",
+           ...                               datalake_name="datalake")
+           ```
+     * Once this DataFrame is created, users can use any DataFrame method or analytics feature/functionality from teradataml with it. Visit the "Limitations and considerations" section in the _Teradata Python Package User Guide_ to check supportability.
+     * Note: All further operations create volatile tables in the local database.
+       ```
+       >>> new_df = otf_df.assign(new_col=otf_df.existing_col*2)
+       ```
+   * ###### teradataml: DataFrame
+     * Introduced a new feature, 'Exploratory Data Analysis UI' (EDA-UI), which enhances the user experience of teradataml with Jupyter notebooks. The EDA-UI is displayed by default when a teradataml DataFrame is printed in a Jupyter notebook.
+     * Users can control the EDA-UI using a new configuration option `display.enable_ui`. It can be disabled by setting `display.enable_ui` to False.
+     * New Function
+       * `get_output()` is added to get the result of an Analytic function when executed from the EDA UI.
+
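The `display.enable_ui` switch described above can be toggled per session through the teradataml `display` options object. A minimal sketch; only the option name `display.enable_ui` comes from the notes, the connection details and table name are placeholders:

```python
from teradataml import create_context, DataFrame, display

# Placeholder credentials; assumes a reachable Vantage system.
create_context(host="<host>", username="<user>", password="<password>")

df = DataFrame("titanic")   # any existing table

# Printing the DataFrame in a Jupyter notebook now renders the EDA-UI by default.
df

# Turn the EDA-UI off for the rest of the session and get the plain preview back.
display.enable_ui = False
df
```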
+   * ###### OpensourceML
+     * `td_lightgbm` - A teradataml OpenSourceML module
+       * `deploy()` - Users can now deploy the models created by the lightgbm `Booster` and `sklearn` modules. Deploying a model stores it in Vantage for future use with `td_lightgbm`.
+         * `td_lightgbm.deploy()` - Deploys a lightgbm `Booster` or any `scikit-learn` model trained outside Vantage.
+         * `td_lightgbm.train().deploy()` - Deploys a lightgbm `Booster` object trained within Vantage.
+         * `td_lightgbm.<sklearn_class>().deploy()` - Deploys a lightgbm sklearn class object created/trained within Vantage.
+       * `load()` - Users can load the deployed models back in the current session. This allows them to use the lightgbm functions with the `td_lightgbm` module.
+         * `td_lightgbm.load()` - Loads a deployed model in the current session.
+
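A hedged sketch of the deploy/load round trip named above. Only the entry points (`deploy()` on a trained object and `td_lightgbm.load()`) come from the notes; the `model_name` keyword, the table, and the column names are assumptions, and `td_lightgbm` is taken to mirror lightgbm's sklearn interface:

```python
from teradataml import td_lightgbm, DataFrame

df = DataFrame("housing_train")                      # existing training table

# Mirrors lightgbm's sklearn API; exact fit() inputs may differ in practice.
model = td_lightgbm.LGBMRegressor(n_estimators=50)
model.fit(df.select(["area", "rooms"]), df.select(["price"]))

# Persist the fitted model in Vantage so it can be reused later.
model.deploy(model_name="lgb_price_model")           # 'model_name' is assumed

# In a later session: bring the deployed model back and keep scoring with it.
restored = td_lightgbm.load(model_name="lgb_price_model")
```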
+   * ###### FeatureStore
+     * New function `FeatureStore.delete()` is added to drop the Feature Store and the corresponding repo from Vantage.
+
+   * ###### Database Utility
+     * `db_python_version_diff()` - Identifies the difference in the Python interpreter major version between the interpreter installed on Vantage and the one in the local user environment.
+     * `db_python_package_version_diff()` - Identifies Python package version differences between the packages installed on Vantage and those in the local user environment.
+
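The two environment-diff utilities above can be run before pushing Python code to Vantage. A minimal sketch; only the function names come from the notes and the `packages` keyword is an assumption:

```python
from teradataml import db_python_version_diff, db_python_package_version_diff

# Flag a mismatch in the Python major version between Vantage and this client.
db_python_version_diff()

# Compare versions of a few packages installed on both sides.
db_python_package_version_diff(packages=["numpy", "pandas"])  # 'packages' is assumed
```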
+   * ###### BYOM Function
+     * `ONNXEmbeddings()` - Calculates embedding values in Vantage using an embeddings model that has been created outside Vantage and stored in ONNX format.
+
+   * ###### teradataml Options
+     * Configuration Options
+       * `configure.temp_object_type` - Allows users to choose between creating volatile tables or views for teradataml internal use. By default, teradataml internally creates views for some operations. With the new configuration option, users can opt to create volatile tables instead of views. This provides greater flexibility for users who lack the necessary permissions to create views or who need to create views on tables without WITH GRANT permission.
+     * Display Options
+       * `display.enable_ui` - Specifies whether to display the exploratory data analysis UI when a DataFrame is printed. By default, this option is enabled (True), allowing the exploratory data analysis UI to be displayed. When set to False, the exploratory data analysis UI is hidden.
+
+ * ##### Updates
+   * ###### teradataml: DataFrame function
+     * `describe()`
+       * New argument added: `pivot`.
+       * When argument `pivot` is set to False, non-numeric columns are no longer supported for generating statistics. Use `CategoricalSummary` and `ColumnSummary` instead.
+     * `fillna()` - Accepts a new argument `partition_column` to partition the data and impute null values accordingly.
+     * Optimised performance for `DataFrame.plot()`.
+       * `DataFrame.plot()` will not regenerate the image when run more than once with the same arguments.
+     * `DataFrame.from_table()`: New argument `datalake_name` added to accept a datalake name while creating a DataFrame on a datalake table.
+
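A short, hedged illustration of the `describe(pivot=...)` and `fillna(partition_column=...)` updates listed above; the table and column names are placeholders, and argument usage other than the two new keywords is an assumption:

```python
from teradataml import DataFrame

df = DataFrame("sales")   # existing table with numeric and categorical columns

# pivot=False restricts statistics to numeric columns (use CategoricalSummary /
# ColumnSummary for the non-numeric ones, per the notes above).
stats = df.describe(pivot=False)

# Impute nulls separately within each region rather than across the whole table.
filled = df.fillna(value=0, partition_column="region")   # 'value' usage is assumed
```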
+   * ###### teradataml: DataFrame Utilities
+     * `in_schema()`: New argument `datalake_name` added to accept a datalake name.
+
+   * ###### Table Operator
+     * `Apply()` no longer looks for an authentication token by default. An authentication token is now required only if users want to update the backend Open Analytics Framework service.
+
+   * ###### Hyper Parameter Tuner
+     * `GridSearch()` and `RandomSearch()` now display a message referring to the `get_error_log()` API when model training fails in HPT.
+
+   * ###### teradataml Options
+     * Configuration Options
+       * `configure.indb_install_location` - Determines the installation location of the In-DB Python package based on the installed RPM version.
+
+   * ###### teradataml Context Creation
+     * `create_context()` - Enables users to create a connection using either parameters set in the environment or a config file, in addition to the previous method. The newly added options help users hide sensitive data from the script.
+
+   * ###### Open Analytics Framework
+     * Enhanced `create_env()` to display a message when an invalid base_env is passed, informing users that the default base_env is being used.
+
+   * ###### OpensourceML
+     * Raises a TeradataMlException if the Python interpreter major version differs between the Vantage Python environment and the local user environment.
+     * Displays a warning if specific Python package versions differ between the Vantage Python environment and the local user environment.
+
+   * ###### Database Utility
+     * `db_list_tables()`: New argument `datalake_name` added to accept a datalake name to list tables from.
+     * `db_drop_table()`:
+       * New argument `datalake_name` added to accept a datalake name to drop tables from.
+       * New argument `purge` added to specify whether to use the `PURGE ALL` or `NO PURGE` clause while dropping a table.
+
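A hedged sketch of the datalake-aware utility calls above; the datalake, database, and table names are placeholders, and the interpretation of `purge` follows the note in the list:

```python
from teradataml import db_list_tables, db_drop_table

# List the tables available in a datalake database.
db_list_tables(schema_name="datalake_db", datalake_name="datalake")

# Drop a datalake table, choosing the PURGE ALL clause explicitly.
db_drop_table(table_name="datalake_table_name",
              schema_name="datalake_db",
              datalake_name="datalake",
              purge=True)   # True -> PURGE ALL, False -> NO PURGE (per the notes)
```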
+ * ##### Bug Fixes
+   * `td_lightgbm` OpensourceML module: In the multi-model case, `td_lightgbm.Dataset().add_features_from()` should add the features of one partition in the first Dataset to the features of the same partition in the second Dataset. Previously this was not the case and the function failed. This is now fixed.
+   * Fixed a minor bug in `Shap()` and converted the argument `training_method` to a required argument.
+   * Fixed PCA-related warnings in `AutoML`.
+   * `AutoML` no longer fails when data with all categorical columns is provided.
+   * Fixed an `AutoML` issue with the upsampling method.
+   * Excluded the identifier column from outlier processing in `AutoML`.
+   * `DataFrame.set_index()` no longer modifies the original DataFrame's index when the argument `append` is used.
+   * The `concat()` function now supports DataFrames whose column names start with a digit, contain special characters, or are reserved keywords.
+   * `create_env()` proceeds to install the other files even if the current file installation fails.
+   * Corrected the error message raised in `create_env()` when authentication is not set.
+   * Added the missing argument `charset` for Vantage Analytic Library functions.
+   * New argument `seed` is added to `AutoML`, `AutoRegressor` and `AutoClassifier` to ensure consistent results.
+   * Analytic functions now work even if the column names of the underlying tables contain non-ASCII characters.
+
 #### teradataml 20.00.00.03
 
 * teradataml no longer supports setting the `auth_token` using `set_config_params()`. Users should use `set_auth_token()` to set the token.
teradataml/_version.py CHANGED
@@ -8,4 +8,4 @@
 #
 # ##################################################################
 
-version = "20.00.00.03"
+version = "20.00.00.04"
teradataml/analytics/analytic_function_executor.py CHANGED
@@ -482,17 +482,20 @@ class _AnlyticFunctionExecutor:
 
         # Validate column is existed or not in the table.
         _Validators._validate_dataframe_has_argument_columns(
-            arg_value, arg_name, dataframe, target_table_argument_name)
+            arg_value, arg_name, dataframe, target_table_argument_name, case_insensitive=True)
 
         # Append square brackets for column range when function
         # does not require special case handler.
         arg_value = self._spl_func_obj._add_square_bracket(arg_value)
 
+        # Check if there are columns with non-ASCII characters.
+        if UtilFuncs._is_ascii(arg_value):
+            arg_value = UtilFuncs._teradata_quote_arg(arg_value, "\"", False)
         # Handling special case for Teradata reserved keywords or column names with spaces.
         # If argument is a string or list of strings, then add quotes to the string.
-        if arg_name not in ["partition_columns"] and (\
+        elif arg_name not in ["partition_columns"] and (\
                 UtilFuncs._contains_space(arg_value) or list_td_reserved_keywords(arg_value)):
-            arg_value = UtilFuncs._teradata_quote_arg(arg_value, "\"", False)
+            arg_value = UtilFuncs._teradata_quote_arg(arg_value, "\"", False)
 
         # SequenceInputBy arguments require special processing.
         if 500 <= argument.get_r_order_number() <= 510:
@@ -717,10 +720,17 @@ class _AnlyticFunctionExecutor:
         kwargs.update(kwargs.pop("generic_arguments", {}))
 
         # Add all arguments to dynamic class as data members.
+        global_volatile = False
+        if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+            global_volatile = True
 
         start_time = time.time()
         persist = kwargs.get("persist", False)
-        volatile = kwargs.get("volatile", False)
+        # Use global volatile only when persist argument is False. If persist argument
+        # is True, then volatile can't be used whether it is global volatile or normal
+        # volatile. If it is normal volatile, then it will raise
+        # `CANNOT_USE_TOGETHER_WITH` error below.
+        volatile = kwargs.get("volatile", global_volatile if not persist else False)
         display_table_name = kwargs.get("display_table_name", True)
 
         # Validate local_order_column argument type and values.
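The precedence expressed in the hunk above (an explicit `volatile` keyword wins, otherwise the session-wide `configure.temp_object_type = "VT"` default applies only when `persist` is False) can be restated in a small stand-alone sketch. The names here are illustrative, not the executor's internals:

```python
def resolve_volatile(kwargs, temp_object_type_is_vt):
    """Illustrative re-statement of the default resolution in the hunk above."""
    persist = kwargs.get("persist", False)
    # The session-wide VT default is honoured only when the result is not persisted.
    global_volatile = temp_object_type_is_vt and not persist
    # An explicit 'volatile' keyword always overrides the derived default.
    return kwargs.get("volatile", global_volatile)

assert resolve_volatile({}, temp_object_type_is_vt=True) is True
assert resolve_volatile({"persist": True}, temp_object_type_is_vt=True) is False
assert resolve_volatile({"volatile": False}, temp_object_type_is_vt=True) is False
```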
@@ -1039,7 +1049,8 @@ class _SQLEFunctionExecutor(_AnlyticFunctionExecutor):
         _Validators._validate_dataframe_has_argument_columns(arg_value,
                                                              arg,
                                                              input_table_arg_value,
-                                                             input_table_arg
+                                                             input_table_arg,
+                                                             case_insensitive=True
                                                              )
 
         order_column_arg_value = UtilFuncs._teradata_collapse_arglist(order_column_arg_value, "\"")
@@ -1491,7 +1502,8 @@ class _TableOperatorExecutor(_SQLEFunctionExecutor):
         _Validators._validate_dataframe_has_argument_columns(hash_column_value,
                                                              hash_column_arg,
                                                              input_table_arg_value,
-                                                             input_table_arg
+                                                             input_table_arg,
+                                                             case_insensitive=True
                                                              )
 
         # Hash and order by can be used together as long as is_local_order = True.
teradataml/analytics/byom/__init__.py CHANGED
@@ -4,7 +4,7 @@ from teradataml.analytics.byom.PMMLPredict import PMMLPredict
 from teradataml.analytics.meta_class import _AnalyticFunction
 from teradataml.analytics.meta_class import _common_init, _common_dir
 
-_byom_functions = ['H2OPredict', 'PMMLPredict', 'ONNXPredict', 'DataikuPredict', 'DataRobotPredict']
+_byom_functions = ['H2OPredict', 'PMMLPredict', 'ONNXPredict', 'DataikuPredict', 'DataRobotPredict', 'ONNXEmbeddings']
 
 for func in _byom_functions:
     globals()[func] = type("{}".format(func), (_AnalyticFunction,),
teradataml/analytics/sqle/__init__.py CHANGED
@@ -95,7 +95,10 @@ _sqle_functions = ['ANOVA',
                    'WordEmbeddings',
                    'XGBoost',
                    'XGBoostPredict',
-                   'ZTest'
+                   'ZTest',
+                   'HNSW',
+                   'HNSWPredict',
+                   'HNSWSummary',
                    ]
 
 for func in _sqle_functions:
teradataml/analytics/valib.py CHANGED
@@ -26,6 +26,8 @@ from teradataml.dataframe.dataframe import DataFrame, in_schema
 from teradataml.utils.validators import _Validators
 from teradataml.analytics.Transformations import Binning, Derive, OneHotEncoder, FillNa, \
     LabelEncoder, MinMaxScalar, Retain, Sigmoid, ZScore
+from teradataml.common.constants import TeradataReservedKeywords, TeradataConstants
+
 
 class _VALIB():
     """ An internal class for executing VALIB analytic functions. """
@@ -370,9 +372,16 @@ class _VALIB():
             self.__get_temp_table_name()
         """
         prefix = "valib_{}".format(self.__tdml_valib_name.lower())
-        return UtilFuncs._generate_temp_table_name(prefix=prefix, use_default_database=True,
-                                                   gc_on_quit=True, quote=False,
-                                                   table_type=TeradataConstants.TERADATA_TABLE)
+        tbl_name = UtilFuncs._generate_temp_table_name(prefix=prefix, use_default_database=True,
+                                                       gc_on_quit=True, quote=False,
+                                                       table_type=TeradataConstants.TERADATA_TABLE)
+        # With VT option, table name is getting generated with 'vt_'.
+        # But its not getting created as Volatile table. Hence
+        # explicitly garbage collecting.
+        if configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE:
+            GarbageCollector._add_to_garbagecollector(tbl_name,
+                                                      TeradataConstants.TERADATA_TABLE)
+        return tbl_name
 
     def __process_dyn_cls_output_member(self, arg_name, out_tablename, out_var=None):
         """
@@ -447,6 +456,7 @@ class _VALIB():
         # Add extension to the table name.
         generated_table_name = "{}{}".format(table_name, extension)
 
+
         # Register new output table to the GC.
         gc_tabname = "\"{}\".\"{}\"".format(self.__db_name, generated_table_name)
         GarbageCollector._add_to_garbagecollector(gc_tabname, TeradataConstants.TERADATA_TABLE)
@@ -1463,7 +1473,7 @@ class _VALIB():
         if gen_sql_only:
             valib_inst.__generate_valib_sql_argument_syntax(arg=str(gen_sql_only),
                                                             arg_name="gensqlonly")
-
+        charset = kwargs.pop("charset", None)
         # Raise error if there are additional arguments.
         if len(kwargs) != 0:
             err_ = "The keyword arguments for Overlap() should have data1, data2, ..., dataN " \
@@ -1478,6 +1488,10 @@ class _VALIB():
                                                         arg_name="tablename")
         valib_inst.__generate_valib_sql_argument_syntax(arg=",".join(column_names_df),
                                                         arg_name="columns")
+        # Generate clause of charset.
+        if charset:
+            valib_inst.__generate_valib_sql_argument_syntax(arg=charset,
+                                                            arg_name="charset")
 
         return valib_inst._execute_valib_function(skip_data_arg_processing=True,
                                                   skip_other_arg_processing=True)
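The new `charset` keyword popped from `kwargs` in the hunks above is forwarded to the VAL `Overlap()` analysis. A hedged usage sketch; the tables, the `columns` value, and the VAL install location are placeholders, and only the `charset` keyword itself comes from the diff:

```python
from teradataml import valib, DataFrame, configure

configure.val_install_location = "VAL"     # wherever VAL is installed; placeholder

df1 = DataFrame("customer_2023")
df2 = DataFrame("customer_2024")

# Overlap analysis across the two tables, now forwarding a charset clause.
result = valib.Overlap(data1=df1, data2=df2,
                       columns="customer_id",
                       charset="UNICODE")   # new keyword per the diff above
```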
teradataml/automl/__init__.py CHANGED
@@ -30,7 +30,7 @@ from teradataml import ColumnExpression
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml.utils.utils import execute_sql
 from teradataml.utils.validators import _Validators
-from teradataml import ROC, BLOB
+from teradataml import ROC, BLOB, VARCHAR
 from teradataml.utils.dtypes import _Dtypes
 from teradataml.common.utils import UtilFuncs
 from teradataml import TeradataMlException
@@ -94,6 +94,9 @@ class AutoML:
         the processes by passing the JSON file path in case of custom run. It also
         supports early stopping of model training based on stopping metrics,
         maximum running time and maximum models to be trained.
+        Note:
+            * configure.temp_object_type="VT" follows sequential execution.
+
 
     PARAMETERS:
         task_type:
@@ -187,6 +190,12 @@
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
 
     RETURNS:
         Instance of AutoML.
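The `seed` argument documented above is passed straight through the AutoML constructor. A minimal hedged sketch; the table and target names are placeholders:

```python
from teradataml import AutoML, DataFrame

df = DataFrame("housing_train")            # existing training table

# Fixing the seed makes repeated runs produce consistent results (per the notes).
aml = AutoML(task_type="Regression", seed=42)
aml.fit(df, "price")                       # target column name is a placeholder
```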
@@ -417,9 +426,11 @@
 
         volatile = kwargs.get('volatile', False)
         persist = kwargs.get('persist', False)
+        seed = kwargs.get('seed', 42)
 
         arg_info_matrix.append(["volatile", volatile, True, (bool)])
         arg_info_matrix.append(["persist", persist, True, (bool)])
+        arg_info_matrix.append(["seed", seed, True, (int)])
 
         # Validate argument types
         _Validators._validate_function_arguments(arg_info_matrix)
@@ -517,7 +528,7 @@
 
         # Validate argument types
         _Validators._validate_function_arguments(arg_info_fit_matrix)
-
+
         # Initializing class variables
         self.data = data
         self.target_column = target_column
@@ -758,11 +769,12 @@
         if self.target_column_ind:
             prediction_column = 'prediction' if 'prediction' in pred.result.columns else 'Prediction'
             probability_column = 'prob_1'
+            pred_target_count = pred.result.drop_duplicate(self.target_column).size
             # Displaying confusion matrix and ROC-AUC for classification problem
             if self.is_classification_type():
                 print_data = lambda data: print(data) if _is_terminal() else display(data)
                 # Displaying ROC-AUC for binary classification
-                if self.target_count == 2:
+                if self.target_count == 2 and pred_target_count == 2:
                     fit_params = {
                         "probability_column" : probability_column,
                         "observation_column" : self.target_column,
@@ -886,8 +898,8 @@
         # as it is required for evaluation.
         if self.target_column not in data.columns:
             raise TeradataMlException(
-                Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
-                MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
+                Messages.get_message(MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE).format(self.target_column),
+                MessageCodes.TARGET_COL_NOT_FOUND_FOR_EVALUATE)
 
         # Checking if data is already transformed before or not
         data_node_id = data._nodeid
@@ -1234,6 +1246,8 @@
             pca.n_components_ = load_pca_info['n_components']
             pca.noise_variance_ = load_pca_info['noise_variance']
             pca.singular_values_ = np.array(load_pca_info['singular_values'])
+            pca.feature_names_in_ = data_params['pca_fit_columns']
+            pca.n_features_in_ = len(data_params['pca_fit_columns'])
 
             data_params['pca_fit_instance'] = pca
 
@@ -1442,7 +1456,8 @@
         # Saving data transformation parameters to the specified table
         sv_models = pd.concat([sv_models, df], ignore_index=True, sort=False)
 
-        copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB})
+        copy_to_sql(df = sv_models, table_name=table_name, if_exists='replace', types={'DATA_PARAMS':BLOB,
+                                                                                       'PARAMETERS':VARCHAR(length=32000, charset='UNICODE')})
 
         print('Model Deployment Completed Successfully.')
 
@@ -1945,6 +1960,12 @@ class _Regression(_FeatureExplore, _FeatureEngineering, _DataPreparation, _Model
             Default Value: False
             Types: bool
 
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
+
     RETURNS:
         a tuple containing, model information and leaderboard.
     """
@@ -2103,6 +2124,12 @@ class _Classification(_FeatureExplore, _FeatureEngineering, _DataPreparation, _M
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
 
     RETURNS:
         a tuple containing, model information and leaderboard.
@@ -2324,6 +2351,9 @@ class AutoRegressor(AutoML):
     """
     DESCRIPTION:
         AutoRegressor is a special purpose AutoML feature to run regression specific tasks.
+        Note:
+            * configure.temp_object_type="VT" follows sequential execution.
+
 
     PARAMETERS:
         include:
@@ -2407,6 +2437,12 @@
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
 
     RETURNS:
         Instance of AutoRegressor.
@@ -2555,6 +2591,9 @@ class AutoClassifier(AutoML):
     """
     DESCRIPTION:
         AutoClassifier is a special purpose AutoML feature to run classification specific tasks.
+        Note:
+            * configure.temp_object_type="VT" follows sequential execution.
+
 
     PARAMETERS:
         include:
@@ -2638,6 +2677,12 @@
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
 
     RETURNS:
         Instance of AutoClassifier.
teradataml/automl/data_preparation.py CHANGED
@@ -16,7 +16,6 @@
 # Python libraries
 import numpy as np
 import pandas as pd
-import random
 import time
 import warnings
 
@@ -30,11 +29,9 @@ from teradataml import UtilFuncs, TeradataConstants
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.messages import Messages, MessageCodes
 from teradataml.utils.validators import _Validators
-from teradataml import INTEGER
+from teradataml import configure, INTEGER
+from teradataml.common.constants import TeradataConstants
 
-# Control Randomnes
-random.seed(42)
-np.random.seed(42)
 
 class _DataPreparation:
 
@@ -117,6 +114,12 @@ class _DataPreparation:
             session.
             Default Value: False
             Types: bool
+
+        seed:
+            Optional Argument.
+            Specifies the random seed for reproducibility.
+            Default Value: 42
+            Types: int
         """
         self.data = data
         self.target_column = target_column
@@ -135,7 +138,13 @@
         self.table_name_mapping = {}
 
         self.data_types = {key: value for key, value in self.data._column_names_and_types}
-
+        self.seed = kwargs.get("seed", 42)
+        # np.random.seed() affects the random number generation in numpy and sklearn
+        # setting this changes the global state of the random number generator
+        # hence, setting the seed only if it is not None
+        if kwargs.get("seed") is not None:
+            np.random.seed(self.seed)
+
 
     def data_preparation(self,
                          auto = True):
@@ -262,25 +271,24 @@
         outlier_method = "Tukey"
 
         # List of columns for outlier processing.
-        outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns]
+        # Excluding target column and excluded columns from outlier processing
+        outlier_columns = [col for col in self.data.columns if col not in self.excluded_columns + ['id', self.target_column]]
 
-        # Detecting outlier percentage in each columns
-        outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
-
-        # Outlier Handling techniques
-        for i in outlier_percentage_df.itertuples():
-            # Column Name
-            col = i[0]
-            # Outlier value
-            value = i[1]
-
-            if col == self.target_column:
-                if value < 5.0 and value > 0.0:
+        if len(outlier_columns) != 0:
+            # Detecting outlier percentage in each columns
+            outlier_percentage_df = self._outlier_detection(outlier_method, outlier_columns)
+
+            # Outlier Handling techniques
+            for i in outlier_percentage_df.itertuples():
+                # Column Name
+                col = i[0]
+                # Outlier value
+                value = i[1]
+                # Dropping rows
+                if value > 0.0 and value <= 8.0 :
                     columns_to_drop_rows.append(col)
-            elif value > 0.0 and value <= 8.0 :
-                columns_to_drop_rows.append(col)
-            elif value> 8.0 and value <= 25.0:
-                columns_to_impute.append(col)
+                elif value> 8.0 and value <= 25.0:
+                    columns_to_impute.append(col)
 
         return columns_to_drop_rows, columns_to_impute
 
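The thresholds in the hunk above (rows dropped when a column's outlier share is in (0, 8] percent, values imputed when it is in (8, 25] percent) can be restated in a small stand-alone sketch. This is illustrative only and not the class's internal API:

```python
def route_outlier_columns(outlier_percentages):
    """Split columns by outlier share, mirroring the thresholds in the hunk above."""
    drop_rows, impute = [], []
    for col, pct in outlier_percentages.items():
        if 0.0 < pct <= 8.0:
            drop_rows.append(col)      # few outliers: drop the affected rows
        elif 8.0 < pct <= 25.0:
            impute.append(col)         # more outliers: impute instead of dropping
    return drop_rows, impute

drop_rows, impute = route_outlier_columns({"age": 3.2, "income": 12.5, "score": 40.0})
# -> (['age'], ['income']); columns above 25% are left untouched
```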
@@ -489,7 +497,7 @@
         train_data = pca_train.drop(columns=['id', self.target_column], axis=1)
 
         # Initialize and fit PCA
-        pca = PCA()
+        pca = PCA(random_state=self.seed)
         pca.fit(train_data)
 
         # Find the number of components for PCA
@@ -497,7 +505,7 @@
         n = np.argmax(np.cumsum(variance) >= 0.95) + 1
 
         # Create a new instance of PCA with the optimal number of components
-        pca = PCA(n_components=n, random_state=42)
+        pca = PCA(n_components=n, random_state=self.seed)
 
         # Apply PCA on dataset
         X_train_pca = pca.fit_transform(train_data)
@@ -571,7 +579,7 @@
 
         # Random forest for RFE model
         RFModel = RandomForestRegressor if not is_classification else RandomForestClassifier
-        rf = RFModel(n_estimators=100, random_state=42)
+        rf = RFModel(n_estimators=100, random_state=self.seed)
 
         # Determine the scoring metric based on the number of unique classes
         score = 'r2' if not self.is_classification_type() \
@@ -665,10 +673,10 @@
             scoring_metric = 'roc_auc'
         else:
             scoring_metric = 'f1_macro'
-        estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=42)
+        estimator = LogisticRegression(solver='saga', penalty='l2', multi_class='auto', random_state=self.seed)
         parameters = {'C':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
     else:
-        estimator = Lasso(random_state=42)
+        estimator = Lasso(random_state=self.seed)
         parameters = {'alpha':[0.00001,0.0001,0.001,0.01,0.05,0.1,10,100,1000], 'max_iter': [100, 500]}
         scoring_metric = "r2"
 
@@ -679,7 +687,7 @@
 
         # Applying hyperparameter tuning and optimizing score
         hyperparameter_search = GridSearchCV(estimator, parameters, cv=cv, refit=True,
-                                             scoring=scoring_metric, verbose=0)
+                                             scoring=scoring_metric, verbose=0)
 
         # Fitting the best result from hyperparameter
         hyperparameter_search.fit(train_features, train_target)
@@ -746,14 +754,20 @@
         train_table_name = UtilFuncs._generate_temp_table_name(prefix='{}_train'.format(prefix),
                                                                table_type = TeradataConstants.TERADATA_TABLE,
                                                                gc_on_quit=not persist)
+        # If configure.temp_object_type="VT", _generate_temp_table_name() retruns the
+        # table name in fully qualified format.
+        train_table_name = UtilFuncs._extract_table_name(train_table_name)
+
         # Storing the table names in the table name mapping dictionary
         self.table_name_mapping['{}_train'.format(prefix)] = train_table_name
 
+        # In the case of the VT option, the table was being persisted, so the VT condition is being checked.
+        is_temporary = configure.temp_object_type == TeradataConstants.TERADATA_VOLATILE_TABLE
         # Pushing data into database
         if self.is_classification_type():
-            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", types={f'{self.target_column}': INTEGER})
+            copy_to_sql(df=data, table_name=train_table_name, temporary=is_temporary, if_exists="replace", types={f'{self.target_column}': INTEGER})
         else:
-            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace")
+            copy_to_sql(df=data, table_name=train_table_name, if_exists="replace", temporary=is_temporary)
 
     def _scaling_features_helper(self,
                                  data=None,
@@ -856,6 +870,7 @@
 
         # List of columns to copy to the output generated by scale transform
         accumulate_cols = list(set(data_to_scale.columns) - set(scale_col))
+
 
         # Scaling dataset
         transform_obj = ScaleTransform(data=data_to_scale,
@@ -867,6 +882,8 @@
                               data=scaled_df,
                               progress_bar=self.progress_bar)
         else:
+            # No columns to scale, Original data will be used
+            scaled_df = data_to_scale
             self._display_msg(msg="No columns to scale.",
                               progress_bar=self.progress_bar)
 
@@ -915,10 +932,16 @@
         # Assigning data to target dataframe
         target_df = self.data
         # Detecting list of float columns on target dataset
-        float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float"]]
+        float_columns =[col for col, d_type in target_df._column_names_and_types if d_type in ["float", "decimal.Decimal"]]
 
         if len(float_columns) == 0:
-            return target_df.to_pandas()
+            cols = target_df.columns
+            # Doing reset index to get index column
+            df = target_df.to_pandas().reset_index()
+
+            # Returning the dataframe with cols
+            # to avoid extra columns generated by reset_index()
+            return df[cols]
 
         # storing the column details for round up in data transformation dictionary
         self.data_transform_dict["round_columns"] = float_columns