teradataml 20.0.0.3__py3-none-any.whl → 20.0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml might be problematic.

Files changed (151)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +193 -1
  3. teradataml/__init__.py +2 -1
  4. teradataml/_version.py +2 -2
  5. teradataml/analytics/analytic_function_executor.py +25 -18
  6. teradataml/analytics/byom/__init__.py +1 -1
  7. teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
  8. teradataml/analytics/sqle/__init__.py +20 -2
  9. teradataml/analytics/utils.py +15 -1
  10. teradataml/analytics/valib.py +18 -4
  11. teradataml/automl/__init__.py +341 -112
  12. teradataml/automl/autodataprep/__init__.py +471 -0
  13. teradataml/automl/data_preparation.py +84 -42
  14. teradataml/automl/data_transformation.py +69 -33
  15. teradataml/automl/feature_engineering.py +76 -9
  16. teradataml/automl/feature_exploration.py +639 -25
  17. teradataml/automl/model_training.py +35 -14
  18. teradataml/clients/auth_client.py +2 -2
  19. teradataml/common/__init__.py +1 -2
  20. teradataml/common/constants.py +122 -63
  21. teradataml/common/messagecodes.py +14 -3
  22. teradataml/common/messages.py +8 -4
  23. teradataml/common/sqlbundle.py +40 -10
  24. teradataml/common/utils.py +366 -74
  25. teradataml/common/warnings.py +11 -0
  26. teradataml/context/context.py +348 -86
  27. teradataml/data/amazon_reviews_25.csv +26 -0
  28. teradataml/data/apriori_example.json +22 -0
  29. teradataml/data/byom_example.json +11 -0
  30. teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
  31. teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
  32. teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
  33. teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
  34. teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
  35. teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
  36. teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
  37. teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
  38. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
  39. teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
  40. teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
  41. teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
  42. teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
  43. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
  44. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
  45. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
  46. teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
  47. teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
  48. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
  49. teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
  50. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
  51. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
  52. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
  53. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
  54. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
  55. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  56. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
  57. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
  58. teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
  59. teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
  60. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  61. teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
  62. teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
  63. teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
  64. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  65. teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
  66. teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
  67. teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
  68. teradataml/data/hnsw_alter_data.csv +5 -0
  69. teradataml/data/hnsw_data.csv +10 -0
  70. teradataml/data/jsons/byom/h2opredict.json +1 -1
  71. teradataml/data/jsons/byom/onnxembeddings.json +266 -0
  72. teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
  73. teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
  74. teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
  75. teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
  76. teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
  77. teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
  78. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
  79. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
  80. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
  81. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
  82. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
  83. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
  84. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
  85. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
  86. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
  87. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
  88. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
  89. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
  90. teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
  91. teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
  92. teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
  93. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
  94. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +3 -3
  95. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +6 -6
  96. teradataml/data/ner_dict.csv +8 -0
  97. teradataml/data/ner_input_eng.csv +7 -0
  98. teradataml/data/ner_rule.csv +5 -0
  99. teradataml/data/pos_input.csv +40 -0
  100. teradataml/data/tdnerextractor_example.json +14 -0
  101. teradataml/data/teradataml_example.json +21 -0
  102. teradataml/data/textmorph_example.json +5 -0
  103. teradataml/data/to_num_data.csv +4 -0
  104. teradataml/data/tochar_data.csv +5 -0
  105. teradataml/data/trans_dense.csv +16 -0
  106. teradataml/data/trans_sparse.csv +55 -0
  107. teradataml/data/vectordistance_example.json +1 -1
  108. teradataml/dataframe/copy_to.py +45 -29
  109. teradataml/dataframe/data_transfer.py +72 -46
  110. teradataml/dataframe/dataframe.py +642 -166
  111. teradataml/dataframe/dataframe_utils.py +167 -22
  112. teradataml/dataframe/functions.py +135 -20
  113. teradataml/dataframe/setop.py +11 -6
  114. teradataml/dataframe/sql.py +330 -78
  115. teradataml/dbutils/dbutils.py +556 -140
  116. teradataml/dbutils/filemgr.py +14 -10
  117. teradataml/hyperparameter_tuner/optimizer.py +12 -1
  118. teradataml/lib/aed_0_1.dll +0 -0
  119. teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +168 -1013
  120. teradataml/opensource/_class.py +141 -17
  121. teradataml/opensource/{constants.py → _constants.py} +7 -3
  122. teradataml/opensource/_lightgbm.py +52 -53
  123. teradataml/opensource/_sklearn.py +1008 -0
  124. teradataml/opensource/_wrapper_utils.py +5 -5
  125. teradataml/options/__init__.py +47 -15
  126. teradataml/options/configure.py +103 -26
  127. teradataml/options/display.py +13 -2
  128. teradataml/plot/axis.py +47 -8
  129. teradataml/plot/figure.py +33 -0
  130. teradataml/plot/plot.py +63 -13
  131. teradataml/scriptmgmt/UserEnv.py +307 -40
  132. teradataml/scriptmgmt/lls_utils.py +428 -145
  133. teradataml/store/__init__.py +2 -3
  134. teradataml/store/feature_store/feature_store.py +102 -7
  135. teradataml/table_operators/Apply.py +48 -19
  136. teradataml/table_operators/Script.py +23 -2
  137. teradataml/table_operators/TableOperator.py +3 -1
  138. teradataml/table_operators/table_operator_util.py +58 -9
  139. teradataml/utils/dtypes.py +49 -1
  140. teradataml/utils/internal_buffer.py +38 -0
  141. teradataml/utils/validators.py +377 -62
  142. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +200 -4
  143. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +146 -112
  144. teradataml/data/SQL_Fundamentals.pdf +0 -0
  145. teradataml/libaed_0_1.dylib +0 -0
  146. teradataml/libaed_0_1.so +0 -0
  147. teradataml/opensource/sklearn/__init__.py +0 -0
  148. teradataml/store/vector_store/__init__.py +0 -1586
  149. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
  150. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
  151. {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
@@ -15,46 +15,42 @@
  #
  # ##################################################################

- from collections import OrderedDict, defaultdict
- from importlib import import_module
-
  import base64
  import json
- import numpy
  import os
  import pickle
- import time
- import inspect
  import warnings
- import json
- import math
+ from collections import OrderedDict, defaultdict
+ from importlib import import_module
+
  import pandas as pd
- from teradatasqlalchemy import BLOB, CLOB, FLOAT, TIMESTAMP, VARCHAR, INTEGER
- import pandas.api.types as pt
+ from teradataml.scriptmgmt.lls_utils import list_user_envs
+ from teradatasqlalchemy import BLOB, CLOB

- from teradataml import _TDML_DIRECTORY, Script, TeradataMlException, Apply
- from teradataml.dataframe.copy_to import _get_sqlalchemy_mapping
+ from teradataml import _TDML_DIRECTORY, Apply, Script, TeradataMlException
+ from teradataml.catalog.byom import delete_byom, retrieve_byom, save_byom
  from teradataml.common import pylogger
- from teradataml.common.utils import UtilFuncs
- from teradataml.context.context import _get_current_databasename, get_connection
- from teradataml.dbutils.filemgr import install_file, remove_file
- from teradataml.utils.utils import execute_sql
- from teradataml.options.configure import configure
- from teradataml.opensource._wrapper_utils import _validate_fit_run, _generate_new_name,\
- _validate_opensource_func_args, _derive_df_and_required_columns, _validate_df_query_type
- from teradataml.opensource.constants import OpenSourcePackage, _OSML_MODELS_PRIMARY_INDEX,\
- _OSML_MODELS_TABLE_NAME, _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT, OpensourceModels,\
- _OSML_ADDITIONAL_COLUMN_TYPES
+ from teradataml.common.constants import TeradataConstants
+ from teradataml.common.garbagecollector import GarbageCollector
  from teradataml.common.messagecodes import MessageCodes
  from teradataml.common.messages import Messages
- from teradataml.catalog.byom import save_byom, retrieve_byom, delete_byom
- from teradataml.dbutils.dbutils import _create_table, set_session_param
- from teradataml.utils.validators import _Validators
+ from teradataml.common.utils import UtilFuncs
+ from teradataml.common.warnings import OneTimeUserWarning
+ from teradataml.context.context import (_get_current_databasename,
+ get_connection)
  from teradataml.dataframe.dataframe import DataFrame
  from teradataml.dataframe.dataframe_utils import DataFrameUtils
- from teradataml.common.garbagecollector import GarbageCollector
- from teradataml.common.constants import TeradataConstants
-
+ from teradataml.dbutils.dbutils import (_create_table,
+ execute_sql, set_session_param)
+ from teradataml.dbutils.filemgr import install_file, remove_file
+ from teradataml.opensource._constants import (
+ _OSML_ADDITIONAL_COLUMN_TYPES, _OSML_MODELS_PRIMARY_INDEX,
+ _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT, _OSML_MODELS_TABLE_NAME,
+ OpensourceModels, OpenSourcePackage, _packages_verified_in_vantage)
+ from teradataml.opensource._wrapper_utils import (_generate_new_name,
+ _validate_df_query_type)
+ from teradataml.options.configure import configure
+ from teradataml.utils.validators import _Validators

  logger = pylogger.getLogger()

@@ -92,8 +88,15 @@ class _GenericObjectWrapper:
  self._env = configure.openml_user_env
  else:
  self._env = UtilFuncs._create_or_get_env("open_source_ml.json")
- else:
- set_session_param("searchuifdbpath",self._db_name)
+
+ # Check if the Python interpreter major versions are consistent between Vantage and local.
+ UtilFuncs._check_python_version_diff(self._env)
+
+ # Raise warning when python package versions don't match between Vantage and local.
+ # OPENSOURCE_PACKAGE_NAME is set for each opensource package, but not for the base class.
+ # Add a check to avoid running this function for the base class.
+ if self.OPENSOURCE_PACKAGE_NAME is not None:
+ UtilFuncs._check_package_version_diff(self.OPENSOURCE_PACKAGE_NAME.value, self._pkgs, self._env)

  global _file_installed
  ## Flag to check whether trained model is installed or not.
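
The new constructor logic above delegates the actual comparison to UtilFuncs._check_python_version_diff() and UtilFuncs._check_package_version_diff(), whose bodies are not part of this hunk. A minimal sketch of the kind of check they perform, using hypothetical helper names and stand-in inputs:

    # Illustrative sketch only -- the real logic lives in UtilFuncs._check_python_version_diff()
    # and UtilFuncs._check_package_version_diff() and is not shown in this hunk.
    import sys
    import warnings

    def check_python_version_diff(remote_python_version):
        # Compare interpreter major versions only, e.g. "3.10.5" -> 3.
        # (The real helper may treat a mismatch more strictly than a warning.)
        remote_major = int(remote_python_version.split(".")[0])
        if sys.version_info.major != remote_major:
            warnings.warn("Python major version differs: local=%s, Vantage=%s"
                          % (sys.version_info.major, remote_major))

    def check_package_version_diff(package, local_version, remote_packages):
        # Per the comment in the hunk above, a package version mismatch only raises a warning.
        remote_version = remote_packages.get(package)
        if remote_version and remote_version != local_version:
            warnings.warn("%s differs between client (%s) and Vantage (%s)"
                          % (package, local_version, remote_version))

    check_python_version_diff("3.10.12")
    check_package_version_diff("scikit-learn", "1.3.0", {"scikit-learn": "1.1.3"})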
@@ -295,7 +298,7 @@ class _GenericObjectWrapper:
  elif n_unique_partitions > 1:
  self.modelObj = pd.DataFrame(vals, columns=self._model_data.columns)
  else:
- ValueError("Number of partitions should be greater than 0.")
+ raise ValueError("Number of partitions should be greater than 0.")

  warnings.filterwarnings("default")

@@ -813,41 +816,56 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
  def fit(self, **kwargs):
  pass

- def _convert_arguments_to_modelObj(self, args, idx_multi_model=None):
+ def _convert_arguments_to_modelObj(self, args, partition_col_values=None):
  """
- Internal function to convert all OpensourceML related objects in arguments to
- underlying model objects.
+ Internal function to get appropriate model from <argument>.modelObj when multiple models are
+ generated by fit, based on partition_col_values. If partition_col_values is None, then it is
+ single model case.
  """
  if isinstance(args, dict):
  new_args = args.copy() # To avoid updating
  for k, v in new_args.items():
- if isinstance(v, type(self)):
- if idx_multi_model is not None:
- # single model. This argument is set only when modelObj is single model.
- new_args[k] = v.modelObj
- else:
+ if isinstance(v, _OpenSourceObjectWrapper):
+ arg_model_obj = v.modelObj
+ if isinstance(arg_model_obj, pd.DataFrame):
  # multi-model. Get appropriate model from modelObj.
- new_args[k] = v.modelObj.iloc[idx_multi_model]["model"]
- else:
- new_args[k] = v
+ arg_partition_values_model_dict = v._get_partition_columns_to_model_dict()
+ new_args[k] = arg_partition_values_model_dict[partition_col_values]
+ else:
+ # single model.
+ new_args[k] = arg_model_obj
  return new_args

- # If args is tuple, convert all elements to underlying model object.
- elif isinstance(args, tuple):
+ if isinstance(args, tuple):
  new_args = tuple()
  for arg in args:
  if isinstance(arg, type(self)):
- if idx_multi_model is None:
- # single model. This argument is set only when modelObj is single model.
- new_args += (arg.modelObj,)
- else:
+ arg_model_obj = arg.modelObj
+ if isinstance(arg_model_obj, pd.DataFrame):
  # multi-model. Get appropriate model from modelObj.
- new_args += (arg.modelObj.iloc[idx_multi_model]["model"],)
+ arg_partition_values_model_dict = arg._get_partition_columns_to_model_dict()
+ new_args += (arg_partition_values_model_dict[partition_col_values],)
+ else:
+ # single model.
+ new_args += (arg_model_obj,)
  else:
  new_args += (arg,)
  return new_args
  return args

+ def _get_partition_columns_to_model_dict(self):
+ """
+ Internal function to get partition columns to model dictionary.
+ """
+ partition_values_model_dict = {}
+ no_of_unique_partitions = len(self._fit_partition_unique_values)
+ no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
+
+ for i in range(no_of_unique_partitions):
+ partition_values_model_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
+
+ return partition_values_model_dict
+
  def __get_obj_attributes_multi_model(self, name):
  """
  Internal function to get attributes of all sklearn model objects when multiple models are
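
The new _get_partition_columns_to_model_dict() assumes modelObj is a pandas DataFrame whose leading columns hold the partition values used during fit and whose "model" column holds the estimator fitted for that partition. A small self-contained sketch of the same lookup, with strings standing in for fitted models:

    # Sketch of the partition-values -> model lookup built above; column names and the string
    # "models" are illustrative stand-ins, not teradataml's actual layout.
    import pandas as pd

    models = pd.DataFrame(
        {"region": ["north", "south"], "segment": [1, 2], "model": ["model_a", "model_b"]})

    n_partition_cols = 2                                  # number of partitioning columns
    partition_to_model = {}
    for i in range(len(models)):
        key = tuple(models.iloc[i, :n_partition_cols])    # e.g. ("north", 1)
        partition_to_model[key] = models.iloc[i]["model"]

    assert partition_to_model[("south", 2)] == "model_b"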
@@ -873,12 +891,17 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):

  # Wrapper function to invoke dynamic method, using arguments
  # passed by user, on model in each row.
- def __sklearn_method_invoker_for_multimodel(*c, **kwargs):
+ def __opensource_method_invoker_for_multimodel(*c, **kwargs):
+ """
+ Internal function to run functions not taking data related arguments but taking
+ arguments, which might contain other model objects.
+ """
  multi_models = self.modelObj.copy()
  for i in range(multi_models.shape[0]):
  curr_model = multi_models.iloc[i]["model"]
- partition_values = multi_models.iloc[i][0:len(self._fit_partition_colums_non_default)].to_list()
- partition_values = "_".join([str(x) for x in partition_values])
+ partition_values = tuple(multi_models.iloc[i][0:len(self._fit_partition_colums_non_default)].to_list())
+
+ partition_values_joined = "_".join([str(x) for x in partition_values])
  if self.module_name == "lightgbm.basic" and self.class_name == "Booster" and name == "save_model":
  # filename is first argument.
  kwargs1 = kwargs.copy()
@@ -886,17 +909,19 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):

  if len(c) > 0:
  c1 = list(c1)
- c1[0] = f"{c1[0]}_{partition_values}"
+ c1[0] = f"{c1[0]}_{partition_values_joined}"
  c1 = tuple(c1)
  if len(kwargs) > 0 and kwargs.get("filename", None):
- kwargs1["filename"] = f"{kwargs1['filename']}_{partition_values}"
+ kwargs1["filename"] = f"{kwargs1['filename']}_{partition_values_joined}"

- multi_models.at[i, "model"] = getattr(curr_model, name)(*self._convert_arguments_to_modelObj(c1, i),
- **self._convert_arguments_to_modelObj(kwargs1, i))
+ pos_args = self._convert_arguments_to_modelObj(c1, partition_values)
+ key_args = self._convert_arguments_to_modelObj(kwargs1, partition_values)
  else:
- multi_models.at[i, "model"] = getattr(curr_model, name)(*self._convert_arguments_to_modelObj(c, i),
- **self._convert_arguments_to_modelObj(kwargs, i))
-
+ pos_args = self._convert_arguments_to_modelObj(c, partition_values)
+ key_args = self._convert_arguments_to_modelObj(kwargs, partition_values)
+
+ multi_models.at[i, "model"] = getattr(curr_model, name)(*pos_args, **key_args)
+
  first_function_value = multi_models.at[0, "model"]
  if self.__class__._validate_model_supportability(first_function_value):
  return __generate_model_object(multi_models, init_model_obj=first_function_value)
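
The renamed __opensource_method_invoker_for_multimodel above resolves a method by name on the model stored in each row and writes the result back into that row. A toy sketch of that per-row dispatch, with a plain Python class standing in for a fitted sklearn or lightgbm model:

    # Illustrative per-partition dispatch: getattr() looks the method up by name on each
    # row's model object, exactly as in the loop above (argument conversion omitted).
    import pandas as pd

    class Toy:
        def __init__(self, n):
            self.n = n
        def double(self):
            return Toy(self.n * 2)

    multi_models = pd.DataFrame({"part": [1, 2], "model": [Toy(1), Toy(2)]})

    name = "double"
    for i in range(multi_models.shape[0]):
        curr_model = multi_models.iloc[i]["model"]
        multi_models.at[i, "model"] = getattr(curr_model, name)()

    assert [m.n for m in multi_models["model"]] == [2, 4]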
@@ -914,7 +939,7 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
  # If first_atrribute_instance is callable, it should be applied on model in each row
  # using passed arguments.
  if callable(first_atrribute_instance):
- return __sklearn_method_invoker_for_multimodel
+ return __opensource_method_invoker_for_multimodel

  output_attributes = self.modelObj.copy()
  for i in range(output_attributes.shape[0]):
@@ -928,7 +953,7 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):

  def __getattr__(self, name):
  # This just run attributes (functions and properties) from opensource (sklearn/lightgbm) objects.
- def __sklearn_method_invoker(*c, **kwargs):
+ def __opensource_method_invoker(*c, **kwargs):
  # Opensource model is returned from the function call. Create _OpensourceObjectWrapper object.
  model_obj = attribute_instance(*self._convert_arguments_to_modelObj(c), **self._convert_arguments_to_modelObj(kwargs))
  if self.__class__._validate_model_supportability(model_obj):
@@ -942,7 +967,7 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
  attribute_instance = getattr(self.modelObj, name)

  if callable(attribute_instance):
- return __sklearn_method_invoker
+ return __opensource_method_invoker

  if self.__class__._validate_model_supportability(attribute_instance):
  # sklearn model is returned from the attribute. Create _SkLearnObjectWrapper object.
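
Both hunks above sit inside __getattr__, which is how the wrapper forwards arbitrary sklearn/lightgbm attributes: callables come back wrapped in an invoker, plain attributes are returned directly. A stripped-down sketch of that delegation pattern (argument conversion and re-wrapping of returned models are omitted):

    # Minimal sketch of __getattr__ delegation; Wrapper and the list stand-in are illustrative.
    class Wrapper:
        def __init__(self, model):
            self._model = model

        def __getattr__(self, name):
            attribute_instance = getattr(self._model, name)
            if callable(attribute_instance):
                def _invoker(*args, **kwargs):
                    return attribute_instance(*args, **kwargs)
                return _invoker
            return attribute_instance

    w = Wrapper([3, 1, 2])      # a plain list stands in for a fitted estimator
    w.sort()                    # resolved via __getattr__, runs on the wrapped object
    assert w._model == [1, 2, 3]
    assert w.count(2) == 1      # another delegated call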
@@ -1003,7 +1028,9 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
  fit_partition_columns_non_default=self._fit_partition_colums_non_default,
  model=self.modelObj,
  pos_args=self.pos_args,
- key_args=self.kwargs)
+ key_args=self.kwargs,
+ osml_class=self.__class__.__name__,
+ osml_module=self.__module__)

  # Saved the model object to a file to be used in save_byom() for writing to Vantage table.
  file_name = os.path.join(self._tdml_tmp_dir, "deployed_file.pickle")
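
The deployed object now carries osml_class and osml_module so that the load path, shown in the following hunks, can re-create the correct wrapper class with import_module()/getattr() even when loading is started from a different class. A sketch of that round trip, with illustrative names rather than teradataml's actual pickle payload:

    # Illustrative only: the payload keys mirror the new fields, but the values are stand-ins
    # (OrderedDict plays the role of the wrapper class to keep the sketch self-contained).
    import pickle
    from importlib import import_module

    payload = {
        "model": {"coef_": [0.5, 1.5]},   # stand-in for the fitted estimator
        "osml_module": "collections",      # module defining the class to re-instantiate
        "osml_class": "OrderedDict",       # class name recorded at deploy time
    }
    blob = pickle.dumps(payload)

    restored = pickle.loads(blob)
    wrapper_cls = getattr(import_module(restored["osml_module"]), restored["osml_class"])
    obj = wrapper_cls(restored["model"])   # same resolution step as the load hunks below
    assert obj["coef_"] == [0.5, 1.5]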
@@ -1048,7 +1075,7 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
  cls = cls(model=model)
  # Load the model file into Vantage node as file can be used in
  # predict or other operations.
- cls._install_initial_model_file()
+ cls._install_initial_model_file(False)

  cls._save_model(model_name, replace_if_exists)

@@ -1079,9 +1106,16 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
  # - 2nd contains package name.
  model_obj = pickle.loads(model_vals_list[0])
  model = model_obj.model
+ osml_module = model_obj.osml_module if hasattr(model_obj, "osml_module") else None
+ osml_class = model_obj.osml_class if hasattr(model_obj, "osml_class") else None
+
+ new_cls = cls
+ if osml_module is not None and osml_class is not None:
+ new_cls = getattr(import_module(osml_module), osml_class)
+
  package = model_vals_list[1]

- if package != cls.OPENSOURCE_PACKAGE_NAME.value:
+ if package != new_cls.OPENSOURCE_PACKAGE_NAME.value:
  # Raise error if trying to access model of different package.
  raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_NOT_FOUND, model_name,
  f". Requested model is from '{package}' package"),
@@ -1091,23 +1125,24 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
  # Create a new instance of the class and set the model object to the instance.
  # Instantiation can take only model, not model object. Hence, passing one of the model
  # from pandas df. Updating modelObj and other fields later
- cls = cls(model=model.iloc[1,2])
- cls.modelObj = model
- cls._fit_partition_unique_values = [lst[:len(lst)-1] for lst in model.values.tolist()]
+ new_cls = new_cls(model=model.iloc[1,2])
+ new_cls.modelObj = model
+ new_cls._fit_partition_unique_values = [lst[:len(model_obj.fit_partition_columns_non_default)]
+ for lst in model.values.tolist()]
  else:
- cls = cls(model=model)
+ new_cls = new_cls(model=model)

- cls._model_file_name_prefix = model_obj.partition_file_prefix
- cls._is_default_partition_value_fit = model_obj.is_default_partition_value
- cls._fit_partition_colums_non_default = model_obj.fit_partition_columns_non_default
- cls.pos_args = model_obj.pos_args
- cls.kwargs = model_obj.key_args
+ new_cls._model_file_name_prefix = model_obj.partition_file_prefix
+ new_cls._is_default_partition_value_fit = model_obj.is_default_partition_value
+ new_cls._fit_partition_colums_non_default = model_obj.fit_partition_columns_non_default
+ new_cls.pos_args = model_obj.pos_args
+ new_cls.kwargs = model_obj.key_args

  # Load the model file into Vantage node as file can be used in
  # predict or other operations.
- cls._install_initial_model_file()
+ new_cls._install_initial_model_file(False)

- return cls
+ return new_cls

  def deploy(self, model_name, replace_if_exists=False):
  """
@@ -1136,962 +1171,89 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
1136
1171
  "replace_if_exists" is set to False.
1137
1172
 
1138
1173
  EXAMPLES:
1174
+ ## sklearn examples.
1175
+
1176
+ # Import the required libraries and create LinearRegression Opensource object wrapper.
1139
1177
  >>> from teradataml import td_sklearn
1140
1178
  >>> model = td_sklearn.LinearRegression(normalize=True)
1141
1179
  >>> model
1142
1180
  LinearRegression(normalize=True)
1143
1181
 
1144
- # Example 1: Deploy the model held by interface object to Vantage.
1182
+ # Example 1: Deploy the model held by LinearRegression Opensource object to Vantage.
1145
1183
  >>> lin_reg = model.deploy("linreg_model_ver_2")
1146
1184
  Model is saved.
1147
1185
  >>> lin_reg
1148
1186
  LinearRegression(normalize=True)
1149
1187
 
1150
- # Example 2: Deploy the model held by interface object to Vantage with the name same
1151
- # as that of model that already existed in Vantage.
1188
+ # Example 2: Deploy the model held by LinearRegression Opensource object to Vantage
1189
+ # with the name same as that of model that already existed in Vantage.
1152
1190
  >>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
1153
1191
  Model is deleted.
1154
1192
  Model is saved.
1155
1193
  >>> lin_reg
1156
1194
  LinearRegression(normalize=True)
1157
- """
1158
-
1159
- # Install model file into Vantage, if not installed.
1160
- self._install_initial_model_file()
1161
-
1162
- self._save_model(model_name, replace_if_exists)
1163
- return self
1164
-
1165
1195
 
1166
- class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
1196
+ ## lightgbm examples.
1167
1197
 
1168
- OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
1198
+ # For lightGBM, there are two types of models created by `td_lightgbm` interface object.
1199
+ # - the model object created using LGBMClassifier or other class of lightgbm.sklearn module.
1200
+ # - the model object created using train() method (object of lightgbm.Booster class)
1201
+ # or standalone object of lightgbm.Booster class.
1169
1202
 
1170
- def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
1171
- super().__init__(model=model, module_name=module_name, class_name=class_name,
1172
- pos_args=pos_args, kwargs=kwargs)
1173
-
1174
- self._initialize_variables(table_name_prefix="td_sklearn_")
1175
- if model is not None:
1176
- self.modelObj = model
1177
- self.module_name = model.__module__.split("._")[0]
1178
- self.class_name = model.__class__.__name__
1179
- # __dict__ gets all the arguments as dictionary including default ones and positional
1180
- # args.
1181
- self.kwargs = model.__dict__
1182
- self.pos_args = tuple() # Kept empty as all are moved to kwargs.
1183
- else:
1184
- self._initialize_object()
1203
+ # Import the required libraries and create LGBMClassifier Opensource object wrapper.
1204
+ >>> from teradataml import td_lightgbm
1205
+ >>> model = td_lightgbm.LGBMClassifier()
1206
+ >>> model
1207
+ LGBMClassifier()
1185
1208
 
1186
- def _validate_args_and_get_data(self, X=None, y=None, groups=None, kwargs={},
1187
- skip_either_or_that=False):
1188
- """
1189
- Internal function to validate arguments passed to exposed opensource APIs and return
1190
- parent DataFrame, feature columns, label columns, group columns, data partition columns.
1191
- """
1192
- _validate_opensource_func_args(X=X, y=y, groups=groups,
1193
- fit_partition_cols=self._fit_partition_colums_non_default,
1194
- kwargs=kwargs,
1195
- skip_either_or_that=skip_either_or_that)
1196
- return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
1197
- fit_partition_cols=self._fit_partition_colums_non_default)
1209
+ # Example 1: Deploy the model held by LGBMClassifier Opensource object to Vantage.
1210
+ >>> lgbm_cls = model.deploy("lgbm_cls_model_ver_2")
1211
+ Model is saved.
1212
+ >>> lgbm_cls
1213
+ LGBMClassifier()
1198
1214
 
1199
- def _run_fit_related_functions(self,
1200
- data,
1201
- feature_columns,
1202
- label_columns,
1203
- partition_columns,
1204
- func,
1205
- classes=None,
1206
- file_name="sklearn_fit.py"):
1207
- """
1208
- Internal function to run fit() and partial_fit() functions.
1215
+ # Example 2: Deploy the model held by LGBMClassifier Opensource object to Vantage with
1216
+ # the name same as that of model that already existed in Vantage.
1217
+ >>> lgbm_cls = model.deploy("lgbm_cls_model_ver_2", replace_if_exists=True)
1218
+ Model is deleted.
1219
+ Model is saved.
1220
+ >>> lgbm_cls
1221
+ LGBMClassifier()
1222
+
1223
+ # Example 3: Deploy the model trained using td_lightgbm.train() function to Vantage.
1224
+ # Create Dataset object, assuming df_x and df_y are the feature and label teradataml
1225
+ # DataFrames.
1226
+ >>> lgbm_data = td_lightgbm.Dataset(data=df_x, label=df_y, free_raw_data=False)
1227
+ >>> lgbm_data
1228
+ <lightgbm.basic.Dataset object at ....>
1229
+
1230
+ # Train the model using `td_lightgbm` interface object.
1231
+ >>> model = td_lightgbm.train(params={}, train_set=lgbm_data, num_boost_round=30, valid_sets=[lgbm_data])
1232
+ [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000043 seconds.
1233
+ You can set `force_row_wise=true` to remove the overhead.
1234
+ And if memory is not enough, you can set `force_col_wise=true`.
1235
+ [LightGBM] [Info] Total Bins 532
1236
+ [LightGBM] [Info] Number of data points in the train set: 400, number of used features: 4
1237
+ [1] valid_0's l2: 0.215811
1238
+ [2] valid_0's l2: 0.188138
1239
+ [3] valid_0's l2: 0.166146
1240
+ ...
1241
+ ...
1242
+ [29] valid_0's l2: 0.042255
1243
+ [30] valid_0's l2: 0.0416953
1244
+
1245
+ # Deploy the model to Vantage.
1246
+ >>> lgb_model = model.deploy("lgbm_train_model_ver_2")
1247
+ >>> lgb_model
1248
+ <lightgbm.basic.Booster object at ...>
1209
1249
  """
1210
- label_columns = self._get_columns_as_list(label_columns)
1211
-
1212
- data, new_partition_columns = self._get_data_and_data_partition_columns(data,
1213
- feature_columns,
1214
- label_columns,
1215
- partition_columns)
1216
-
1217
- model_type = BLOB() if self._is_lake_system else CLOB()
1218
- return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
1219
- for col in new_partition_columns] + [("model", model_type)]
1220
-
1221
- if classes:
1222
- class_type = type(classes[0]).__name__
1223
- classes = "--".join([str(x) for x in classes])
1224
- else:
1225
- classes = str(None)
1226
- class_type = str(None)
1227
-
1228
- data_column_types_str, partition_indices_str, _, new_partition_columns = \
1229
- self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
1230
-
1231
- # db_name is applicable for enterprise system.
1232
- db_file_name = file_name if self._is_lake_system else f"./{self._db_name}/{file_name}"
1233
- py_exc = UtilFuncs._get_python_execution_path()
1234
- script_command = f"{py_exc} {db_file_name} {func} {len(feature_columns)} "\
1235
- f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
1236
- f"{self._model_file_name_prefix} {classes} {class_type} {self._is_lake_system}"
1237
-
1238
- # Get unique values in partitioning columns.
1239
- self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
1240
1250
 
1251
+ # Install model file into Vantage, if not installed.
1241
1252
  self._install_initial_model_file()
1242
1253
 
1243
- self._model_data = self._run_script(data, script_command, new_partition_columns,
1244
- return_types)
1245
-
1246
- self._assign_fit_variables_after_execution(data, new_partition_columns, label_columns)
1247
-
1248
- def partial_fit(self, X=None, y=None, classes=None, **kwargs):
1249
- """
1250
- Please check the description in Docs/OpensourceML/sklearn.py.
1251
- """
1252
- st_time = time.time()
1253
-
1254
- # "classes" argument validation.
1255
- arg_info_matrix = []
1256
- arg_info_matrix.append(["classes", classes, True, (list)])
1257
- _Validators._validate_function_arguments(arg_info_matrix)
1258
-
1259
- self._is_default_partition_value_fit = True # False when the user provides partition columns.
1260
-
1261
- data, feature_columns, label_columns, _, partition_columns = \
1262
- self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
1263
-
1264
- if partition_columns:
1265
- self._is_default_partition_value_fit = False
1266
- self._fit_partition_colums_non_default = partition_columns
1267
-
1268
- self._run_fit_related_functions(data,
1269
- feature_columns,
1270
- label_columns,
1271
- partition_columns,
1272
- inspect.stack()[0][3],
1273
- classes)
1274
-
1275
- self._partial_fit_execution_time = time.time() - st_time
1276
-
1277
- return self
1278
-
1279
- def fit(self, X=None, y=None, **kwargs):
1280
- """
1281
- Please check the description in Docs/OpensourceML/sklearn.py.
1282
- """
1283
- st_time = time.time()
1284
-
1285
- self._is_default_partition_value_fit = True # False when the user provides partition columns.
1286
-
1287
- data, feature_columns, label_columns, _, partition_columns = \
1288
- self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
1289
-
1290
- if partition_columns:
1291
- self._is_default_partition_value_fit = False
1292
- self._fit_partition_colums_non_default = partition_columns
1293
-
1294
- file_name = kwargs.pop("file_name", None)
1295
- func_name = kwargs.pop("name", "fit")
1296
-
1297
- args = {"data": data,
1298
- "feature_columns": feature_columns,
1299
- "label_columns": label_columns,
1300
- "partition_columns": partition_columns,
1301
- "func": func_name}
1302
-
1303
- if file_name is not None:
1304
- args["file_name"] = file_name
1305
-
1306
- self._run_fit_related_functions(**args)
1307
-
1308
- self._fit_execution_time = time.time() - st_time
1309
-
1310
- return self
1311
-
1312
- def set_params(self, **params):
1313
- """
1314
- Please check the description in Docs/OpensourceML/sklearn.py.
1315
- """
1316
- for key, val in params.items():
1317
- self.kwargs[key] = val
1318
-
1319
- # Initialize with new arguments and return the class/model object.
1320
- # set_params takes all keyword arguments and no positional arguments.
1321
- self.__init__(None, self.module_name, self.class_name, tuple(), self.kwargs)
1254
+ self._save_model(model_name, replace_if_exists)
1322
1255
  return self
1323
1256
 
1324
- # get_params() will be executed through __getattr__().
1325
-
1326
- # @_validate_fit_run
1327
- def __getattr__(self, name):
1328
- def __run_transform(*c, **kwargs):
1329
- kwargs["name"] = name
1330
- return self._transform(*c, **kwargs)
1331
-
1332
- def __run_function_needing_all_rows(*c, **kwargs):
1333
- kwargs["name"] = name
1334
- return self._run_function_needing_all_rows(*c, **kwargs)
1335
-
1336
- def __run_kneighbors(*c, **kwargs):
1337
- kwargs["name"] = name
1338
- return self._run_neighbors(*c, **kwargs)
1339
-
1340
- if name in ["score", "aic", "bic", "perplexity"]:
1341
- # TODO: ELE-6352 - Implement error_norm() function later.
1342
- return __run_function_needing_all_rows
1343
-
1344
- if name in ["kneighbors",
1345
- "radius_neighbors",
1346
- "kneighbors_graph",
1347
- "radius_neighbors_graph"]:
1348
- return __run_kneighbors
1349
-
1350
- if name in ["predict",
1351
- "transform",
1352
- "inverse_transform",
1353
- "predict_proba",
1354
- "predict_log_proba",
1355
- "decision_function",
1356
- "score_samples",
1357
- "decision_path",
1358
- "apply",
1359
- "cost_complexity_pruning_path",
1360
- "gibbs",
1361
- "kneighbors_graph",
1362
- "radius_neighbors_graph",
1363
- "mahalanobis",
1364
- "correct_covariance",
1365
- "reweight_covariance",
1366
- "path"]:
1367
- return __run_transform
1368
-
1369
- return super().__getattr__(name)
1370
-
1371
- def _special_handling_multimodel_(self, data, feature_columns, label_columns, partition_columns,
1372
- func_name, **kwargs):
1373
- """
1374
- Internal function to handle multi model case for transform function for functions
1375
- ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV"] of feature_selection module
1376
- and "Birch" of cluster module.
1377
- These functions generate multiple models and when transform is applied to each model, it generates
1378
- output with different number of columns.
1379
- """
1380
- skl_objs_dict = {}
1381
- no_of_unique_partitions = len(self._fit_partition_unique_values)
1382
- no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
1383
-
1384
- # Run on 10 rows of data individually using corresponding scikit-learn objects based on paritition value
1385
- # and get the maximum number of columns and their types.
1386
- for i in range(no_of_unique_partitions):
1387
- skl_objs_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
1388
-
1389
-
1390
- data = data.select(feature_columns + label_columns + partition_columns)
1391
- ten_row_data = data.head(10).get_values()
1392
- X = numpy.array(ten_row_data)
1393
-
1394
- # For multi-model case, model in one AMP can give more number of columns than other AMPs.
1395
- # Returns clause can't contain different number of columns in different AMPs. Hence, taking
1396
- # maximum number of columns and their types from all models.
1397
- max_no_of_columns = 0
1398
- max_col_names = []
1399
- max_col_types = []
1400
-
1401
- def _get_input_row_without_nans(row):
1402
- """
1403
- `inverse_transform` should not contain NaNs. Hence, removing NaNs from the row.
1404
- """
1405
- X1 = []
1406
- for _, v in enumerate(row):
1407
- if isinstance(v, type(None)) or isinstance(v, str) or not math.isnan(v) or self.module_name == "sklearn.impute":
1408
- # Add to list when:
1409
- # - v is None or
1410
- # - v is string or
1411
- # - v is not nan or
1412
- # - if module is impute (which transforms nan values) even though v is nan.
1413
- X1.append(v)
1414
- else:
1415
- # skip nan values.
1416
- pass
1417
- return X1
1418
-
1419
- for i in range(X.shape[0]):
1420
- # Run `transform` or `inverse_transform` on each row with corresponding scikit-learn model object.
1421
- partition_values = tuple(X[i, -no_of_partitioning_cols:])
1422
- skl_obj = skl_objs_dict[partition_values]
1423
-
1424
- X1 = X[i, :-no_of_partitioning_cols]
1425
- # Since Nans/NULLs are added in transform for last columns where some models generated
1426
- # less number of columns, removing Nans/NULLs from the input row for inverse_transform
1427
- # using function _get_input_row_without_nans().
1428
- X1 = numpy.array([_get_input_row_without_nans(X1)])
1429
-
1430
- trans_opt = getattr(skl_obj, func_name)(X1, **kwargs)
1431
-
1432
- no_of_columns = 1
1433
-
1434
- if trans_opt.shape == (X1.shape[0],):
1435
- trans_opt = trans_opt.reshape(X1.shape[0], 1)
1436
-
1437
- if isinstance(trans_opt[0], numpy.ndarray) \
1438
- or isinstance(trans_opt[0], list) \
1439
- or isinstance(trans_opt[0], tuple):
1440
- no_of_columns = len(trans_opt[0])
1441
-
1442
- col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
1443
-
1444
- # Get new column sqlalchemy types for pandas df columns of transform output.
1445
- opt_pd = pd.DataFrame(trans_opt)
1446
-
1447
- # Get output column types for each column in pandas df from the output of transform
1448
- # type functions.
1449
- types = {}
1450
- for idx in range(no_of_columns):
1451
- col = list(opt_pd.columns)[idx]
1452
-
1453
- # Only one row in trans_opt.
1454
- if isinstance(trans_opt[0], numpy.ndarray) or isinstance(trans_opt[0], tuple) or isinstance(trans_opt[0], list):
1455
- type_ = type(trans_opt[0][idx])
1456
- else:
1457
- # only one value in the output.
1458
- type_ = type(trans_opt[0])
1459
-
1460
- # If type of the output value (trans_opt) is None, then use `str` as type since
1461
- # pandas astype() does not accept None type.
1462
- if type_ is type(None):
1463
- type_ = str
1464
-
1465
- # numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
1466
- # It raises error like "Cannot convert non-finite values (NA or inf) to integer:
1467
- # Error while type casting for column '2'"
1468
- # Hence, using pd.Int64Dtype() for integer columns with nan values.
1469
- types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
1470
-
1471
- # Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
1472
- opt_pd = opt_pd.astype(types)
1473
-
1474
- # If the datatype is not specified then check if the datatype is datetime64 and timezone is present then map it to
1475
- # TIMESTAMP(timezone=True) else map it according to default value.
1476
- col_types = [TIMESTAMP(timezone=True)
1477
- if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
1478
- else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
1479
- for key, col_name in enumerate(list(opt_pd.columns))]
1480
-
1481
- # Different models in multi model case can generate different number of output columns for example in
1482
- # SelectFpr. Hence, taking the model which generates maximum number of columns.
1483
- if no_of_columns > max_no_of_columns:
1484
- max_no_of_columns = no_of_columns
1485
- max_col_names = col_names
1486
- max_col_types = col_types
1487
-
1488
- return [(c_name, c_type) for c_name, c_type in zip(max_col_names, max_col_types)]
1489
-
1490
- def _get_return_columns_for_function_(self,
1491
- data,
1492
- feature_columns,
1493
- label_columns,
1494
- partition_columns,
1495
- func_name,
1496
- kwargs):
1497
- """
1498
- Internal function to return list of column names and their sqlalchemy types
1499
- which should be used in return_types of Script.
1500
- """
1501
- if func_name == "fit_predict":
1502
- """
1503
- Get return columns using label_columns.
1504
- """
1505
- return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
1506
- data._td_column_names_and_sqlalchemy_types[col.lower()])
1507
- for i, col in enumerate(label_columns)]
1508
-
1509
- if func_name == "predict" and self.OPENSOURCE_PACKAGE_NAME == OpenSourcePackage.SKLEARN:
1510
- """
1511
- Return predict columns using either label_columns (if provided) or
1512
- self._fit_label_columns_types (if the function is trained using label columns).
1513
- Otherwise run predict on ten rows of data to get the number of columns and their types
1514
- after this if condition.
1515
- """
1516
- if label_columns:
1517
- return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
1518
- data._td_column_names_and_sqlalchemy_types[col.lower()])
1519
- for i, col in enumerate(label_columns)]
1520
- if self._fit_label_columns_types:
1521
- return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}", col_type)
1522
- for i, col_type in enumerate(self._fit_label_columns_types)]
1523
-
1524
- ## If function is not `fit_predict`:
1525
- # then take one row of transform/other functions to execute in client
1526
- # to get number of columns in return clause and their Vantage types.
1527
- n_f = len(feature_columns)
1528
- n_c = len(label_columns)
1529
-
1530
- # For paritioning columns, it will be a dataframe and getattr(modelObj, func_name) fails.
1531
- # Just for getting the number of columns and their types, using only one model of all.
1532
- if len(self._fit_partition_unique_values) == 1:
1533
- # Single model case.
1534
- skl_obj = self.modelObj
1535
- else:
1536
- # Multi model case.
1537
- if (func_name in ["transform", "inverse_transform"] and \
1538
- self.class_name in ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV", "Birch"]) or \
1539
- (self.module_name == "lightgbm.sklearn" and self.class_name == "LGBMClassifier"):
1540
- # Special handling for multi model case for transform function as these classes
1541
- # generate transform output with different number of columns for each model.
1542
- # Hence, need to add Nulls/Nans to columns which are not present in the transform output of
1543
- # some models.
1544
- return self._special_handling_multimodel_(data, feature_columns, label_columns,
1545
- partition_columns, func_name, **kwargs)
1546
-
1547
- skl_obj = self.modelObj.iloc[0]["model"]
1548
-
1549
- data = data.select(feature_columns + label_columns)
1550
-
1551
- ten_row_data = data.head(10).get_values()
1552
- X = numpy.array(ten_row_data)
1553
- if label_columns:
1554
- y = X[:,n_f : n_f + n_c]
1555
- X = X[:,:n_f]
1556
- # predict() now takes 'y' also for it to return the labels from script. Skipping 'y'
1557
- # in local run if passed. Generally, 'y' is passed to return y along with actual output.
1558
- try:
1559
- trans_opt = getattr(skl_obj, func_name)(X, y, **kwargs)
1560
- except TypeError as ex:
1561
- # Function which does not accept 'y' like predict_proba() raises error like
1562
- # "predict_proba() takes 2 positional arguments but 3 were given".
1563
- trans_opt = getattr(skl_obj, func_name)(X, **kwargs)
1564
- else:
1565
- trans_opt = getattr(skl_obj, func_name)(X, **kwargs)
1566
-
1567
- if func_name == "path":
1568
- raise NotImplementedError(
1569
- "path() returns tuple of ndarrays of different shapes. Not Implemented yet."
1570
- )
1571
-
1572
- if isinstance(trans_opt, numpy.ndarray) and trans_opt.shape == (X.shape[0],):
1573
- trans_opt = trans_opt.reshape(X.shape[0], 1)
1574
-
1575
- if type(trans_opt).__name__ in ["csr_matrix", "csc_matrix"]:
1576
- no_of_columns = trans_opt.get_shape()[1]
1577
- trans_opt = trans_opt.toarray()
1578
- elif isinstance(trans_opt, dict):
1579
- raise NotImplementedError(f"Output returns dictionary {trans_opt}. NOT implemented yet.")
1580
- elif isinstance(trans_opt[0], numpy.ndarray) \
1581
- or isinstance(trans_opt[0], list) \
1582
- or isinstance(trans_opt[0], tuple):
1583
- no_of_columns = len(trans_opt[0])
1584
- else:
1585
- no_of_columns = 1
1586
-
1587
- # Special handling when inverse_transform of no_of_columns returns no of rows
1588
- # less than the no of classes. Such columns are filled with NaN values.
1589
- # Updating number of columns here (new columns with NaN values will be added).
1590
- if func_name == "inverse_transform" and self.class_name == "MultiLabelBinarizer":
1591
- no_of_columns = len(self.classes_)
1592
- for i in range(len(ten_row_data)):
1593
- trans_opt[i] += tuple([numpy.nan] * (no_of_columns - len(trans_opt[i])))
1594
-
1595
- # Special handling required for cross_decomposition classes's transform function, which
1596
- # takes label columns also. In this case, output is a tuple of numpy arrays - x_scores and
1597
- # y_scores. If label columns are not provided, only x_scores are returned.
1598
- if self.module_name == "sklearn.cross_decomposition" and func_name == "transform":
1599
- # For cross_decomposition, output is a tuple of arrays when label columns are provided
1600
- # along with feature columns for transform function. In this case, concatenate the
1601
- # arrays and return the column names accordingly.
1602
- if isinstance(trans_opt, tuple): # tuple when label_columns is provided.
1603
- assert trans_opt[0].shape == trans_opt[1].shape,\
1604
- "Output arrays should be of same shape when transform/fit_transform is run "\
1605
- "with label columns for cross_decomposition classes.."
1606
- first_cols = [f"x_scores_{(i + 1)}" for i in range(trans_opt[0].shape[1])]
1607
- second_cols = [f"y_scores_{(i + 1)}" for i in range(trans_opt[1].shape[1])]
1608
- no_of_columns = trans_opt[0].shape[1] + trans_opt[1].shape[1]
1609
- col_names = first_cols + second_cols
1610
-
1611
- trans_opt = numpy.concatenate(trans_opt, axis=1)
1612
- else:
1613
- assert isinstance(trans_opt, numpy.ndarray), "When transform/fit_transform is run "\
1614
- "without label columns for cross_decomposition classes, "\
1615
- "output should be a numpy array."
1616
- no_of_columns = trans_opt.shape[1]
1617
- col_names =[f"x_scores_{(i + 1)}" for i in range(trans_opt.shape[1])]
1618
- else:
1619
- # Generate list of new column names.
1620
- col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
1621
-
1622
- # Get new column sqlalchemy types for pandas df columns of transform output.
1623
- opt_pd = pd.DataFrame(trans_opt)
1624
-
1625
- # Get output column types for each column in pandas df from the output of transform
1626
- # type functions.
1627
- types = {}
1628
- for idx, col in enumerate(list(opt_pd.columns)):
1629
- # Get type of column using data from all rows, in case if the column has None values.
1630
- # 'and' of types of all values in the column with type(None) gives the type of the column.
1631
- type_ = type(None)
1632
- for i in range(len(trans_opt)):
1633
- type_ = type_ and type(trans_opt[i][idx])
1634
-
1635
- # If all the values of the output (trans_opt) is None, thelen use `str` as type since
1636
- # pandas astype() does not accept None type.
1637
- if type_ is type(None):
1638
- type_ = str
1639
-
1640
- # numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
1641
- # It raises error like "Cannot convert non-finite values (NA or inf) to integer:
1642
- # Error while type casting for column '2'"
1643
- # Hence, using pd.Int64Dtype() for integer columns with nan values.
1644
- types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
1645
-
1646
- # Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
1647
- opt_pd = opt_pd.astype(types)
1648
-
1649
- # If the datatype is not specified then check if the datatype is datetime64 and timezone is present then map it to
1650
- # TIMESTAMP(timezone=True) else map it according to default value.
1651
- col_types = [TIMESTAMP(timezone=True)
1652
- if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
1653
- else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
1654
- for key, col_name in enumerate(list(opt_pd.columns))]
1655
-
1656
- return [(c_name, c_type) for c_name, c_type in zip(col_names, col_types)]
1657
-
1658
- @_validate_fit_run
1659
- def _run_function_needing_all_rows(self, X=None, y=None, file_name="sklearn_score.py", **kwargs):
1660
- """
1661
- Internal function to run functions like score, aic, bic which needs all rows and return
1662
- one floating number as result.
1663
- """
1664
- st_time = time.time()
1665
-
1666
- assert kwargs["name"], "function name should be passed."
1667
- func_name = kwargs["name"]
1668
-
1669
- # Remove 'name' to pass other kwargs to script. TODO: Not passing it now.
1670
- kwargs.pop("name")
1671
-
1672
- data, feature_columns, label_columns, _, partition_columns = \
1673
- self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
1674
-
1675
- label_columns = self._get_columns_as_list(label_columns)
1676
-
1677
- data, new_partition_columns = self._get_data_and_data_partition_columns(data,
1678
- feature_columns,
1679
- label_columns,
1680
- partition_columns)
1681
-
1682
- script_file_path = f"{file_name}" if self._is_lake_system \
1683
- else f"./{self._db_name}/{file_name}"
1684
-
1685
- data_column_types_str, partition_indices_str, _, new_partition_columns = \
1686
- self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
1687
-
1688
- self._validate_unique_partition_values(data, new_partition_columns)
1689
-
1690
- py_exc = UtilFuncs._get_python_execution_path()
1691
- script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
1692
- f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
1693
- f"{self._model_file_name_prefix} {self._is_lake_system}"
1694
-
1695
- # score, aic, bic returns float values.
1696
- return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
1697
- for col in new_partition_columns] + [(func_name, FLOAT())]
1698
-
1699
- # Checking the trained model installation. If not installed,
1700
- # install it and set flag to True.
1701
- if not self._is_trained_model_installed:
1702
- self._install_initial_model_file()
1703
- self._is_trained_model_installed = True
1704
-
1705
- opt = self._run_script(data, script_command, new_partition_columns, return_types)
1706
-
1707
- self._score_execution_time = time.time() - st_time
1708
-
1709
- if self._is_default_partition_value_fit:
1710
- # For single model case, partition column is internally generated and
1711
- # no point in returning it to the user.
1712
- return opt.select(func_name)
1713
-
1714
- return opt
1715
-
-    @_validate_fit_run
-    def _transform(self, X=None, y=None, file_name="sklearn_transform.py", **kwargs):
-        """
-        Internal function to run predict/transform and similar functions, which return
-        multiple columns. This function returns the input data rows along with the
-        generated columns' row data, unlike sklearn's functions, which return just the
-        output data.
-        """
-        st_time = time.time()
-
-        assert kwargs["name"], "function name should be passed."
-        func_name = kwargs["name"]
-
-        # Remove 'name' to pass other kwargs to script. TODO: Not passing it now.
-        kwargs.pop("name")
-
-        data, feature_columns, label_columns, _, partition_columns = \
-            self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
-
-        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
-                                                                                feature_columns,
-                                                                                label_columns,
-                                                                                partition_columns)
-
-        # Since kwargs are passed to transform, removing additional unrelated arguments from kwargs.
-        self._remove_data_related_args_from_kwargs(kwargs)
-
-        script_file_path = f"{file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{file_name}"
-
-        data_column_types_str, partition_indices_str, _, new_partition_columns = \
-            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
-
-        self._validate_unique_partition_values(data, new_partition_columns)
-
-        return_columns_python_types = None
-        if self._fit_label_columns_python_types:
-            return_columns_python_types = '--'.join(self._fit_label_columns_python_types)
-
-        # Returning feature columns also along with transformed columns because we don't know the
-        # mapping of feature columns to the transformed columns.
-        ## 'correct_covariance()' returns the (n_features, n_features) matrix.
-        if func_name == "correct_covariance":
-            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                            for col in new_partition_columns]
-        else:
-            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                            for col in (new_partition_columns + feature_columns)]
-        if func_name in ["predict", "decision_function"] and label_columns:
-            return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                             for col in label_columns]
-
-        output_cols_types = self._get_return_columns_for_function_(data,
-                                                                   feature_columns,
-                                                                   label_columns,
-                                                                   new_partition_columns,
-                                                                   func_name,
-                                                                   kwargs)
-        return_types += output_cols_types
-
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
-                         f"{self._model_file_name_prefix} {len(output_cols_types)} {self._is_lake_system} " \
-                         f"{return_columns_python_types}"
-
-        # Checking the trained model installation. If not installed,
-        # install it and set flag to True.
-        if not self._is_trained_model_installed:
-            self._install_initial_model_file()
-            self._is_trained_model_installed = True
-
-        opt = self._run_script(data, script_command, new_partition_columns, return_types)
-
-        self._transform_execution_time = time.time() - st_time
-
-        return self._get_returning_df(opt, new_partition_columns, return_types)
-
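For orientation, a minimal usage sketch of the transform path removed above. It assumes the public `td_sklearn` entry point of teradataml's opensource ML feature; the table name, column names, and the generated output column name are illustrative, not taken from the package.

```python
# Hypothetical sketch only: names below are illustrative.
from teradataml import DataFrame, td_sklearn as osml

df = DataFrame("scaling_input")                      # assumed existing table
scaler = osml.MinMaxScaler()                         # assumed sklearn.preprocessing wrapper
scaler.fit(data=df, feature_columns=["f1", "f2"])

# transform() is routed through _transform(); the result keeps the feature
# columns next to the generated columns because the mapping between the two
# is not known to the wrapper.
out = scaler.transform(data=df, feature_columns=["f1", "f2"])
print(out.columns)   # e.g. ['f1', 'f2', 'minmaxscaler_transform_1', ...]
```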
-    def fit_predict(self, X=None, y=None, **kwargs):
-        """
-        Please check the description in Docs/OpensourceML/sklearn.py.
-        """
-        st_time = time.time()
-
-        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
-
-        data, feature_columns, label_columns, _, partition_columns = \
-            self._validate_args_and_get_data(X=X, y=y, groups=None, kwargs=kwargs)
-
-        if partition_columns:
-            self._is_default_partition_value_fit = False
-
-        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
-                                                                                feature_columns,
-                                                                                label_columns,
-                                                                                partition_columns)
-
-        # Return label_columns also if the user provides them in the function call.
-        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in (new_partition_columns + feature_columns + label_columns)]
-
-        func_name = inspect.stack()[0][3]
-        if label_columns:
-            return_types += self._get_return_columns_for_function_(data,
-                                                                   feature_columns,
-                                                                   label_columns,
-                                                                   new_partition_columns,
-                                                                   func_name,
-                                                                   {})
-        else:
-            # If there are no label_columns, we will have only one
-            # predicted column.
-            return_types += [(f"{self.class_name.lower()}_{func_name}_1", FLOAT())]
-
-        file_name = "sklearn_fit_predict.py"
-
-        data_column_types_str, partition_indices_str, _, new_partition_columns = \
-            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
-
-        script_file_name = f"{file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{file_name}"
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
-                         f"{self._model_file_name_prefix} {self._is_lake_system}"
-
-        # Get unique values in partitioning columns.
-        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
-
-        # Checking the trained model installation. If not installed,
-        # install it and set flag to True.
-        if not self._is_trained_model_installed:
-            self._install_initial_model_file()
-            self._is_trained_model_installed = True
-
-        opt = self._run_script(data, script_command, new_partition_columns, return_types)
-
-        self._fit_predict_execution_time = time.time() - st_time
-
-        if self._is_default_partition_value_fit:
-            # For single model case, partition column is internally generated and no point in
-            # returning it to the user.
-
-            # Extract columns from return types.
-            returning_cols = [col[0] for col in return_types[len(new_partition_columns):]]
-            return opt.select(returning_cols)
-
-        return opt
-
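A usage sketch of `fit_predict()` as implemented above, assuming a clustering estimator exposed through `td_sklearn`; the table, column, and model names are illustrative.

```python
# Hypothetical sketch only: names below are illustrative.
from teradataml import DataFrame, td_sklearn as osml

df = DataFrame("customer_features")
km = osml.KMeans(n_clusters=3)                       # assumed sklearn.cluster wrapper

# Without label_columns, a single FLOAT column named
# "<class_name>_fit_predict_1" (here "kmeans_fit_predict_1") is appended
# to the feature columns.
clusters = km.fit_predict(data=df, feature_columns=["age", "income"])

# With partition_columns, one model is fit per partition and the partition
# column is kept in the result instead of being dropped.
per_region = km.fit_predict(data=df, feature_columns=["age", "income"],
                            partition_columns="region")
```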
-    def fit_transform(self, X=None, y=None, **kwargs):
-        """
-        Please check the description in Docs/OpensourceML/sklearn.py.
-        """
-        # 'y' is not needed for transform().
-        fit_obj = self.fit(X, y, **kwargs)
-        kwargs["label_columns"] = None
-        return fit_obj.transform(X, None, **kwargs)
-
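As the removed body shows, `fit_transform()` is just `fit()` followed by `transform()` with label columns suppressed. A hedged sketch, with illustrative names and the assumed `td_sklearn` entry point:

```python
# Hypothetical sketch only: names below are illustrative.
from teradataml import DataFrame, td_sklearn as osml

df = DataFrame("scaling_input")
scaler = osml.StandardScaler()

# Equivalent to scaler.fit(...) followed by scaler.transform(...) with
# label_columns forced to None, per the removed implementation above.
out = scaler.fit_transform(data=df, feature_columns=["f1", "f2"])
```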
-    @_validate_fit_run
-    def _run_neighbors(self, X=None, **kwargs):
-        """
-        Internal function to run functions like kneighbors, radius_neighbors,
-        kneighbors_graph and radius_neighbors_graph, which return multiple columns.
-        This function returns the input data rows along with the generated columns'
-        row data, unlike sklearn's functions, which return just the output data.
-        """
-        assert kwargs["name"], "function name should be passed."
-        func_name = kwargs["name"]
-        kwargs.pop("name")
-
-        if self.module_name != "sklearn.neighbors":
-            raise AttributeError(f"{self.module_name+'.'+self.class_name} does not have {func_name}() method.")
-
-        data = kwargs.get("data", None)
-        partition_columns = kwargs.get("partition_columns", None)
-
-        if not X and not partition_columns and not data:
-            # If data is not passed, then run from client only.
-            # TODO: decide whether to run from client or from Vantage.
-            opt = super().__getattr__(func_name)(**kwargs)
-            from scipy.sparse.csr import csr_matrix
-            if isinstance(opt, csr_matrix):
-                return opt.toarray()
-            return opt
-
-        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
-
-        data, feature_columns, _, _, new_partition_columns = \
-            self._validate_args_and_get_data(X=X, y=None, groups=None, kwargs=kwargs,
-                                             skip_either_or_that=True)
-
-        # Remove the kwargs data.
-        self._remove_data_related_args_from_kwargs(kwargs)
-
-        if partition_columns:
-            # The remaining kwargs are passed to the kneighbors function, so they are removed from kwargs.
-            self._is_default_partition_value_fit = False
-
-        # Generating new partition column name.
-        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
-                                                                                feature_columns,
-                                                                                [],
-                                                                                partition_columns)
-
-        args_str = self._get_kwargs_str(kwargs)
-
-        file_name = "sklearn_neighbors.py"
-
-        script_file_path = f"{file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{file_name}"
-
-        # Returning feature columns also along with new columns.
-        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in (new_partition_columns + feature_columns)]
-
-        # `return_distance` is needed as the result is a tuple of two arrays when it is True.
-        return_distance = kwargs.get("return_distance", True)  # Default value is True.
-
-        # Though new columns return numpy arrays, we are returning them as strings.
-        # TODO: Will update to columns later, if requested.
-        if func_name in ['kneighbors', 'radius_neighbors']:
-            if return_distance:
-                return_types += [("neigh_dist", VARCHAR())]
-            return_types += [("neigh_ind", VARCHAR())]
-        elif func_name in ['kneighbors_graph', 'radius_neighbors_graph']:
-            return_types += [("A", VARCHAR())]
-        else:
-            return_types += [("output", VARCHAR())]
-
-        data_column_types_str, partition_indices_str, _, new_partition_columns = \
-            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
-
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{partition_indices_str} {data_column_types_str} {self._model_file_name_prefix} {self._is_lake_system} "\
-                         f"{args_str}"
-
-        # Get unique values in partitioning columns.
-        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
-
-        # Checking the trained model installation. If not installed,
-        # install it and set flag to True.
-        if not self._is_trained_model_installed:
-            self._install_initial_model_file()
-            self._is_trained_model_installed = True
-
-        opt = self._run_script(data, script_command, new_partition_columns, return_types)
-
-        return self._get_returning_df(opt, new_partition_columns, return_types)
-
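A sketch of the neighbors path removed above, assuming `NearestNeighbors` is exposed through `td_sklearn`; table and column names are illustrative.

```python
# Hypothetical sketch only: names below are illustrative.
from teradataml import DataFrame, td_sklearn as osml

df = DataFrame("points")
nn = osml.NearestNeighbors(n_neighbors=3)            # assumed sklearn.neighbors wrapper
nn.fit(data=df, feature_columns=["x", "y"])

# With data supplied, the call runs in Vantage and the result keeps the
# feature columns plus "neigh_dist" and "neigh_ind", both returned as
# VARCHAR strings rather than numpy arrays.
res = nn.kneighbors(data=df, feature_columns=["x", "y"], return_distance=True)

# Without data/X/partition_columns, the call instead falls back to the
# client-side sklearn object (sparse results are densified via toarray()).
```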
-    def split(self, X=None, y=None, groups=None, **kwargs):
-        """
-        Please check the description in Docs/OpensourceML/sklearn.py.
-        """
-        opt = self._run_model_selection("split", X=X, y=y, groups=groups,
-                                        skip_either_or_that=True, kwargs=kwargs)
-
-        # Get number of splits in the result DataFrame.
-        n_splits = opt.drop_duplicate("split_id").shape[0]
-
-        data = kwargs.get("data", None)
-        feature_columns = kwargs.get("feature_columns", [])
-        label_columns = self._get_columns_as_list(kwargs.get("label_columns", []))
-
-        # If X and y are passed instead of "data", derive feature_columns and label_columns from them.
-        partition_columns = kwargs.get("partition_columns", [])
-        feature_columns = [col for col in X.columns if col not in partition_columns] \
-            if X and not data and not feature_columns else feature_columns
-        label_columns = y.columns if y and not data and not label_columns else label_columns
-
-        # Return iterator of the train and test dataframes for each split.
-        for i in range(1, n_splits+1):
-            train_df = opt[(opt.split_id == i) & (opt.data_type == "train")]\
-                .select(partition_columns + feature_columns + label_columns)
-            train_df._index_label = None
-            test_df = opt[(opt.split_id == i) & (opt.data_type == "test")]\
-                .select(partition_columns + feature_columns + label_columns)
-            test_df._index_label = None
-
-            yield train_df, test_df
-
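A sketch of `split()` with a cross-validator, assuming `KFold` is exposed through `td_sklearn`; names are illustrative.

```python
# Hypothetical sketch only: names below are illustrative.
from teradataml import DataFrame, td_sklearn as osml

df = DataFrame("training_data")
kf = osml.KFold(n_splits=5)                          # assumed sklearn.model_selection wrapper

# split() yields one (train_df, test_df) pair of teradataml DataFrames per
# split, selected from a result keyed by the "split_id" and "data_type" columns.
for train_df, test_df in kf.split(data=df,
                                  feature_columns=["f1", "f2"],
                                  label_columns="target"):
    print(train_df.shape, test_df.shape)
```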
-    def get_n_splits(self, X=None, y=None, groups=None, **kwargs):
-        """
-        Please check the description in Docs/OpensourceML/sklearn.py.
-        """
-        return self._run_model_selection("get_n_splits", X=X, y=y, groups=groups,
-                                         skip_either_or_that=True, kwargs=kwargs)
-
-    def _run_model_selection(self,
-                             func_name,
-                             X=None,
-                             y=None,
-                             groups=None,
-                             skip_either_or_that=False,
-                             kwargs={}):
-        """
-        Internal function to run functions like split and get_n_splits of the model selection module.
-        - get_n_splits() returns the number of splits as a value, not as a teradataml DataFrame.
-        - split() returns a teradataml DataFrame containing train and test data for each split
-          (with partition information added if the argument "partition_columns" is provided).
-        """
-        if self.module_name != "sklearn.model_selection":
-            raise AttributeError(f"{self.module_name+'.'+self.class_name} does not "
-                                 f"have {func_name}() method.")
-
-        data = kwargs.get("data", None)
-
-        if not X and not y and not groups and not data:
-            # If data is not passed, then run from client only.
-            # TODO: decide whether to run from client or from Vantage.
-            return super().__getattr__(func_name)()
-
-        self._is_default_partition_value_fit = True  # False when the user provides partition columns.
-
-        data, feature_columns, label_columns, group_columns, partition_columns = \
-            self._validate_args_and_get_data(X=X, y=y, groups=groups, kwargs=kwargs,
-                                             skip_either_or_that=skip_either_or_that)
-
-        if partition_columns:
-            self._is_default_partition_value_fit = False
-
-        data, new_partition_columns = self._get_data_and_data_partition_columns(data,
-                                                                                feature_columns,
-                                                                                label_columns,
-                                                                                partition_columns,
-                                                                                group_columns)
-
-        file_name = "sklearn_model_selection_split.py"
-
-        script_file_path = f"{file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{file_name}"
-
-        if func_name == "split":
-            # Need to generate data into splits of train and test.
-            # split_id - the column which will be used to identify the split.
-            # data_type - the column which will be used to identify whether the row is
-            #             a train or test row.
-            return_types = [("split_id", INTEGER()), ("data_type", VARCHAR())]
-            # Returning feature columns and label columns as well.
-            return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                             for col in (feature_columns + label_columns)]
-        else:
-            # Return Varchar by default.
-            # Varchar is returned even for functions like `get_n_splits`, which can return large
-            # integer numbers like `4998813702034726525205100` for the `LeavePOut` class (when the
-            # argument `p` is 28 and the number of data rows is 100), values that do not fit in
-            # Vantage's INTEGER type.
-            return_types = [(func_name, VARCHAR())]
-
-        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in new_partition_columns] + return_types
-
-        data_column_types_str, partition_indices_str, _, new_partition_columns = \
-            self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
-
-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-                         f"{len(label_columns)} {len(group_columns)} {partition_indices_str} {data_column_types_str} "\
-                         f"{self._model_file_name_prefix} {self._is_lake_system}"
-
-        # Get unique values in partitioning columns.
-        self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
-
-        # Checking the trained model installation. If not installed,
-        # install it and set flag to True.
-        if not self._is_trained_model_installed:
-            self._install_initial_model_file()
-            self._is_trained_model_installed = True
-
-        opt = self._run_script(data, script_command, new_partition_columns, return_types)
-
-        if func_name == "get_n_splits" and not partition_columns:
-            # Return number of splits as a value, not as a dataframe.
-            vals = execute_sql("select {} from {}".format(func_name, opt._table_name))
-            opt = vals.fetchall()[0][0]
-
-            # Varchar is returned by the script. Convert it to int.
-            return int(opt)
-
-        return opt
-
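A sketch of `get_n_splits()`, following the removed logic above; exposure of `LeavePOut` through `td_sklearn` and all table/column names are assumptions.

```python
# Hypothetical sketch only: names below are illustrative.
from teradataml import DataFrame, td_sklearn as osml

df = DataFrame("training_data")
lpo = osml.LeavePOut(p=2)                            # assumed sklearn.model_selection wrapper

# Without partition_columns, the script's VARCHAR result is read back with a
# SELECT and converted to a plain Python int before being returned.
n = lpo.get_n_splits(data=df, feature_columns=["f1", "f2"])
print(type(n), n)    # <class 'int'> ...

# With partition_columns, a DataFrame is returned instead, one row per partition.
per_group = lpo.get_n_splits(data=df, feature_columns=["f1", "f2"],
                             partition_columns="region")
```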
 
 class _FunctionWrapper(_GenericObjectWrapper):
     def __init__(self, module_name, func_name, file_type, template_file):
@@ -2151,10 +1313,3 @@ class _FunctionWrapper(_GenericObjectWrapper):
         self._remove_script_file(self._script_file_name)
 
         return self.modelObj
-
-
-class _SKLearnFunctionWrapper(_FunctionWrapper):
-    def __init__(self, module_name, func_name):
-        file_type = "file_fn_sklearn"
-        template_file = "sklearn_function.template"
-        super().__init__(module_name, func_name, file_type=file_type, template_file=template_file)
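The removed `_SKLearnFunctionWrapper` only pins the script type and template that `_FunctionWrapper` uses for sklearn module-level functions. A minimal sketch of what its constructor does; the module and function names below are illustrative, not taken from the package.

```python
# Hypothetical sketch only: module/function names are illustrative.
wrapper = _SKLearnFunctionWrapper("sklearn.metrics", "mean_absolute_error")
# ...is equivalent to:
# _FunctionWrapper("sklearn.metrics", "mean_absolute_error",
#                  file_type="file_fn_sklearn",
#                  template_file="sklearn_function.template")
```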