teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (263)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/LICENSE.pdf +0 -0
  3. teradataml/README.md +183 -0
  4. teradataml/__init__.py +6 -3
  5. teradataml/_version.py +2 -2
  6. teradataml/analytics/__init__.py +3 -2
  7. teradataml/analytics/analytic_function_executor.py +275 -40
  8. teradataml/analytics/analytic_query_generator.py +92 -0
  9. teradataml/analytics/byom/__init__.py +3 -2
  10. teradataml/analytics/json_parser/metadata.py +1 -0
  11. teradataml/analytics/json_parser/utils.py +17 -21
  12. teradataml/analytics/meta_class.py +40 -1
  13. teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
  14. teradataml/analytics/sqle/__init__.py +10 -2
  15. teradataml/analytics/table_operator/__init__.py +3 -2
  16. teradataml/analytics/uaf/__init__.py +21 -2
  17. teradataml/analytics/utils.py +62 -1
  18. teradataml/analytics/valib.py +1 -1
  19. teradataml/automl/__init__.py +1553 -319
  20. teradataml/automl/custom_json_utils.py +139 -61
  21. teradataml/automl/data_preparation.py +276 -319
  22. teradataml/automl/data_transformation.py +163 -81
  23. teradataml/automl/feature_engineering.py +402 -239
  24. teradataml/automl/feature_exploration.py +9 -2
  25. teradataml/automl/model_evaluation.py +48 -51
  26. teradataml/automl/model_training.py +291 -189
  27. teradataml/catalog/byom.py +8 -8
  28. teradataml/catalog/model_cataloging_utils.py +1 -1
  29. teradataml/clients/auth_client.py +133 -0
  30. teradataml/clients/pkce_client.py +1 -1
  31. teradataml/common/aed_utils.py +3 -2
  32. teradataml/common/constants.py +48 -6
  33. teradataml/common/deprecations.py +13 -7
  34. teradataml/common/garbagecollector.py +156 -120
  35. teradataml/common/messagecodes.py +6 -1
  36. teradataml/common/messages.py +3 -1
  37. teradataml/common/sqlbundle.py +1 -1
  38. teradataml/common/utils.py +103 -11
  39. teradataml/common/wrapper_utils.py +1 -1
  40. teradataml/context/context.py +121 -31
  41. teradataml/data/advertising.csv +201 -0
  42. teradataml/data/bank_marketing.csv +11163 -0
  43. teradataml/data/bike_sharing.csv +732 -0
  44. teradataml/data/boston2cols.csv +721 -0
  45. teradataml/data/breast_cancer.csv +570 -0
  46. teradataml/data/complaints_test_tokenized.csv +353 -0
  47. teradataml/data/complaints_tokens_model.csv +348 -0
  48. teradataml/data/covid_confirm_sd.csv +83 -0
  49. teradataml/data/customer_segmentation_test.csv +2628 -0
  50. teradataml/data/customer_segmentation_train.csv +8069 -0
  51. teradataml/data/dataframe_example.json +10 -0
  52. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
  53. teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
  54. teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
  55. teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
  56. teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
  57. teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
  58. teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
  59. teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
  60. teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
  61. teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
  62. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
  63. teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
  64. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
  65. teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
  66. teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
  67. teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
  68. teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
  69. teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
  70. teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
  71. teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
  72. teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
  73. teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
  74. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
  75. teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
  76. teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
  77. teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
  78. teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
  79. teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
  80. teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
  81. teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
  82. teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
  83. teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
  84. teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
  85. teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
  86. teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
  87. teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
  88. teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
  89. teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
  90. teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
  91. teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
  92. teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
  93. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
  94. teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
  95. teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
  96. teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
  97. teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
  98. teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
  99. teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
  100. teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
  101. teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
  102. teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
  103. teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
  104. teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
  105. teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
  106. teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
  107. teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
  108. teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
  109. teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
  110. teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
  111. teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
  112. teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
  113. teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
  114. teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
  115. teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
  116. teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
  117. teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
  118. teradataml/data/dwt2d_dataTable.csv +65 -0
  119. teradataml/data/dwt_dataTable.csv +8 -0
  120. teradataml/data/dwt_filterTable.csv +3 -0
  121. teradataml/data/finance_data4.csv +13 -0
  122. teradataml/data/glm_example.json +28 -1
  123. teradataml/data/grocery_transaction.csv +19 -0
  124. teradataml/data/housing_train_segment.csv +201 -0
  125. teradataml/data/idwt2d_dataTable.csv +5 -0
  126. teradataml/data/idwt_dataTable.csv +8 -0
  127. teradataml/data/idwt_filterTable.csv +3 -0
  128. teradataml/data/insect2Cols.csv +61 -0
  129. teradataml/data/interval_data.csv +5 -0
  130. teradataml/data/jsons/paired_functions.json +14 -0
  131. teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
  132. teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
  133. teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
  134. teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
  135. teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
  136. teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
  137. teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
  138. teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
  139. teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
  140. teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
  141. teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
  142. teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
  143. teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
  144. teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
  145. teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
  146. teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
  147. teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
  148. teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
  149. teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
  150. teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
  151. teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
  152. teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
  153. teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
  154. teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
  155. teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
  156. teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
  157. teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
  158. teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
  159. teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
  160. teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
  161. teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
  162. teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
  163. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
  164. teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
  165. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
  166. teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
  167. teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
  168. teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
  169. teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
  170. teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
  171. teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
  172. teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
  173. teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
  174. teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
  175. teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
  176. teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
  177. teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
  178. teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
  179. teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
  180. teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
  181. teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
  182. teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
  183. teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
  184. teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
  185. teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
  186. teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
  187. teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
  188. teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
  189. teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
  190. teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
  191. teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
  192. teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
  193. teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
  194. teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
  195. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
  196. teradataml/data/kmeans_example.json +5 -0
  197. teradataml/data/kmeans_table.csv +10 -0
  198. teradataml/data/load_example_data.py +8 -2
  199. teradataml/data/naivebayestextclassifier_example.json +1 -1
  200. teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
  201. teradataml/data/onehot_encoder_train.csv +4 -0
  202. teradataml/data/openml_example.json +29 -0
  203. teradataml/data/peppers.png +0 -0
  204. teradataml/data/real_values.csv +14 -0
  205. teradataml/data/sax_example.json +8 -0
  206. teradataml/data/scale_attributes.csv +3 -0
  207. teradataml/data/scale_example.json +52 -1
  208. teradataml/data/scale_input_part_sparse.csv +31 -0
  209. teradataml/data/scale_input_partitioned.csv +16 -0
  210. teradataml/data/scale_input_sparse.csv +11 -0
  211. teradataml/data/scale_parameters.csv +3 -0
  212. teradataml/data/scripts/deploy_script.py +21 -2
  213. teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
  214. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
  215. teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
  216. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
  217. teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
  218. teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
  219. teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
  220. teradataml/data/star_pivot.csv +8 -0
  221. teradataml/data/templates/open_source_ml.json +2 -1
  222. teradataml/data/teradataml_example.json +97 -1
  223. teradataml/data/timestamp_data.csv +4 -0
  224. teradataml/data/titanic_dataset_unpivoted.csv +19 -0
  225. teradataml/data/uaf_example.json +55 -1
  226. teradataml/data/unpivot_example.json +15 -0
  227. teradataml/data/url_data.csv +9 -0
  228. teradataml/data/windowdfft.csv +16 -0
  229. teradataml/data/ztest_example.json +16 -0
  230. teradataml/dataframe/copy_to.py +9 -4
  231. teradataml/dataframe/data_transfer.py +125 -64
  232. teradataml/dataframe/dataframe.py +575 -57
  233. teradataml/dataframe/dataframe_utils.py +47 -9
  234. teradataml/dataframe/fastload.py +273 -90
  235. teradataml/dataframe/functions.py +339 -0
  236. teradataml/dataframe/row.py +160 -0
  237. teradataml/dataframe/setop.py +2 -2
  238. teradataml/dataframe/sql.py +740 -18
  239. teradataml/dataframe/window.py +1 -1
  240. teradataml/dbutils/dbutils.py +324 -18
  241. teradataml/geospatial/geodataframe.py +1 -1
  242. teradataml/geospatial/geodataframecolumn.py +1 -1
  243. teradataml/hyperparameter_tuner/optimizer.py +13 -13
  244. teradataml/lib/aed_0_1.dll +0 -0
  245. teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
  246. teradataml/options/__init__.py +16 -5
  247. teradataml/options/configure.py +39 -6
  248. teradataml/options/display.py +2 -2
  249. teradataml/plot/axis.py +4 -4
  250. teradataml/scriptmgmt/UserEnv.py +26 -19
  251. teradataml/scriptmgmt/lls_utils.py +120 -16
  252. teradataml/table_operators/Script.py +4 -5
  253. teradataml/table_operators/TableOperator.py +160 -26
  254. teradataml/table_operators/table_operator_util.py +88 -41
  255. teradataml/table_operators/templates/dataframe_udf.template +63 -0
  256. teradataml/telemetry_utils/__init__.py +0 -0
  257. teradataml/telemetry_utils/queryband.py +52 -0
  258. teradataml/utils/validators.py +41 -3
  259. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
  260. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
  261. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
  262. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
  263. {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
teradataml/opensource/sklearn/_sklearn_wrapper.py
@@ -49,7 +49,7 @@ from teradataml.opensource.sklearn.constants import OpenSourcePackage, _OSML_MOD
  from teradataml.common.messagecodes import MessageCodes
  from teradataml.common.messages import Messages
  from teradataml.catalog.byom import save_byom, retrieve_byom, delete_byom
- from teradataml.dbutils.dbutils import _create_table
+ from teradataml.dbutils.dbutils import _create_table, set_session_param
  from teradataml.utils.validators import _Validators
  from teradataml.dataframe.dataframe import DataFrame
  from teradataml.dataframe.dataframe_utils import DataFrameUtils
@@ -64,6 +64,10 @@ validator = _Validators()
 
  installed_model_files = defaultdict(int)
 
+ ## Flag to ensure the sklearn script
+ ## installation occurs only once.
+ _file_installed = False
+
  class _GenericObjectWrapper:
  def __init__(self) -> None:
  self._db_name = _get_current_databasename()
@@ -76,7 +80,7 @@ class _GenericObjectWrapper:
  self.modelObj = None
  self._model_data = None
 
- self._tdml_tmp_dir = os.path.join(os.path.expanduser("~"), ".teradataml")
+ self._tdml_tmp_dir = GarbageCollector._get_temp_dir_name()
 
  self._env = None
 
@@ -86,43 +90,24 @@ class _GenericObjectWrapper:
  if configure.openml_user_env is not None:
  self._env = configure.openml_user_env
  else:
- self._create_or_get_env()
+ self._env = UtilFuncs._create_or_get_env("open_source_ml.json")
  else:
- execute_sql(f"SET SESSION SEARCHUIFDBPATH = {self._db_name};")
+ set_session_param("searchuifdbpath",self._db_name)
 
- def _create_or_get_env(self):
- """
- Internal function to return the env if already exists else
- creates the environment using template file and return the env.
- """
- # Get the template file path.
- template_dir_path = os.path.join(_TDML_DIRECTORY, "data", "templates",
- "open_source_ml.json")
+ global _file_installed
+ ## Flag to check whether trained model is installed or not.
+ self._is_trained_model_installed = False
 
- # Read template file.
- with open(template_dir_path, "r") as r_file:
- data = json.load(r_file)
+ ## Install all sklearn script files on Vantage.
+ if not _file_installed:
+ sklearn_script_files = ["sklearn_fit.py", "sklearn_score.py",
+ "sklearn_transform.py", "sklearn_fit_predict.py",
+ "sklearn_neighbors.py", "sklearn_model_selection_split.py"]
+ for script_file in sklearn_script_files:
+ self._install_script_file(file_identifier=script_file.split(".")[0],
+ file_name=script_file)
 
- # Get env_name.
- _env_name = data["env_specs"][0]["env_name"]
-
- try:
- # Call function to 'openml_env' get env.
- self._env = get_env(_env_name)
- except TeradataMlException as tdml_e:
- # We will get here when error says, env does not exist otherwise raise the exception as is.
- # Env does not exist so create one.
-
- exc_msg = "Failed to execute get_env(). User environment '{}' not " \
- "found.".format(_env_name)
- if exc_msg in tdml_e.args[0]:
- print(f"No OpenAF environment with name '{_env_name}' found. Creating one with "\
- "latest supported python and required packages.")
- _env = create_env(template=template_dir_path)
- else:
- raise tdml_e
- except Exception as exc:
- raise exc
+ _file_installed = True
 
  def _get_columns_as_list(self, cols):
  """
@@ -205,34 +190,65 @@ class _GenericObjectWrapper:
  is_binary=is_binary)
  else:
  status = self._env.install_file(file_path=new_script,
- replace=True,
- suppress_output=True)
+ replace=True,
+ suppress_output=True)
  if not status:
  raise TeradataMlException(
  f"Script file '{file_name}' failed to get installed/replaced in Vantage."
  )
 
- def _get_partition_col_indices_and_types(self, data, partition_columns):
+ def _remove_script_file(self, file_name):
  """
- partition_columns can be from feature columns and label columns.
- So, get the indices and types of these columns from the data columns.
+ Internal function to remove script file in Vantage.
  """
- partition_indices = []
- partition_types = []
+ # _env is set while object creation
+ # If not set, it is Vantage Enterprise. Otherwise, it is Vantage Lake.
+
+ if not self._is_lake_system:
+ status = remove_file(file_identifier=file_name.split(".")[0],
+ force_remove=True,
+ suppress_output=True)
+ else:
+ status = self._env.remove_file(file_name=file_name,
+ suppress_output=True)
+ if not status:
+ raise TeradataMlException(
+ f"Script file '{file_name}' failed to remove in Vantage."
+ )
+ def _get_data_col_types_and_partition_col_indices_and_types(self, data, partition_columns,
+ idx_delim=",",
+ types_delim="--"):
+ """
+ Internal function to get the data column types and partition column names, indices and types.
+ Function returns delimiter separated string of types and indices if idx_delim and
+ types_delim are provided. Otherwise, it returns list of types and indices. Partition names
+ are returned as list always.
+ """
+ data_column_types = "" if types_delim else []
+ partition_indices = "" if idx_delim else []
+ partition_types = "" if types_delim else []
  new_partition_columns = []
+ j = 0
  for i, col in enumerate(data.columns):
+ _type = data._td_column_names_and_sqlalchemy_types[col.lower()].python_type.__name__
+ if types_delim:
+ data_column_types += (_type if i == 0 else f"{types_delim}{_type}")
+ else:
+ data_column_types.append(_type)
  if col in partition_columns:
  new_partition_columns.append(col)
- partition_indices.append(i)
- partition_types.append(data._td_column_names_and_sqlalchemy_types[col.lower()].\
- python_type.__name__)
- # Converting to string "None" if they are not present as empty string can't be passed
- # to Script script_commands' command line arguments.
- # Otherwise, pass the values as comma separated string.
- partition_indices = ",".join([str(x) for x in partition_indices])\
- if partition_indices else "None"
- partition_types = ",".join([x for x in partition_types]) if partition_types else "None"
- return partition_indices, partition_types, new_partition_columns
+ if idx_delim:
+ partition_indices += (str(i) if j == 0 else f"{idx_delim}{str(i)}")
+ else:
+ partition_indices.append(i)
+ if types_delim:
+ partition_types += (_type if j == 0 else f"{types_delim}{_type}")
+ else:
+ partition_types.append(_type)
+ j += 1
+ # Return types of all columns (as list or str), partition column indices (as list or str)
+ # and partition column types (as list or str).
+ return data_column_types, partition_indices, partition_types, new_partition_columns
 
  def _get_kwargs_str(self, kwargs):
  """
@@ -357,6 +373,23 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
  Internal function to get attributes of all sklearn model objects when multiple models are
  generated by fit.
  """
+
+ def __generate_model_object(model_obj_value):
+ """
+ Internal function to generate _SkLearnWrapperObject model object from model_obj_value.
+ """
+ # Create _SkLearnObjectWrapper object from opensource model object.
+ model_obj = self.__class__(model=first_atrribute_instance)
+ model_obj.modelObj = model_obj_value
+ model_obj._is_model_installed = True
+
+ # Setting other model attributes.
+ model_obj._is_default_partition_value_fit = self._is_default_partition_value_fit
+ model_obj._is_default_partition_value_predict = self._is_default_partition_value_predict
+ model_obj._fit_partition_colums_non_default = self._fit_partition_colums_non_default
+ model_obj._fit_partition_unique_values = self._fit_partition_unique_values
+ return model_obj
+
  # Wrapper function to invoke dynamic method, using arguments
  # passed by user, on model in each row.
  def __sklearn_method_invoker_for_multimodel(*c, **kwargs):
@@ -364,36 +397,58 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
  for i in range(multi_models.shape[0]):
  curr_model = multi_models.iloc[i]["model"]
  multi_models.at[i, "model"] = getattr(curr_model, name)(*c, **kwargs)
+
+ first_function_instance = multi_models.at[0, "model"]
+ if self.__class__._validate_model_supportability(first_function_instance):
+ return __generate_model_object(multi_models)
+
  return multi_models.rename(columns={"model": name})
 
- # Identify if attribute is callable or not to avoid
- # this check in loop for every model.
- is_attr_callable = False
  # Assuming that self.modelObj will have at least 1 row.
- is_attr_callable = callable(getattr(self.modelObj.iloc[0]["model"], name))
 
- # If attribute is callable, it should be applied on model in each row
+ # Get attribute instance from first model object.
+ first_atrribute_instance = getattr(self.modelObj.iloc[0]["model"], name)
+
+ # If first_atrribute_instance is callable, it should be applied on model in each row
  # using passed arguments.
- if is_attr_callable:
+ if callable(first_atrribute_instance):
  return __sklearn_method_invoker_for_multimodel
 
  output_attributes = self.modelObj.copy()
  for i in range(output_attributes.shape[0]):
  model = output_attributes.iloc[i]["model"]
  output_attributes.at[i, "model"] = getattr(model, name)
+
+ if self.__class__._validate_model_supportability(first_atrribute_instance):
+ return __generate_model_object(output_attributes)
+
  return output_attributes.rename(columns={"model": name})
 
  def __getattr__(self, name):
  # This just run attributes (functions and properties) from sklearn object.
  def __sklearn_method_invoker(*c, **kwargs):
- return atrribute_instance(*c, **kwargs)
+ # sklearn model is returned from the function call. Create _SkLearnObjectWrapper object.
+ model_obj = attribute_instance(*c, **kwargs)
+ if self.__class__._validate_model_supportability(model_obj):
+ model_obj = self.__class__(model=model_obj)
+ model_obj._is_model_installed = True # Trained model is returned by function call.
+ return model_obj
+
  if isinstance(self.modelObj, pd.DataFrame):
  return self.__get_obj_attributes_multi_model(name)
 
- atrribute_instance = getattr(self.modelObj, name)
- if callable(atrribute_instance):
+ attribute_instance = getattr(self.modelObj, name)
+
+ if callable(attribute_instance):
  return __sklearn_method_invoker
- return atrribute_instance
+
+ if self.__class__._validate_model_supportability(attribute_instance):
+ # sklearn model is returned from the attribute. Create _SkLearnObjectWrapper object.
+ model_obj = self.__class__(model=attribute_instance)
+ model_obj._is_model_installed = True # Trained model is returned as attribute.
+ return model_obj
+
+ return attribute_instance
 
  @classmethod
  def _validate_model_supportability(cls, model):
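
The __getattr__ rework above follows a standard delegation pattern: unknown attributes fall through to the wrapped scikit-learn object, callables are returned through an invoker, and results that are themselves supported models get re-wrapped so chained calls stay inside the wrapper class. A simplified, generic sketch of that idea (illustrative, not teradataml code):

    # Generic sketch of delegating attribute access to a wrapped estimator.
    class ModelWrapper:
        def __init__(self, model):
            self._model = model

        def __getattr__(self, name):
            attr = getattr(self._model, name)      # delegate lookup to the wrapped object
            if callable(attr):
                def invoker(*args, **kwargs):
                    result = attr(*args, **kwargs)
                    # Re-wrap results that look like estimators (e.g. fit() returning self).
                    return ModelWrapper(result) if hasattr(result, "get_params") else result
                return invoker
            return attr
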
@@ -404,15 +459,25 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
  error_msg = Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED, "validate",
  "The given model is not a supported opensource model.")
  msg_code = MessageCodes.MODEL_CATALOGING_OPERATION_FAILED
+ package_name = None
+ class_name = None
  try:
  # For scikit-learn, model.__module__ is similar to 'sklearn.linear_model._base'.
  # TODO: check for other supported packages.
- if model.__module__.split(".")[0] not in OpenSourcePackage.values():
- raise TeradataMlException(error_msg, msg_code)
+ if hasattr(model, "__module__"):
+ package_name = model.__module__.split(".")[0]
+ if package_name not in OpenSourcePackage.values():
+ return False
+ if hasattr(model, "__class__"):
+ class_name = model.__class__.__name__
  except Exception as ex:
  # If in case, model.__module__ fails.
  raise TeradataMlException(error_msg, msg_code) from ex
 
+ # True only if package name is opensource package name and class name is not internal class.
+ return True if package_name and class_name and \
+ package_name == cls.OPENSOURCE_PACKAGE_NAME.value and not class_name.startswith("_") else False
+
  def _save_model(self, model_name, replace_if_exists=False):
  """
  Internal function to save the model stored in file at location mentioned by class variable
@@ -423,7 +488,8 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
  conn = get_connection()
  osml_models_table_exists = conn.dialect.has_table(conn,
  table_name=_OSML_MODELS_TABLE_NAME,
- schema=self._db_name)
+ schema=self._db_name,
+ table_only=True)
  if not osml_models_table_exists:
  all_columns = _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT.copy()
  all_columns.update(_OSML_ADDITIONAL_COLUMN_TYPES)
@@ -471,7 +537,11 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
  Internal function to create an instance of the class using the model and deploy
  the model to Vantage.
  """
- cls._validate_model_supportability(model=model)
+ is_model_supportable = cls._validate_model_supportability(model=model)
+ if not is_model_supportable:
+ raise TeradataMlException(Messages.get_message(MessageCodes.MODEL_CATALOGING_OPERATION_FAILED,
+ "deploy", "The given model is not a supported opensource model."),
+ MessageCodes.MODEL_CATALOGING_OPERATION_FAILED)
 
  cls = cls(model=model)
  # Load the model file into Vantage node as file can be used in
@@ -817,7 +887,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  for col in new_partition_columns] + [("model", model_type)]
 
  file_name = "sklearn_fit.py"
- self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
  if classes:
  class_type = type(classes[0]).__name__
@@ -825,15 +894,15 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  else:
  classes = str(None)
  class_type = str(None)
-
- partition_indices, partition_types, new_partition_columns = \
- self._get_partition_col_indices_and_types(data, new_partition_columns)
+
+ data_column_types_str, partition_indices_str, _, new_partition_columns = \
+ self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
 
  # db_name is applicable for enterprise system.
  db_file_name = file_name if self._is_lake_system else f"./{self._db_name}/{file_name}"
  py_exc = UtilFuncs._get_python_execution_path()
  script_command = f"{py_exc} {db_file_name} {func} {len(feature_columns)} "\
- f"{len(label_columns)} {partition_indices} {partition_types} "\
+ f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
  f"{self._model_file_name_prefix} {classes} {class_type} {self._is_lake_system}"
 
  # Get unique values in partitioning columns.
@@ -852,6 +921,13 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  self._fit_label_columns_types = [data._td_column_names_and_sqlalchemy_types[l_c.lower()]
  for l_c in label_columns]
 
+ # If the model is trained a second time after the object creation,
+ # or if set_params() is called after the first model training,
+ # this flag will reset to False. So that for subsequent predict/score
+ # operations, the newly trained model will be installed.
+ if self._is_trained_model_installed:
+ self._is_trained_model_installed = False
+
  def partial_fit(self, X=None, y=None, classes=None, **kwargs):
  """
  Please check the description in Docs/OpensourceML/sklearn.py.
@@ -972,7 +1048,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  feature_columns,
  label_columns,
  func_name,
- n_partitions,
  kwargs):
  """
  Internal function to return list of column names and their sqlalchemy types
@@ -1010,7 +1085,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
 
  # For paritioning columns, it will be a dataframe and getattr(modelObj, func_name) fails.
  # Just for getting the number of columns and their types, using only one model of all.
- if n_partitions == 1:
+ if len(self._fit_partition_unique_values) == 1:
  # Single model case.
  skl_obj = self.modelObj
  else:
@@ -1038,11 +1113,10 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  "path() returns tuple of ndarrays of different shapes. Not Implemented yet."
  )
 
- # This import is as per scipy version 1.10.x in local machine as teradataml does not
- # impose restrictions on this package in setup.py. TODO
- from scipy.sparse import csr_matrix
-
- if isinstance(trans_opt, csr_matrix):
+ if isinstance(trans_opt, numpy.ndarray) and trans_opt.shape == (X.shape[0],):
+ trans_opt = trans_opt.reshape(X.shape[0], 1)
+
+ if type(trans_opt).__name__ in ["csr_matrix", "csc_matrix"]:
  no_of_columns = trans_opt.get_shape()[1]
  trans_opt = trans_opt.toarray()
  elif isinstance(trans_opt, dict):
@@ -1054,6 +1128,14 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  else:
  no_of_columns = 1
 
+ # Special handling when inverse_transform of no_of_columns returns no of rows
+ # less than the no of classes. Such columns are filled with NaN values.
+ # Updating number of columns here (new columns with NaN values will be added).
+ if func_name == "inverse_transform" and self.class_name == "MultiLabelBinarizer":
+ no_of_columns = len(self.classes_)
+ for i in range(len(ten_row_data)):
+ trans_opt[i] += tuple([numpy.nan] * (no_of_columns - len(trans_opt[i])))
+
  # Special handling required for cross_decomposition classes's transform function, which
  # takes label columns also. In this case, output is a tuple of numpy arrays - x_scores and
  # y_scores. If label columns are not provided, only x_scores are returned.
@@ -1084,6 +1166,30 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  # Get new column sqlalchemy types for pandas df columns of transform output.
  opt_pd = pd.DataFrame(trans_opt)
 
+ # Get output column types for each column in pandas df from the output of transform
+ # type functions.
+ types = {}
+ for idx, col in enumerate(list(opt_pd.columns)):
+ # Get type of column using data from all rows, in case if the column has None values.
+ # 'and' of types of all values in the column with type(None) gives the type of the column.
+ type_ = type(None)
+ for i in range(len(trans_opt)):
+ type_ = type_ and type(trans_opt[i][idx])
+
+ # If all the values of the output (trans_opt) is None, thelen use `str` as type since
+ # pandas astype() does not accept None type.
+ if type_ is type(None):
+ type_ = str
+
+ # numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
+ # It raises error like "Cannot convert non-finite values (NA or inf) to integer:
+ # Error while type casting for column '2'"
+ # Hence, using pd.Int64Dtype() for integer columns with nan values.
+ types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
+
+ # Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
+ opt_pd = opt_pd.astype(types)
+
  # If the datatype is not specified then check if the datatype is datetime64 and timezone is present then map it to
  # TIMESTAMP(timezone=True) else map it according to default value.
  col_types = [TIMESTAMP(timezone=True)
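
The type-mapping loop added above works around a pandas limitation quoted in its comments: a column holding integers plus NaN cannot be cast to plain int64, so the nullable pd.Int64Dtype() is used instead. A small reproducible snippet of that behavior:

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, 2.0, np.nan])
    # s.astype("int64") raises: "Cannot convert non-finite values (NA or inf) to integer"
    s = s.astype(pd.Int64Dtype())   # nullable integer dtype keeps ints alongside missing values
    print(s.dtype)                  # Int64
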
@@ -1118,26 +1224,29 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  partition_columns)
 
  file_name = "sklearn_score.py"
- self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
  script_file_path = f"{file_name}" if self._is_lake_system \
  else f"./{self._db_name}/{file_name}"
 
- partition_indices, partition_types, new_partition_columns = \
- self._get_partition_col_indices_and_types(data, new_partition_columns)
+ data_column_types_str, partition_indices_str, _, new_partition_columns = \
+ self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
 
  self._validate_unique_partition_values(data, new_partition_columns)
 
  py_exc = UtilFuncs._get_python_execution_path()
  script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
- f"{len(label_columns)} {partition_indices} {partition_types} "\
+ f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
  f"{self._model_file_name_prefix} {self._is_lake_system}"
 
  # score, aic, bic returns float values.
  return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
  for col in new_partition_columns] + [(func_name, FLOAT())]
 
- self._install_initial_model_file()
+ # Checking the trained model installation. If not installed,
+ # install it and set flag to True.
+ if not self._is_trained_model_installed:
+ self._install_initial_model_file()
+ self._is_trained_model_installed = True
 
  opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1186,19 +1295,18 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  kwargs.pop("label_columns")
 
  file_name = "sklearn_transform.py"
- self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
  script_file_path = f"{file_name}" if self._is_lake_system \
  else f"./{self._db_name}/{file_name}"
 
- partition_indices, partition_types, new_partition_columns = \
- self._get_partition_col_indices_and_types(data, new_partition_columns)
+ data_column_types_str, partition_indices_str, _, new_partition_columns = \
+ self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
 
  self._validate_unique_partition_values(data, new_partition_columns)
 
  py_exc = UtilFuncs._get_python_execution_path()
  script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
- f"{len(label_columns)} {partition_indices} {partition_types} "\
+ f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
  f"{self._model_file_name_prefix} {self._is_lake_system}"
 
  # Returning feature columns also along with transformed columns because we don't know the
@@ -1208,15 +1316,18 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  if func_name in ["predict", "decision_function"] and label_columns:
  return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
  for col in label_columns]
+
  return_types += self._get_return_columns_for_function_(data,
  feature_columns,
  label_columns,
  func_name,
- len(new_partition_columns),
  kwargs)
 
- # Installing model files before running sklearn_transform.py.
- self._install_initial_model_file()
+ # Checking the trained model installation. If not installed,
+ # install it and set flag to True.
+ if not self._is_trained_model_installed:
+ self._install_initial_model_file()
+ self._is_trained_model_installed = True
 
  opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1253,7 +1364,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  feature_columns,
  label_columns,
  func_name,
- len(new_partition_columns),
  {})
  else:
  # If there are no label_columns, we will have only one
@@ -1261,22 +1371,25 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  return_types += [(f"{self.class_name.lower()}_{func_name}_1", FLOAT())]
 
  file_name = "sklearn_fit_predict.py"
- self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
- partition_indices, partition_types, new_partition_columns = \
- self._get_partition_col_indices_and_types(data, new_partition_columns)
+ data_column_types_str, partition_indices_str, _, new_partition_columns = \
+ self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
 
  script_file_name = f"{file_name}" if self._is_lake_system \
  else f"./{self._db_name}/{file_name}"
  py_exc = UtilFuncs._get_python_execution_path()
  script_command = f"{py_exc} {script_file_name} {len(feature_columns)} "\
- f"{len(label_columns)} {partition_indices} {partition_types} "\
+ f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
  f"{self._model_file_name_prefix} {self._is_lake_system}"
 
  # Get unique values in partitioning columns.
  self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
 
- self._install_initial_model_file()
+ # Checking the trained model installation. If not installed,
+ # install it and flag to True.
+ if not self._is_trained_model_installed:
+ self._install_initial_model_file()
+ self._is_trained_model_installed = True
 
  opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1354,7 +1467,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  args_str = self._get_kwargs_str(kwargs)
 
  file_name = "sklearn_neighbors.py"
- self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
  script_file_path = f"{file_name}" if self._is_lake_system \
  else f"./{self._db_name}/{file_name}"
@@ -1377,18 +1489,22 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  else:
  return_types += [("output", VARCHAR())]
 
- partition_indices, partition_types, new_partition_columns = \
- self._get_partition_col_indices_and_types(data, new_partition_columns)
+ data_column_types_str, partition_indices_str, _, new_partition_columns = \
+ self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
 
  py_exc = UtilFuncs._get_python_execution_path()
  script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
- f"{partition_indices} {partition_types} {self._model_file_name_prefix} {self._is_lake_system} "\
+ f"{partition_indices_str} {data_column_types_str} {self._model_file_name_prefix} {self._is_lake_system} "\
  f"{args_str}"
 
  # Get unique values in partitioning columns.
  self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
 
- self._install_initial_model_file()
+ # Checking the trained model installation. If not installed,
+ # install it and set flag to True.
+ if not self._is_trained_model_installed:
+ self._install_initial_model_file()
+ self._is_trained_model_installed = True
 
  opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1472,7 +1588,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  group_columns)
 
  file_name = "sklearn_model_selection_split.py"
- self._install_script_file(file_identifier=file_name.split(".")[0], file_name=file_name)
 
  script_file_path = f"{file_name}" if self._is_lake_system \
  else f"./{self._db_name}/{file_name}"
@@ -1496,18 +1611,22 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
  return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
  for col in new_partition_columns] + return_types
 
- partition_indices, partition_types, new_partition_columns = \
- self._get_partition_col_indices_and_types(data, new_partition_columns)
+ data_column_types_str, partition_indices_str, _, new_partition_columns = \
+ self._get_data_col_types_and_partition_col_indices_and_types(data, new_partition_columns)
 
  py_exc = UtilFuncs._get_python_execution_path()
  script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
- f"{len(label_columns)} {len(group_columns)} {partition_indices} {partition_types} "\
+ f"{len(label_columns)} {len(group_columns)} {partition_indices_str} {data_column_types_str} "\
  f"{self._model_file_name_prefix} {self._is_lake_system}"
 
  # Get unique values in partitioning columns.
  self._fit_partition_unique_values = data.drop_duplicate(new_partition_columns).get_values()
 
- self._install_initial_model_file()
+ # Checking the trained model installation. If not installed,
+ # install it and set flag to True.
+ if not self._is_trained_model_installed:
+ self._install_initial_model_file()
+ self._is_trained_model_installed = True
 
  opt = self._run_script(data, script_command, new_partition_columns, return_types)
 
@@ -1586,19 +1705,25 @@ class _SKLearnFunctionWrapper(_GenericObjectWrapper):
 
  self.__params = kwargs
 
- # Get indices and types of partition_columns.
- idxs, types, partition_cols = self._get_partition_col_indices_and_types(self.__tdml_df,
- partition_cols)
+ # Get indices of partition_columns and types of all columns.
+ data_column_types_str, partition_indices_str, _, partition_cols = \
+ self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df, partition_cols)
 
  script_file_path = f"{self._model_file_name}" if self._is_lake_system \
  else f"./{self._db_name}/{self._model_file_name}"
+
+ model_file_prefix = None
+ if self._is_lake_system:
+ model_file_prefix = self._model_file_name.replace(".py", "")
+
  py_exc = UtilFuncs._get_python_execution_path()
- script_command = (f"{py_exc} {script_file_path} {idxs}"
- f" ") + \
- f"{types} {data_args_str}"
+ script_command = (f"{py_exc} {script_file_path} {partition_indices_str} "\
+ f"{data_column_types_str} {data_args_str} {self._is_lake_system}"\
+ f" {model_file_prefix}")
 
- return_types = [(col, self.__tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
- for col in partition_cols] + [(self.__func_name, CLOB())]
+ model_type = BLOB() if self._is_lake_system else CLOB()
+ return_types = [(col, self.__tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
+ for col in partition_cols] + [(self.__func_name, model_type)]
 
  # Generate new file in .teradataml directory and install it to Vantage.
  self._prepare_and_install_file()
@@ -1613,23 +1738,30 @@ class _SKLearnFunctionWrapper(_GenericObjectWrapper):
 
  # File cleanup after processing.
  os.remove(self._model_file_local)
- remove_file(file_identifier=self._model_file_name.split(".")[0], suppress_output=True,
- force_remove=True)
+ self._remove_script_file(self._model_file_name)
 
  return self.modelObj
 
  def _prepare_data_args_string(self, kwargs):
+ """
+ Get column indices and types of each data related arguments in the format:
+ "{<arg_name>-<comma separated indices>-<comma separated types>}--
+ {<arg_name>-<comma separated indices>-<comma separated types>}"
+ """
  data_args_str = []
  for arg_name in list(self.__data_args.keys()):
  # Remove DataFrame arguments from kwargs, which will be passed to Script.
  kwargs.pop(arg_name)
 
  # Get column indices and their types for each dataframe from parent dataframe.
- _indices, _types, _ = self._get_partition_col_indices_and_types(self.__tdml_df,
- self.__data_args[arg_name].columns)
-
- # Format "<arg_name>-<comma separated indices>-<comma separated types>"
- data_args_str.append(f"{arg_name}-{_indices}-{_types}")
+ _, partition_indices_str, partition_types_str, _ = \
+ self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df,
+ self.__data_args[arg_name].columns,
+ idx_delim=",",
+ types_delim=",")
+
+ # Format "<arg_name>-<comma separated indices>-<comma separated types>"
+ data_args_str.append(f"{arg_name}-{partition_indices_str}-{partition_types_str}")
 
  # Format "{<arg_name>-<comma separated indices>-<comma separated types>}--
  # {<arg_name>-<comma separated indices>-<comma separated types>}"
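
The docstring added above describes the wire format _prepare_data_args_string builds for data-bearing arguments. A short, self-contained reconstruction of that format with invented argument names (not the teradataml implementation):

    # Builds "<arg>-<indices>-<types>" blocks joined by "--", per the docstring above.
    data_args = {
        "X": {"indices": [0, 1, 2], "types": ["int", "float", "float"]},
        "y": {"indices": [3], "types": ["int"]},
    }

    parts = []
    for arg_name, meta in data_args.items():
        indices = ",".join(str(i) for i in meta["indices"])
        types = ",".join(meta["types"])
        parts.append(f"{arg_name}-{indices}-{types}")

    data_args_str = "--".join(parts)
    print(data_args_str)   # X-0,1,2-int,float,float--y-3-int
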
@@ -1650,7 +1782,7 @@ class _SKLearnFunctionWrapper(_GenericObjectWrapper):
 
  def _prepare_and_install_file(self):
  """
- Prepare function script file from template file and install it in Vaantage.
+ Prepare function script file from template file and install it in Vantage.
  """
  with open(os.path.join(self._scripts_path, "sklearn_function.template")) as fp:
  script_data = fp.read()