teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.3__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their public registries, and is provided for informational purposes only.

Potentially problematic release.

Files changed (88)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +196 -2
  3. teradataml/__init__.py +4 -0
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +79 -4
  6. teradataml/analytics/json_parser/metadata.py +12 -3
  7. teradataml/analytics/json_parser/utils.py +7 -2
  8. teradataml/analytics/sqle/__init__.py +1 -0
  9. teradataml/analytics/table_operator/__init__.py +1 -1
  10. teradataml/analytics/uaf/__init__.py +1 -1
  11. teradataml/analytics/utils.py +4 -0
  12. teradataml/automl/data_preparation.py +3 -2
  13. teradataml/automl/feature_engineering.py +15 -7
  14. teradataml/automl/model_training.py +39 -33
  15. teradataml/common/__init__.py +2 -1
  16. teradataml/common/constants.py +35 -0
  17. teradataml/common/garbagecollector.py +2 -1
  18. teradataml/common/messagecodes.py +8 -2
  19. teradataml/common/messages.py +3 -1
  20. teradataml/common/sqlbundle.py +25 -3
  21. teradataml/common/utils.py +134 -9
  22. teradataml/context/context.py +20 -10
  23. teradataml/data/SQL_Fundamentals.pdf +0 -0
  24. teradataml/data/dataframe_example.json +18 -2
  25. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
  26. teradataml/data/docs/sqle/docs_17_20/Shap.py +7 -1
  27. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
  28. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  29. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  30. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  31. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  32. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  33. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  34. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  35. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  36. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  37. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  38. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  39. teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
  40. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
  41. teradataml/data/medical_readings.csv +101 -0
  42. teradataml/data/patient_profile.csv +101 -0
  43. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  44. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  45. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  46. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  47. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
  48. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  49. teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
  50. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  51. teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
  52. teradataml/data/target_udt_data.csv +8 -0
  53. teradataml/data/templates/open_source_ml.json +3 -2
  54. teradataml/data/vectordistance_example.json +4 -0
  55. teradataml/dataframe/dataframe.py +543 -175
  56. teradataml/dataframe/functions.py +553 -25
  57. teradataml/dataframe/sql.py +184 -15
  58. teradataml/dbutils/dbutils.py +556 -18
  59. teradataml/dbutils/filemgr.py +48 -1
  60. teradataml/lib/aed_0_1.dll +0 -0
  61. teradataml/opensource/__init__.py +1 -1
  62. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  63. teradataml/opensource/_lightgbm.py +950 -0
  64. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  65. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  66. teradataml/opensource/sklearn/__init__.py +0 -1
  67. teradataml/opensource/sklearn/_sklearn_wrapper.py +798 -438
  68. teradataml/options/__init__.py +7 -23
  69. teradataml/options/configure.py +29 -3
  70. teradataml/scriptmgmt/UserEnv.py +3 -3
  71. teradataml/scriptmgmt/lls_utils.py +74 -21
  72. teradataml/store/__init__.py +13 -0
  73. teradataml/store/feature_store/__init__.py +0 -0
  74. teradataml/store/feature_store/constants.py +291 -0
  75. teradataml/store/feature_store/feature_store.py +2223 -0
  76. teradataml/store/feature_store/models.py +1505 -0
  77. teradataml/store/vector_store/__init__.py +1586 -0
  78. teradataml/table_operators/query_generator.py +3 -0
  79. teradataml/table_operators/table_operator_query_generator.py +3 -1
  80. teradataml/table_operators/table_operator_util.py +37 -38
  81. teradataml/table_operators/templates/dataframe_register.template +69 -0
  82. teradataml/utils/dtypes.py +4 -2
  83. teradataml/utils/validators.py +33 -1
  84. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +200 -5
  85. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +88 -65
  86. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  87. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  88. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
@@ -19,7 +19,6 @@ from collections import OrderedDict, defaultdict
 from importlib import import_module

 import base64
-import functools
 import json
 import numpy
 import os
@@ -28,7 +27,7 @@ import time
 import inspect
 import warnings
 import json
-import random
+import math
 import pandas as pd
 from teradatasqlalchemy import BLOB, CLOB, FLOAT, TIMESTAMP, VARCHAR, INTEGER
 import pandas.api.types as pt
@@ -41,9 +40,9 @@ from teradataml.context.context import _get_current_databasename, get_connection
 from teradataml.dbutils.filemgr import install_file, remove_file
 from teradataml.utils.utils import execute_sql
 from teradataml.options.configure import configure
-from teradataml.opensource.sklearn._wrapper_utils import _validate_fit_run, _generate_new_name,\
+from teradataml.opensource._wrapper_utils import _validate_fit_run, _generate_new_name,\
     _validate_opensource_func_args, _derive_df_and_required_columns, _validate_df_query_type
-from teradataml.opensource.sklearn.constants import OpenSourcePackage, _OSML_MODELS_PRIMARY_INDEX,\
+from teradataml.opensource.constants import OpenSourcePackage, _OSML_MODELS_PRIMARY_INDEX,\
     _OSML_MODELS_TABLE_NAME, _OSML_MODELS_TABLE_COLUMNS_TYPE_DICT, OpensourceModels,\
     _OSML_ADDITIONAL_COLUMN_TYPES
 from teradataml.common.messagecodes import MessageCodes
@@ -53,7 +52,6 @@ from teradataml.dbutils.dbutils import _create_table, set_session_param
 from teradataml.utils.validators import _Validators
 from teradataml.dataframe.dataframe import DataFrame
 from teradataml.dataframe.dataframe_utils import DataFrameUtils
-from teradataml.scriptmgmt.lls_utils import create_env, get_env
 from teradataml.common.garbagecollector import GarbageCollector
 from teradataml.common.constants import TeradataConstants

@@ -70,6 +68,9 @@ _file_installed = False

 class _GenericObjectWrapper:
     def __init__(self) -> None:
+        if not get_connection():
+            raise TeradataMlException(Messages.get_message(MessageCodes.INVALID_CONTEXT_CONNECTION),
+                                      MessageCodes.INVALID_CONTEXT_CONNECTION)
         self._db_name = _get_current_databasename()

         self._scripts_path = os.path.join(_TDML_DIRECTORY, "data", "scripts", "sklearn")
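
With the guard added above, every opensourceML wrapper now fails fast instead of erroring later in Vantage when no connection exists. A minimal usage sketch (host and credentials are placeholders, not taken from this diff):

    # Hypothetical connection values. create_context() must succeed before any
    # td_sklearn object is created; otherwise the new check raises
    # TeradataMlException with INVALID_CONTEXT_CONNECTION.
    from teradataml import create_context, td_sklearn

    create_context(host="<host>", username="<user>", password="<password>")
    model = td_sklearn.LinearRegression()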
@@ -215,6 +216,7 @@ class _GenericObjectWrapper:
             raise TeradataMlException(
                 f"Script file '{file_name}' failed to remove in Vantage."
             )
+
    def _get_data_col_types_and_partition_col_indices_and_types(self, data, partition_columns,
                                                                idx_delim=",",
                                                                types_delim="--"):
@@ -264,7 +266,7 @@ class _GenericObjectWrapper:
             args_str += f" {strr}"
         return args_str

-    def extract_sklearn_obj(self, n_unique_partitions = 1, n_partition_cols = 1):
+    def _extract_model_objs(self, n_unique_partitions=1, n_partition_cols=1):
        """
        Internal function to extract sklearn object from the model(s) depending on the number of
        partitions. When it is only one model, it is directly used as sklearn object (modelObj).
@@ -297,13 +299,256 @@ class _GenericObjectWrapper:

         warnings.filterwarnings("default")

+    def _validate_existence_of_partition_columns(self, partition_columns, all_columns, arg_names_for_dfs):
+        """
+        Validate if columns in "partition_columns" argument are present in any of the given
+        dataframes.
+        """
+        invalid_part_cols = [c for c in partition_columns if c not in all_columns]
+
+        if invalid_part_cols:
+            raise ValueError(Messages.get_message(MessageCodes.INVALID_PARTITIONING_COLS,
+                                                  ", ".join(invalid_part_cols),
+                                                  "', '".join(arg_names_for_dfs))
+                             )
+
+    def _prepare_data_args_string(self, kwargs):
+        """
+        Get column indices and types of each data related argument in the format:
+        "{<arg_name>-<comma separated indices>-<comma separated types>}--
+         {<arg_name>-<comma separated indices>-<comma separated types>}"
+        """
+        data_args_str = []
+        for arg_name in list(self._data_args.keys()):
+            # Remove DataFrame arguments from kwargs, which will be passed to Script.
+            kwargs.pop(arg_name)
+
+            # Get column indices and their types for each dataframe from parent dataframe.
+            _, partition_indices_str, partition_types_str, _ = \
+                self._get_data_col_types_and_partition_col_indices_and_types(self._tdml_df,
+                                                                             self._data_args[arg_name].columns,
+                                                                             idx_delim=",",
+                                                                             types_delim=",")
+
+            # Format "<arg_name>-<comma separated indices>-<comma separated types>"
+            data_args_str.append(f"{arg_name}-{partition_indices_str}-{partition_types_str}")
+
+        # Format "{<arg_name>-<comma separated indices>-<comma separated types>}--
+        #         {<arg_name>-<comma separated indices>-<comma separated types>}"
+        return "--".join(data_args_str)
+
+    def _prepare_and_install_file(self, replace_dict):
+        """
+        Prepare function script file from template file and install it in Vantage.
+        Takes a dictionary with keys as strings to be replaced in the script and values as
+        strings which should be added in place of the keys.
+        """
+
+        with open(os.path.join(self._scripts_path, self._template_file)) as fp:
+            script_data = fp.read()
+
+        for old, new in replace_dict.items():
+            script_data = script_data.replace(old, new)
+
+        self._script_file_local = os.path.join(self._tdml_tmp_dir, self._script_file_name)
+
+        with open(self._script_file_local, "w") as fp:
+            fp.write(script_data)
+
+        self._install_script_file(file_identifier=self._script_file_name.split(".")[0],
+                                  file_name=self._script_file_name,
+                                  file_location=self._tdml_tmp_dir)
+
+    def _get_dataframe_related_args_and_their_columns(self, kwargs):
+        """
+        Get dataframe related arguments and return all their column names from kwargs.
+        """
+        __data_columns = []
+        __data_args_dict = OrderedDict()
+
+        # Separate dataframe related arguments and their column names from actual kwargs.
+        for k, v in kwargs.items():
+            if isinstance(v, DataFrame):
+                # All dataframes should be select of parent dataframe.
+                _validate_df_query_type(v, "select", k)
+
+                # Save all columns in dataframe related arguments.
+                __data_columns.extend(v.columns)
+
+                __data_args_dict[k] = v
+
+        return __data_args_dict, __data_columns
+
+    def _process_data_for_funcs_returning_objects(self, kwargs):
+        """
+        Internal function to process all arguments, assign self._data_args and self._tdml_df,
+        and return
+        1. dictionary of elements (needed to replace in the script template file)
+        2. partition columns list.
+        """
+        partition_cols = self._get_columns_as_list(kwargs.get("partition_columns", None))
+        if partition_cols:
+            kwargs.pop("partition_columns")
+
+        self._data_args, __data_columns = self._get_dataframe_related_args_and_their_columns(kwargs)
+
+        arg_names_for_dfs = list(self._data_args.keys())
+
+        # Get common parent dataframe from all dataframes.
+        self._tdml_df = DataFrameUtils()._get_common_parent_df_from_dataframes(list(self._data_args.values()))
+
+        self._tdml_df = self._tdml_df.select(__data_columns + partition_cols)
+
+        self._validate_existence_of_partition_columns(partition_cols, self._tdml_df.columns, arg_names_for_dfs)
+
+        self._tdml_df, partition_cols = self._get_data_and_data_partition_columns(self._tdml_df,
+                                                                                  __data_columns,
+                                                                                  [],
+                                                                                  partition_cols
+                                                                                  )
+
+        # Prepare string of data arguments with name, indices where the columns of that argument
+        # reside and types of each of the columns.
+        data_args_str = self._prepare_data_args_string(kwargs)
+
+        # Get indices of partition_columns and types of all columns.
+        data_column_types_str, partition_indices_str, _, partition_cols = \
+            self._get_data_col_types_and_partition_col_indices_and_types(self._tdml_df,
+                                                                         partition_cols,
+                                                                         types_delim=None,
+                                                                         idx_delim=None)
+
+        replace_dict = {"<partition_cols_indices>": str(partition_indices_str),
+                        "<types_of_data_cols>": str(data_column_types_str),
+                        "<data_args_info_str>": f"'{data_args_str}'"}
+
+        return replace_dict, partition_cols
+
+    def _validate_equality_of_partition_values(self, fit_values, trans_values):
+        """
+        Internal function to check that the partition values in fit() and predict() are the same.
+        """
+        if len(fit_values) != len(trans_values):
+            return False
+
+        for val in fit_values:
+            if not all([val in trans_values]):
+                return False
+
+        return True
+
+    def _get_non_data_related_args_from_kwargs(self, kwargs):
+        """
+        Get all non-data related arguments from kwargs.
+        """
+        non_data_related_args = {}
+        for k, v in kwargs.items():
+            if not isinstance(v, DataFrame):
+                non_data_related_args[k] = v
+        non_data_related_args.pop("partition_columns", None)
+        return non_data_related_args
+
+    def _read_from_template_and_write_dict_to_file(self, template_file, replace_dict,
+                                                   output_script_file_name=None):
+        """
+        Read the template file, replace the keys with values and write to a new file.
+        """
+        with open(os.path.join(self._scripts_path, template_file)) as fp:
+            script_data = fp.read()
+
+        for old, new in replace_dict.items():
+            script_data = script_data.replace(old, new)
+
+        if output_script_file_name is None:
+            output_script_file_name = self._script_file_name
+        file_path = os.path.join(self._tdml_tmp_dir, output_script_file_name)
+        with open(file_path, "w") as fp:
+            fp.write(script_data)
+
+    def _generate_script_file_from_template_file(self, kwargs, template_file, func_name,
+                                                 output_script_file_name=None):
+        """
+        Internal function to generate a script file from a template file. It just adds the
+        non-data related arguments to the template file and writes the contents to a new file,
+        so that these arguments are available in the script file for running the function
+        "func_name".
+        """
+        # Take out all non-data related arguments to write to template file.
+        non_data_related_args = self._get_non_data_related_args_from_kwargs(kwargs)
+
+        # Read template file and write the contents to new file with non-data related arguments.
+        template_f = os.path.join(self._scripts_path, template_file)
+        with open(template_f, "r") as f:
+            template = f.read()
+
+        if output_script_file_name is None:
+            output_script_file_name = self._script_file_name
+        file_path = os.path.join(self._tdml_tmp_dir, output_script_file_name)
+        with open(file_path, "w") as f:
+            f.write("import json\n")
+            f.write(f"params = json.loads('{json.dumps(non_data_related_args)}')\n")
+            f.write(template)
+
+        kwargs["file_name"] = output_script_file_name
+        kwargs["name"] = func_name
+
+    def _remove_data_related_args_from_kwargs(self, kwargs):
+        """
+        Internal function to remove data related arguments from kwargs.
+        """
+        kwargs.pop("data", None)
+        kwargs.pop("feature_columns", None)
+        kwargs.pop("group_columns", None)
+        kwargs.pop("partition_columns", None)
+        kwargs.pop("label_columns", None)
+
+    def _convert_pos_args_to_kwargs_for_function(self, pos_args, kwargs, func_name):
+        """
+        Internal function to convert positional arguments to keyword arguments.
+        """
+        fn = getattr(getattr(import_module(self.module_name), self.class_name), func_name)
+        kwargs.update(zip(fn.__code__.co_varnames[1:], pos_args))
+
+    def _install_model_and_script_files(self, file_name, file_location):
+        """
+        Internal function to install model and script files to Vantage.
+        """
+        self._install_initial_model_file()
+        self._install_script_file(file_identifier=file_name.split(".")[0],
+                                  file_name=file_name,
+                                  is_binary=False,
+                                  file_location=file_location)
+
+    def _assign_fit_variables_after_execution(self, data, partition_columns, label_columns):
+        """
+        Internal function to assign fit related variables.
+        """
+        # Extract sklearn object(s) from the model(s) depending on the number of unique
+        # partitioning values.
+        self._extract_model_objs(n_unique_partitions=len(self._fit_partition_unique_values),
+                                 n_partition_cols=len(partition_columns))
+
+        # Need these label column types in prediction.
+        self._fit_label_columns_types = []
+        self._fit_label_columns_python_types = []
+
+        for l_c in label_columns:
+            column_data = data._td_column_names_and_sqlalchemy_types[l_c.lower()]
+            self._fit_label_columns_types.append(column_data)
+            self._fit_label_columns_python_types.append(column_data.python_type.__name__)
+
+        # If the model is trained a second time after the object creation,
+        # or if set_params() is called after the first model training,
+        # this flag will reset to False. So that for subsequent predict/score
+        # operations, the newly trained model will be installed.
+        if self._is_trained_model_installed:
+            self._is_trained_model_installed = False

 class _OpenSourceObjectWrapper(_GenericObjectWrapper):
     # This has to be set for every package which subclasses this class.
     OPENSOURCE_PACKAGE_NAME = None

     def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
-        if not model and not module_name and not class_name:
+        if model is None and not module_name and not class_name:
             raise TeradataMlException(Messages.get_message(MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT, "model",
                                                            "module_name and class_name"),
                                       MessageCodes.EITHER_THIS_OR_THAT_ARGUMENT)
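
To make the string format assembled by _prepare_data_args_string() concrete, here is a hypothetical illustration (the argument names, column indices and type names are invented for this sketch, not taken from the diff):

    # Suppose self._data_args holds two DataFrame arguments: "data" spanning
    # parent columns 0 and 1 (both FLOAT) and "object" spanning column 2
    # (INTEGER). The helper would then return:
    data_args_str = "data-0,1-FLOAT,FLOAT--object-2-INTEGER"
    # i.e. "<arg_name>-<indices>-<types>" entries joined by "--".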
@@ -319,24 +564,224 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         self.pos_args = pos_args if pos_args is not None else tuple()

         self._fit_label_columns_types = None
+        self._fit_label_columns_python_types = None
         self._table_name_prefix = None

         self._is_default_partition_value_fit = True  # False when the user provides partition columns.
         self._fit_partition_colums_non_default = None
         self._is_default_partition_value_predict = True  # False when the user provides partition columns.

-    def _validate_equality_of_partition_values(self, fit_values, trans_values):
+    def __repr__(self):
+        if self._is_default_partition_value_fit:
+            # Single model use case.
+            return self.modelObj.__repr__()
+
+        pd.set_option("display.expand_frame_repr", None)
+        pd.set_option("display.max_colwidth", None)
+        opt = self.modelObj.__repr__()
+        pd.reset_option("display.expand_frame_repr")
+        pd.reset_option("display.max_colwidth")
+        return opt
+
+    def _initialize_object(self):
         """
-        Internal function to compare the partition values in fit() and predict() are same.
+        Internal function to initialize sklearn object from module name and class name.
         """
-        if len(fit_values) != len(trans_values):
-            return False
+        # Needed when writing imported modules to generated file. TODO: Remove later.
+        imported_args = {}
+        # If there are any objects of class `_SkLearnObjectWrapper`, it is modified to
+        # corresponding sklearn object.
+        _partition_column_names = None
+        if "partition_columns" in self.kwargs:
+            self._fit_partition_colums_non_default = self.kwargs["partition_columns"]
+            self._is_default_partition_value_fit = False
+            _partition_column_names = self._fit_partition_colums_non_default

-        for val in fit_values:
-            if not all([val in trans_values]):
-                return False

-        return True
+        new_sklearn_pos_args = self.modify_args(None, self.pos_args, imported_args)
+        new_sklearn_kwargs = self.modify_args(None, self.kwargs, imported_args)
+
+        # Create model object from new positional and keyword arguments.
+        class_obj = getattr(import_module(self.module_name), self.class_name)
+        if new_sklearn_pos_args:
+            self.modelObj = class_obj(*new_sklearn_pos_args, **new_sklearn_kwargs)
+        else:
+            self.modelObj = class_obj(**new_sklearn_kwargs)
+
+        # All arguments are moved to kwargs and pos_args is kept empty.
+        # Might help in set_params() bug fix.
+        self.pos_args = tuple()
+        _arguments = self.modelObj.__dict__
+
+        if hasattr(self.modelObj, "get_params"):
+            # Update kwargs that are both in modelObj and get_params() as there are
+            # some classes which return other internal variables also.
+            # Hence, filtering them using get_params().
+            for k, v in _arguments.items():
+                if type(v).__name__ in ["function", "generator"]:
+                    # TODO: ELE-6351: Skipping adding functions and generators to kwargs as these
+                    # are not supported yet due to pickling issue.
+                    continue
+                if self.get_params():
+                    if k in self.get_params():
+                        self.kwargs[k] = v
+                else:
+                    _model_init_arguments = None
+                    try:
+                        _model_init_arguments = self.modelObj.__init__.__code__.co_varnames
+                    except AttributeError:
+                        pass
+                    if _model_init_arguments:
+                        self.kwargs = dict((k, v) for k, v in _arguments.items() if k in _model_init_arguments)
+                    else:
+                        self.kwargs = _arguments
+        else:
+            # Model selection classes will not have `get_params`, in which case modelObj's __dict__
+            # is saved as kwargs.
+            self.kwargs = _arguments
+
+        if _partition_column_names:
+            self.kwargs["partition_columns"] = _partition_column_names
+
+    def _initialize_variables(self, table_name_prefix):
+        """
+        Internal function to initialize variables used in this class.
+        """
+        self.feature_names_in_ = None
+        self._table_name_prefix = table_name_prefix
+        self._model_file_name_prefix = _generate_new_name(type="file")
+        self.model_file_paths_local = set()
+
+        self._fit_execution_time = None
+        self._fit_predict_execution_time = None
+        self._partial_fit_execution_time = None
+        self._predict_execution_time = None
+        self._transform_execution_time = None
+        self._score_execution_time = None
+
+        # Set to partition columns when training is done with partition columns.
+        self._fit_partition_colums_non_default = None
+
+        self._is_model_installed = False
+        self._fit_partition_unique_values = [[self._default_data_partition_value]]
+
+    def _get_returning_df(self, script_df, partition_column, returns):
+        """
+        Internal function to return the teradataml DataFrame without the
+        partition_column.
+        """
+        if self._is_default_partition_value_fit:
+            # For single model case, partition column is internally generated
+            # and no point in returning it to the user.
+
+            # Extract columns from return types.
+            returning_cols = [col[0] for col in returns[len(partition_column):]]
+            return script_df.select(returning_cols)
+        return script_df
+
+    def modify_args(self, fp1, arg, imported_args):
+        """
+        Internal function to recursively (if "arg" is list/tuple/dict) check if any sklearn object
+        of opensourceML is present in the argument "arg" and modify it to the corresponding
+        sklearn object.
+        This function can also be used to write import statements to file (if "fp1" is not
+        None). Update "imported_args" dictionary with imported module and class name to avoid
+        importing the same module and class again when writing to file. This is useful when we
+        want to generate a script from a template file.
+        Pass None to "fp1" if we don't want to write to file and just modify the opensourceML
+        sklearn object to the corresponding sklearn object.
+        """
+        if isinstance(arg, type(self)):
+            imported_tuple = (arg.module_name, arg.class_name)
+            already_imported = imported_args.get(imported_tuple, False)
+            if not already_imported:
+                imported_args[imported_tuple] = True
+                if fp1:
+                    fp1.write(f"from {arg.module_name} import {arg.class_name}\n")
+            self.modify_args(fp1, arg.pos_args, imported_args)
+            self.modify_args(fp1, arg.kwargs, imported_args)
+            return arg.modelObj
+        elif isinstance(arg, list):
+            return [self.modify_args(fp1, val, imported_args) for val in arg]
+        elif isinstance(arg, tuple):
+            return tuple([self.modify_args(fp1, val, imported_args) for val in arg])
+        elif type(arg).__name__ == "generator":
+            # Raising exception as generator object can't be pickled.
+            # TODO: ELE-6351 - Find ways to pickle generator object later.
+            raise ValueError("Generator type/iterator is not supported for any argument. "\
+                             "Support will be added later.")
+        elif type(arg).__name__ == "function":
+            # Raising exception as functions/lambda functions can't be pickled.
+            # TODO: ELE-6351 - Find ways to pickle functions later.
+            raise ValueError("Functions are not supported for any argument. "\
+                             "Support will be added later.")
+        elif isinstance(arg, dict):
+            return dict(
+                (
+                    self.modify_args(fp1, k, imported_args),
+                    self.modify_args(fp1, v, imported_args),
+                )
+                for k, v in arg.items() if k != "partition_columns"
+            )
+        # elif arg == "partition_columns":
+
+        else:
+            return arg
+
+    def _install_initial_model_file(self, use_dummy_initial_file=False):
+        """
+        If model file(s) is/are not installed in Vantage, then install it/them.
+        """
+        if isinstance(self.modelObj, pd.DataFrame):
+            # Get list of unique partition values and corresponding model object as dict.
+            partition_values_model_dict = {}
+            obj_list = self.modelObj.values.tolist()
+            for lst in obj_list:
+                partition_values_model_dict[tuple(lst[:len(self._fit_partition_colums_non_default)])] = \
+                    lst[len(self._fit_partition_colums_non_default)]
+
+        for partition in self._fit_partition_unique_values:
+            # Create a new file with file name with partition values and
+            # dump sklearn object into it. Finally install the file to Vantage.
+            partition_join = "_".join([str(x) for x in partition])
+            file_name = f"{self._model_file_name_prefix}_{partition_join}"
+            # Replace '-' with '_' as '-' can't be present in file identifier.
+            # Needed this replace because partition_columns can be negative.
+            file_name = file_name.replace("-", "_")
+            full_file_name = os.path.join(self._tdml_tmp_dir, file_name)
+            with open(full_file_name, "wb+") as fp:
+                # Write sklearn object to file.
+                if isinstance(self.modelObj, pd.DataFrame):
+                    # If multiple models, then write the model corresponding to the partition value.
+                    fp.write(pickle.dumps(partition_values_model_dict[tuple(partition)]))
+                else:
+                    if use_dummy_initial_file:
+                        fp.write(pickle.dumps("abc"))
+                    else:
+                        fp.write(pickle.dumps(self.modelObj))
+            self.model_file_paths_local.add(file_name)
+
+            self._install_script_file(file_identifier=file_name,
+                                      file_name=file_name,
+                                      is_binary=True,
+                                      file_location=self._tdml_tmp_dir)
+
+            if self._is_lake_system:
+                # Need to pass env_name along with file_name for cleaning up the files in env.
+                obj = f"{self._env.env_name}::{file_name}"
+                if installed_model_files[obj] == 0:
+                    # Add to GC for the first time the model file (along with env name) is encountered.
+                    installed_model_files[obj] = 1
+                    GarbageCollector._add_to_garbagecollector(object_name=obj,
+                                                              object_type=TeradataConstants.TERADATA_APPLY)
+            else:
+                if installed_model_files[file_name] == 0:
+                    # Add to GC for the first time the model file is encountered.
+                    installed_model_files[file_name] = 1
+                    GarbageCollector._add_to_garbagecollector(object_name=file_name,
+                                                              object_type=TeradataConstants.TERADATA_SCRIPT)
+
+        self._is_model_installed = True

     def _validate_unique_partition_values(self, data, partition_columns):
         """
@@ -361,25 +806,61 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):

         if not self._validate_equality_of_partition_values(fit_unique_values, trans_unique_values):
             raise TeradataMlException(
-                Messages.get_message(MessageCodes.PARTITION_VALUES_NOT_MATCHING),
+                Messages.get_message(MessageCodes.PARTITION_VALUES_NOT_MATCHING, "training", "test"),
                 MessageCodes.PARTITION_VALUES_NOT_MATCHING
             )

     def fit(self, **kwargs):
         pass

+    def _convert_arguments_to_modelObj(self, args, idx_multi_model=None):
+        """
+        Internal function to convert all OpensourceML related objects in arguments to the
+        underlying model objects.
+        """
+        if isinstance(args, dict):
+            new_args = args.copy()  # To avoid updating
+            for k, v in new_args.items():
+                if isinstance(v, type(self)):
+                    if idx_multi_model is not None:
+                        # single model. This argument is set only when modelObj is single model.
+                        new_args[k] = v.modelObj
+                    else:
+                        # multi-model. Get appropriate model from modelObj.
+                        new_args[k] = v.modelObj.iloc[idx_multi_model]["model"]
+                else:
+                    new_args[k] = v
+            return new_args
+
+        # If args is tuple, convert all elements to underlying model object.
+        elif isinstance(args, tuple):
+            new_args = tuple()
+            for arg in args:
+                if isinstance(arg, type(self)):
+                    if idx_multi_model is None:
+                        # single model. This argument is set only when modelObj is single model.
+                        new_args += (arg.modelObj,)
+                    else:
+                        # multi-model. Get appropriate model from modelObj.
+                        new_args += (arg.modelObj.iloc[idx_multi_model]["model"],)
+                else:
+                    new_args += (arg,)
+            return new_args
+        return args
+
     def __get_obj_attributes_multi_model(self, name):
         """
         Internal function to get attributes of all sklearn model objects when multiple models are
         generated by fit.
         """

-        def __generate_model_object(model_obj_value):
+        def __generate_model_object(model_obj_value, init_model_obj):
             """
             Internal function to generate _SkLearnWrapperObject model object from model_obj_value.
             """
             # Create _SkLearnObjectWrapper object from opensource model object.
-            model_obj = self.__class__(model=first_atrribute_instance)
+            model_obj = self.__class__(model=init_model_obj)
+
             model_obj.modelObj = model_obj_value
             model_obj._is_model_installed = True

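The PARTITION_VALUES_NOT_MATCHING message now names the two sides being compared ("training" and "test"). A hedged sketch of the partitioned multi-model flow this guards, with invented table and column names (exact argument shapes may differ from this sketch):

    # One model is trained per distinct value of the partition column, so
    # predict() must see the same partition values that fit() saw.
    from teradataml import DataFrame, td_sklearn

    train = DataFrame("sales_train")    # hypothetical table
    model = td_sklearn.LinearRegression()
    model.fit(X=train.select(["f1", "f2"]), y=train.select(["y1"]),
              partition_columns="region")
    model.predict(X=DataFrame("sales_test").select(["f1", "f2"]),
                  partition_columns="region")   # must contain the same regions
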
@@ -396,13 +877,34 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         multi_models = self.modelObj.copy()
         for i in range(multi_models.shape[0]):
             curr_model = multi_models.iloc[i]["model"]
-            multi_models.at[i, "model"] = getattr(curr_model, name)(*c, **kwargs)
+            partition_values = multi_models.iloc[i][0:len(self._fit_partition_colums_non_default)].to_list()
+            partition_values = "_".join([str(x) for x in partition_values])
+            if self.module_name == "lightgbm.basic" and self.class_name == "Booster" and name == "save_model":
+                # filename is first argument.
+                kwargs1 = kwargs.copy()
+                c1 = c
+
+                if len(c) > 0:
+                    c1 = list(c1)
+                    c1[0] = f"{c1[0]}_{partition_values}"
+                    c1 = tuple(c1)
+                if len(kwargs) > 0 and kwargs.get("filename", None):
+                    kwargs1["filename"] = f"{kwargs1['filename']}_{partition_values}"
+
+                multi_models.at[i, "model"] = getattr(curr_model, name)(*self._convert_arguments_to_modelObj(c1, i),
+                                                                        **self._convert_arguments_to_modelObj(kwargs1, i))
+            else:
+                multi_models.at[i, "model"] = getattr(curr_model, name)(*self._convert_arguments_to_modelObj(c, i),
+                                                                        **self._convert_arguments_to_modelObj(kwargs, i))

-        first_function_instance = multi_models.at[0, "model"]
-        if self.__class__._validate_model_supportability(first_function_instance):
-            return __generate_model_object(multi_models)
+        first_function_value = multi_models.at[0, "model"]
+        if self.__class__._validate_model_supportability(first_function_value):
+            return __generate_model_object(multi_models, init_model_obj=first_function_value)

-        return multi_models.rename(columns={"model": name})
+        multi_models = multi_models.rename(columns={"model": name})
+
+        # Select only partition columns and the attribute column.
+        return multi_models[self._fit_partition_colums_non_default + [name]]

         # Assuming that self.modelObj will have at least 1 row.

@@ -420,15 +922,15 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
             output_attributes.at[i, "model"] = getattr(model, name)

         if self.__class__._validate_model_supportability(first_atrribute_instance):
-            return __generate_model_object(output_attributes)
+            return __generate_model_object(output_attributes, init_model_obj=first_atrribute_instance)

         return output_attributes.rename(columns={"model": name})

     def __getattr__(self, name):
-        # This just runs attributes (functions and properties) from sklearn object.
+        # This just runs attributes (functions and properties) from opensource (sklearn/lightgbm) objects.
         def __sklearn_method_invoker(*c, **kwargs):
-            # sklearn model is returned from the function call. Create _SkLearnObjectWrapper object.
-            model_obj = attribute_instance(*c, **kwargs)
+            # An opensource model is returned from the function call. Create an _OpenSourceObjectWrapper object.
+            model_obj = attribute_instance(*self._convert_arguments_to_modelObj(c), **self._convert_arguments_to_modelObj(kwargs))
             if self.__class__._validate_model_supportability(model_obj):
                 model_obj = self.__class__(model=model_obj)
                 model_obj._is_model_installed = True  # Trained model is returned by function call.
@@ -636,234 +1138,63 @@ class _OpenSourceObjectWrapper(_GenericObjectWrapper):
         EXAMPLES:
             >>> from teradataml import td_sklearn
             >>> model = td_sklearn.LinearRegression(normalize=True)
-            >>> model
-            LinearRegression(normalize=True)
-
-            # Example 1: Deploy the model held by interface object to Vantage.
-            >>> lin_reg = model.deploy("linreg_model_ver_2")
-            Model is saved.
-            >>> lin_reg
-            LinearRegression(normalize=True)
-
-            # Example 2: Deploy the model held by interface object to Vantage with the name same
-            # as that of model that already existed in Vantage.
-            >>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
-            Model is deleted.
-            Model is saved.
-            >>> lin_reg
-            LinearRegression(normalize=True)
-        """
-
-        # Install model file into Vantage, if not installed.
-        self._install_initial_model_file()
-
-        self._save_model(model_name, replace_if_exists)
-        return self
-
-
-class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
-
-    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
-
-    def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
-        super().__init__(model=model, module_name=module_name, class_name=class_name,
-                         pos_args=pos_args, kwargs=kwargs)
-
-        self._initialize_variables()
-        if model:
-            self.modelObj = model
-            self.module_name = model.__module__.split("._")[0]
-            self.class_name = model.__class__.__name__
-            # __dict__ gets all the arguments as dictionary including default ones and positional
-            # args.
-            self.kwargs = model.__dict__
-            self.pos_args = tuple()  # Kept empty as all are moved to kwargs.
-        else:
-            self._initialize_object()
-
-    def __repr__(self):
-        if self._is_default_partition_value_fit:
-            # Single model use case.
-            return self.modelObj.__repr__()
-
-        pd.set_option("display.expand_frame_repr", None)
-        pd.set_option("display.max_colwidth", None)
-        opt = self.modelObj.__repr__()
-        pd.reset_option("display.expand_frame_repr")
-        pd.reset_option("display.max_colwidth")
-        return opt
-
-    def _validate_args_and_get_data(self, X=None, y=None, groups=None, kwargs={},
-                                    skip_either_or_that=False):
-        """
-        Internal function to validate arguments passed to exposed opensource APIs and return
-        parent DataFrame, feature columns, label columns, group columns, data partition columns.
-        """
-        _validate_opensource_func_args(X=X, y=y, groups=groups,
-                                       fit_partition_cols=self._fit_partition_colums_non_default,
-                                       kwargs=kwargs,
-                                       skip_either_or_that=skip_either_or_that)
-        return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
-                                               fit_partition_cols=self._fit_partition_colums_non_default)
-
-    def _initialize_object(self):
-        """
-        Internal function to initialize sklearn object from module name and class name.
-        """
-        # Needed when writing imported modules to generated file. TODO: Remove later.
-        imported_args = {}
-        # If there are any objects of class `_SkLearnObjectWrapper`, it is modified to
-        # corresponding sklearn object.
-        new_sklearn_pos_args = self.modify_args(None, self.pos_args, imported_args)
-        new_sklearn_kwargs = self.modify_args(None, self.kwargs, imported_args)
-
-        # Create model object from new positional and keyword arguments.
-        class_obj = getattr(import_module(self.module_name), self.class_name)
-        if new_sklearn_pos_args:
-            self.modelObj = class_obj(*new_sklearn_pos_args, **new_sklearn_kwargs)
-        else:
-            self.modelObj = class_obj(**new_sklearn_kwargs)
-
-        # All arguments are moved to kwargs and pos_args is kept empty.
-        # Might help in set_params() bug fix.
-        self.pos_args = tuple()
-        _arguments = self.modelObj.__dict__
-
-        if hasattr(self.modelObj, "get_params"):
-            # Update kwargs that are both in modelObj and get_params() as there are
-            # some classes which return other internal variables also.
-            # Hence, filtering them using get_params().
-            for k, v in _arguments.items():
-                if type(v).__name__ in ["function", "generator"]:
-                    # TODO: ELE-6351: Skipping adding functions and generators to kwargs as these
-                    # are not supported yet due to pickling issue.
-                    continue
-                if k in self.get_params():
-                    self.kwargs[k] = v
-        else:
-            # Model selection classes will not have `get_params`, in which case modelObj's __dict__
-            # is saved as kwargs.
-            self.kwargs = _arguments
-
-    def _initialize_variables(self):
-        """
-        Internal function to initialize variables used in this class.
-        """
-        self.feature_names_in_ = None
-        self._table_name_prefix = "td_sklearn_"
-        self._model_file_name_prefix = _generate_new_name(type="file")
-        self.model_file_paths_local = set()
-
-        self._fit_execution_time = None
-        self._fit_predict_execution_time = None
-        self._partial_fit_execution_time = None
-        self._predict_execution_time = None
-        self._transform_execution_time = None
-        self._score_execution_time = None
-
-        # Set to partition columns when training is done with partition columns.
-        self._fit_partition_colums_non_default = None
-
-        self._is_model_installed = False
-        self._fit_partition_unique_values = [[self._default_data_partition_value]]
-
-    def modify_args(self, fp1, arg, imported_args):
-        """
-        Internal function to recursively (if "arg" is list/tuple/dict) check if any sklearn object
-        of opensourceML is present in the argument "arg" and modify it to corresponding sklearn
-        object.
-        This function can also be used to write import statements to file (if "fp1" is not
-        None). Update "imported_args" dictionary with imported module and class name to avoid
-        importing same module and class again when writing to file. This is useful when we want to
-        generate script from template file.
-        Pass None to "fp1" if we don't want to write to file and just modify opensourceML sklearn
-        object to corresponding sklearn object.
-        """
-        if isinstance(arg, type(self)):
-            imported_tuple = (arg.module_name, arg.class_name)
-            already_imported = imported_args.get(imported_tuple, False)
-            if not already_imported:
-                imported_args[imported_tuple] = True
-                if fp1:
-                    fp1.write(f"from {arg.module_name} import {arg.class_name}\n")
-            self.modify_args(fp1, arg.pos_args, imported_args)
-            self.modify_args(fp1, arg.kwargs, imported_args)
-            return arg.modelObj
-        elif isinstance(arg, list):
-            return [self.modify_args(fp1, val, imported_args) for val in arg]
-        elif isinstance(arg, tuple):
-            return tuple([self.modify_args(fp1, val, imported_args) for val in arg])
-        elif type(arg).__name__ == "generator":
-            # Raising exception as generator object can't be pickled.
-            # TODO: ELE-6351 - Find ways to pickle generator object later.
-            raise ValueError("Generator type/iterator is not supported for any argument. "\
-                             "Support will be added later.")
-        elif type(arg).__name__ == "function":
-            # Raising exception as functions/lambda functions can't be pickled.
-            # TODO: ELE-6351 - Find ways to pickle functions later.
-            raise ValueError("Functions are not supported for any argument. "\
-                             "Support will be added later.")
-        elif isinstance(arg, dict):
-            return dict(
-                (
-                    self.modify_args(fp1, k, imported_args),
-                    self.modify_args(fp1, v, imported_args),
-                )
-                for k, v in arg.items()
-            )
-        else:
-            return arg
+            >>> model
+            LinearRegression(normalize=True)

-    def _install_initial_model_file(self):
-        """
-        If model file(s) is/are not installed in Vantage, then install it/them.
+            # Example 1: Deploy the model held by interface object to Vantage.
+            >>> lin_reg = model.deploy("linreg_model_ver_2")
+            Model is saved.
+            >>> lin_reg
+            LinearRegression(normalize=True)
+
+            # Example 2: Deploy the model held by interface object to Vantage with the name same
+            # as that of model that already existed in Vantage.
+            >>> lin_reg = model.deploy("linreg_model_ver_2", replace_if_exists=True)
+            Model is deleted.
+            Model is saved.
+            >>> lin_reg
+            LinearRegression(normalize=True)
         """
-        if isinstance(self.modelObj, pd.DataFrame):
-            # Get list of unique partition values and corresponding model object as dict.
-            partition_values_model_dict = {}
-            obj_list = self.modelObj.values.tolist()
-            for lst in obj_list:
-                partition_values_model_dict[tuple(lst[:len(lst)-1])] = lst[len(lst)-1]

-        for partition in self._fit_partition_unique_values:
-            # Create a new file with file name with partition values and
-            # dump sklearn object into it. Finally install the file to Vantage.
-            partition_join = "_".join([str(x) for x in partition])
-            file_name = f"{self._model_file_name_prefix}_{partition_join}"
-            # Replace '-' with '_' as '-' can't be present in file identifier.
-            # Needed this replace because partition_columns can be negative.
-            file_name = file_name.replace("-", "_")
-            full_file_name = os.path.join(self._tdml_tmp_dir, file_name)
-            with open(full_file_name, "wb+") as fp:
-                # Write sklearn object to file.
-                if isinstance(self.modelObj, pd.DataFrame):
-                    # If multiple models, then write the model corresponding to the partition value.
-                    fp.write(pickle.dumps(partition_values_model_dict[tuple(partition)]))
-                else:
-                    fp.write(pickle.dumps(self.modelObj))
-            self.model_file_paths_local.add(file_name)
+        # Install model file into Vantage, if not installed.
+        self._install_initial_model_file()

-            self._install_script_file(file_identifier=file_name,
-                                      file_name=file_name,
-                                      is_binary=True,
-                                      file_location=self._tdml_tmp_dir)
+        self._save_model(model_name, replace_if_exists)
+        return self

-            if self._is_lake_system:
-                # Need to pass env_name along with file_name for cleaning up the files in env.
-                obj = f"{self._env.env_name}::{file_name}"
-                if installed_model_files[obj] == 0:
-                    # Add to GC for the first time the model file (along with env name) is encountered.
-                    installed_model_files[obj] = 1
-                    GarbageCollector._add_to_garbagecollector(object_name=obj,
-                                                              object_type=TeradataConstants.TERADATA_APPLY)
-            else:
-                if installed_model_files[file_name] == 0:
-                    # Add to GC for the first time the model file is encountered.
-                    installed_model_files[file_name] = 1
-                    GarbageCollector._add_to_garbagecollector(object_name=file_name,
-                                                              object_type=TeradataConstants.TERADATA_SCRIPT)

-        self._is_model_installed = True
+class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
+
+    OPENSOURCE_PACKAGE_NAME = OpenSourcePackage.SKLEARN
+
+    def __init__(self, model=None, module_name=None, class_name=None, pos_args=None, kwargs=None):
+        super().__init__(model=model, module_name=module_name, class_name=class_name,
+                         pos_args=pos_args, kwargs=kwargs)
+
+        self._initialize_variables(table_name_prefix="td_sklearn_")
+        if model is not None:
+            self.modelObj = model
+            self.module_name = model.__module__.split("._")[0]
+            self.class_name = model.__class__.__name__
+            # __dict__ gets all the arguments as dictionary including default ones and positional
+            # args.
+            self.kwargs = model.__dict__
+            self.pos_args = tuple()  # Kept empty as all are moved to kwargs.
+        else:
+            self._initialize_object()
+
+    def _validate_args_and_get_data(self, X=None, y=None, groups=None, kwargs={},
+                                    skip_either_or_that=False):
+        """
+        Internal function to validate arguments passed to exposed opensource APIs and return
+        parent DataFrame, feature columns, label columns, group columns, data partition columns.
+        """
+        _validate_opensource_func_args(X=X, y=y, groups=groups,
+                                       fit_partition_cols=self._fit_partition_colums_non_default,
+                                       kwargs=kwargs,
+                                       skip_either_or_that=skip_either_or_that)
+        return _derive_df_and_required_columns(X=X, y=y, groups=groups, kwargs=kwargs,
+                                               fit_partition_cols=self._fit_partition_colums_non_default)

     def _run_fit_related_functions(self,
                                    data,
@@ -871,7 +1202,8 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                    label_columns,
                                    partition_columns,
                                    func,
-                                   classes=None):
+                                   classes=None,
+                                   file_name="sklearn_fit.py"):
         """
         Internal function to run fit() and partial_fit() functions.
         """
@@ -886,8 +1218,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                         for col in new_partition_columns] + [("model", model_type)]

-        file_name = "sklearn_fit.py"
-
         if classes:
             class_type = type(classes[0]).__name__
             classes = "--".join([str(x) for x in classes])
@@ -913,20 +1243,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         self._model_data = self._run_script(data, script_command, new_partition_columns,
                                             return_types)

-        # Extract sklearn object(s) from the model(s) depending on the number of unique partitioning values.
-        self.extract_sklearn_obj(n_unique_partitions=len(self._fit_partition_unique_values),
-                                 n_partition_cols=len(new_partition_columns))
-
-        # Need this label columns types in prediction.
-        self._fit_label_columns_types = [data._td_column_names_and_sqlalchemy_types[l_c.lower()]
-                                         for l_c in label_columns]
-
-        # If the model is trained a second time after the object creation,
-        # or if set_params() is called after the first model training,
-        # this flag will reset to False. So that for subsequent predict/score
-        # operations, the newly trained model will be installed.
-        if self._is_trained_model_installed:
-            self._is_trained_model_installed = False
+        self._assign_fit_variables_after_execution(data, new_partition_columns, label_columns)

     def partial_fit(self, X=None, y=None, classes=None, **kwargs):
         """
@@ -974,11 +1291,19 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         self._is_default_partition_value_fit = False
         self._fit_partition_colums_non_default = partition_columns

-        self._run_fit_related_functions(data,
-                                        feature_columns,
-                                        label_columns,
-                                        partition_columns,
-                                        inspect.stack()[0][3])
+        file_name = kwargs.pop("file_name", None)
+        func_name = kwargs.pop("name", "fit")
+
+        args = {"data": data,
+                "feature_columns": feature_columns,
+                "label_columns": label_columns,
+                "partition_columns": partition_columns,
+                "func": func_name}
+
+        if file_name is not None:
+            args["file_name"] = file_name
+
+        self._run_fit_related_functions(**args)

         self._fit_execution_time = time.time() - st_time

@@ -1043,10 +1368,130 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
1043
1368
 
1044
1369
  return super().__getattr__(name)
1045
1370
 
1371
+ def _special_handling_multimodel_(self, data, feature_columns, label_columns, partition_columns,
1372
+ func_name, **kwargs):
1373
+ """
1374
+ Internal function to handle multi model case for transform function for functions
1375
+ ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV"] of feature_selection module
1376
+ and "Birch" of cluster module.
1377
+ These functions generate multiple models and when transform is applied to each model, it generates
1378
+ output with different number of columns.
1379
+ """
1380
+ skl_objs_dict = {}
1381
+ no_of_unique_partitions = len(self._fit_partition_unique_values)
1382
+ no_of_partitioning_cols = len(self._fit_partition_unique_values[0])
1383
+
1384
+ # Run on 10 rows of data individually using corresponding scikit-learn objects based on paritition value
1385
+ # and get the maximum number of columns and their types.
1386
+ for i in range(no_of_unique_partitions):
1387
+ skl_objs_dict[tuple(self.modelObj.iloc[i, :no_of_partitioning_cols])] = self.modelObj.iloc[i]["model"]
1388
+
1389
+
1390
+ data = data.select(feature_columns + label_columns + partition_columns)
1391
+ ten_row_data = data.head(10).get_values()
1392
+ X = numpy.array(ten_row_data)
1393
+
1394
+ # For multi-model case, model in one AMP can give more number of columns than other AMPs.
1395
+ # Returns clause can't contain different number of columns in different AMPs. Hence, taking
1396
+ # maximum number of columns and their types from all models.
1397
+ max_no_of_columns = 0
1398
+ max_col_names = []
1399
+ max_col_types = []
1400
+
1401
+ def _get_input_row_without_nans(row):
1402
+ """
1403
+ `inverse_transform` should not contain NaNs. Hence, removing NaNs from the row.
1404
+ """
1405
+ X1 = []
1406
+ for _, v in enumerate(row):
1407
+ if isinstance(v, type(None)) or isinstance(v, str) or not math.isnan(v) or self.module_name == "sklearn.impute":
1408
+ # Add to list when:
1409
+ # - v is None or
1410
+ # - v is string or
1411
+ # - v is not nan or
1412
+ # - if module is impute (which transforms nan values) even though v is nan.
1413
+ X1.append(v)
1414
+ else:
1415
+ # skip nan values.
1416
+ pass
1417
+ return X1
1418
+
1419
+ for i in range(X.shape[0]):
1420
+ # Run `transform` or `inverse_transform` on each row with corresponding scikit-learn model object.
1421
+ partition_values = tuple(X[i, -no_of_partitioning_cols:])
1422
+ skl_obj = skl_objs_dict[partition_values]
1423
+
1424
+ X1 = X[i, :-no_of_partitioning_cols]
1425
+ # Since Nans/NULLs are added in transform for last columns where some models generated
1426
+ # less number of columns, removing Nans/NULLs from the input row for inverse_transform
1427
+ # using function _get_input_row_without_nans().
1428
+ X1 = numpy.array([_get_input_row_without_nans(X1)])
1429
+
1430
+ trans_opt = getattr(skl_obj, func_name)(X1, **kwargs)
1431
+
1432
+ no_of_columns = 1
1433
+
1434
+ if trans_opt.shape == (X1.shape[0],):
1435
+ trans_opt = trans_opt.reshape(X1.shape[0], 1)
1436
+
1437
+ if isinstance(trans_opt[0], numpy.ndarray) \
1438
+ or isinstance(trans_opt[0], list) \
1439
+ or isinstance(trans_opt[0], tuple):
1440
+ no_of_columns = len(trans_opt[0])
1441
+
1442
+ col_names = [f"{self.class_name.lower()}_{func_name}_{(i + 1)}" for i in range(no_of_columns)]
1443
+
1444
+ # Get new column sqlalchemy types for pandas df columns of transform output.
1445
+ opt_pd = pd.DataFrame(trans_opt)
1446
+
1447
+ # Get output column types for each column in pandas df from the output of transform
1448
+ # type functions.
1449
+ types = {}
1450
+ for idx in range(no_of_columns):
1451
+ col = list(opt_pd.columns)[idx]
1452
+
1453
+ # Only one row in trans_opt.
1454
+ if isinstance(trans_opt[0], numpy.ndarray) or isinstance(trans_opt[0], tuple) or isinstance(trans_opt[0], list):
1455
+ type_ = type(trans_opt[0][idx])
1456
+ else:
1457
+ # only one value in the output.
1458
+ type_ = type(trans_opt[0])
1459
+
1460
+ # If type of the output value (trans_opt) is None, then use `str` as type since
1461
+ # pandas astype() does not accept None type.
1462
+ if type_ is type(None):
1463
+ type_ = str
1464
+
1465
+ # numpy integer columns with nan values can't be typecasted using pd.astype() to int64.
1466
+ # It raises error like "Cannot convert non-finite values (NA or inf) to integer:
1467
+ # Error while type casting for column '2'"
1468
+ # Hence, using pd.Int64Dtype() for integer columns with nan values.
1469
+ types[col] = type_ if type_ not in [int, numpy.int64] else pd.Int64Dtype()
1470
+
1471
+ # Without this, all columns will be of object type and gets converted to VARCHAR in Vantage.
1472
+ opt_pd = opt_pd.astype(types)
1473
+
1474
+ # If the datatype is not specified then check if the datatype is datetime64 and timezone is present then map it to
1475
+ # TIMESTAMP(timezone=True) else map it according to default value.
1476
+ col_types = [TIMESTAMP(timezone=True)
1477
+ if pt.is_datetime64_ns_dtype(opt_pd.dtypes[key]) and (opt_pd[col_name].dt.tz is not None)
1478
+ else _get_sqlalchemy_mapping(str(opt_pd.dtypes[key]))
1479
+ for key, col_name in enumerate(list(opt_pd.columns))]
+
+            # Different models in the multi-model case can generate different numbers of
+            # output columns (for example, SelectFpr), so the model that generates the
+            # maximum number of columns is taken.
+            if no_of_columns > max_no_of_columns:
+                max_no_of_columns = no_of_columns
+                max_col_names = col_names
+                max_col_types = col_types
+
+        return [(c_name, c_type) for c_name, c_type in zip(max_col_names, max_col_types)]
+
     def _get_return_columns_for_function_(self,
                                           data,
                                           feature_columns,
                                           label_columns,
+                                          partition_columns,
                                           func_name,
                                           kwargs):
         """
@@ -1060,7 +1505,8 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
             return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}",
                      data._td_column_names_and_sqlalchemy_types[col.lower()])
                     for i, col in enumerate(label_columns)]
-        if func_name == "predict":
+
+        if func_name == "predict" and self.OPENSOURCE_PACKAGE_NAME == OpenSourcePackage.SKLEARN:
             """
             Return predict columns using either label_columns (if provided) or
             self._fit_label_columns_types (if the function is trained using label columns).
@@ -1075,8 +1521,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
             return [(f"{self.class_name.lower()}_{func_name}_{(i + 1)}", col_type)
                     for i, col_type in enumerate(self._fit_label_columns_types)]

-        data = data.select(feature_columns + label_columns)
-
         ## If the function is not `fit_predict`:
         # take one row of input for transform/other functions and execute it on the client
         # to get the number of columns for the RETURNS clause and their Vantage types.
@@ -1090,8 +1534,20 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
             skl_obj = self.modelObj
         else:
             # Multi-model case.
+            if (func_name in ["transform", "inverse_transform"] and
+                self.class_name in ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV", "Birch"]) or \
+                    (self.module_name == "lightgbm.sklearn" and self.class_name == "LGBMClassifier"):
+                # Special handling for the multi-model case of the transform function, as these
+                # classes generate transform output with a different number of columns for each
+                # model. NULLs/NaNs therefore have to be added to the columns that are missing
+                # from the transform output of some models.
+                return self._special_handling_multimodel_(data, feature_columns, label_columns,
+                                                          partition_columns, func_name, **kwargs)
+
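
Why the special handling is needed can be seen with SelectFpr alone: models fitted on different partitions may keep different numbers of features (standalone sketch, not wrapper code):

    from sklearn.datasets import make_classification
    from sklearn.feature_selection import SelectFpr

    for seed in (0, 1):
        X, y = make_classification(n_samples=200, n_features=10, random_state=seed)
        sel = SelectFpr(alpha=0.05).fit(X, y)
        # The selected-column count differs per model, so the combined
        # multi-model output must be padded with NULLs/NaNs to a common width.
        print(sel.transform(X).shape)   # e.g. (200, 2) for one seed, (200, 4) for another
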
             skl_obj = self.modelObj.iloc[0]["model"]

+        data = data.select(feature_columns + label_columns)
+
         ten_row_data = data.head(10).get_values()
         X = numpy.array(ten_row_data)
         if label_columns:
@@ -1200,7 +1656,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return [(c_name, c_type) for c_name, c_type in zip(col_names, col_types)]

     @_validate_fit_run
-    def _run_function_needing_all_rows(self, X=None, y=None, **kwargs):
+    def _run_function_needing_all_rows(self, X=None, y=None, file_name="sklearn_score.py", **kwargs):
         """
         Internal function to run functions like score, aic and bic, which need all rows and
         return a single floating-point number as the result.
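
For contrast with the row-wise transform functions, something like `score` consumes the whole dataset and yields one float (illustrative sketch):

    import numpy as np
    from sklearn.linear_model import LinearRegression

    X = np.arange(10, dtype=float).reshape(-1, 1)
    y = 2 * X.ravel() + 1
    est = LinearRegression().fit(X, y)
    print(est.score(X, y))   # a single number (R^2 = 1.0) computed over all rows
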
@@ -1223,8 +1679,6 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                                           label_columns,
                                                           partition_columns)

-        file_name = "sklearn_score.py"
-
         script_file_path = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
@@ -1260,7 +1714,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
         return opt

     @_validate_fit_run
-    def _transform(self, X=None, y=None, **kwargs):
+    def _transform(self, X=None, y=None, file_name="sklearn_transform.py", **kwargs):
         """
         Internal function to run predict/transform and similar functions, which return
         multiple columns. This function returns the data row along with the generated
@@ -1283,18 +1737,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                                           partition_columns)

         # Since kwargs are passed to transform, remove additional unrelated arguments from kwargs.
-        if "data" in kwargs:
-            kwargs.pop("data")
-        if "feature_columns" in kwargs:
-            kwargs.pop("feature_columns")
-        if "group_columns" in kwargs:
-            kwargs.pop("group_columns")
-        if "partition_columns" in kwargs:
-            kwargs.pop("partition_columns")
-        if "label_columns" in kwargs:
-            kwargs.pop("label_columns")
-
-        file_name = "sklearn_transform.py"
+        self._remove_data_related_args_from_kwargs(kwargs)
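
The body of `_remove_data_related_args_from_kwargs` is not part of this diff; judging from the inline checks it replaces, it presumably looks something like this sketch:

    def _remove_data_related_args_from_kwargs(self, kwargs):
        # Drop teradataml data arguments so that only scikit-learn keyword
        # arguments are forwarded to the underlying function.
        for arg in ("data", "feature_columns", "group_columns",
                    "partition_columns", "label_columns"):
            kwargs.pop(arg, None)
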

         script_file_path = f"{file_name}" if self._is_lake_system \
             else f"./{self._db_name}/{file_name}"
@@ -1304,24 +1747,36 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):

         self._validate_unique_partition_values(data, new_partition_columns)

-        py_exc = UtilFuncs._get_python_execution_path()
-        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
-            f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
-            f"{self._model_file_name_prefix} {self._is_lake_system}"
+        return_columns_python_types = None
+        if self._fit_label_columns_python_types:
+            return_columns_python_types = '--'.join(self._fit_label_columns_python_types)

         # Return the feature columns along with the transformed columns, because the mapping
         # of feature columns to the transformed columns is not known.
-        return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in (new_partition_columns + feature_columns)]
+        ## 'correct_covariance()' returns a matrix of shape (n_features, n_features).
+        if func_name == "correct_covariance":
+            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                            for col in new_partition_columns]
+        else:
+            return_types = [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
+                            for col in (new_partition_columns + feature_columns)]
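
The `correct_covariance` branch exists because that estimator method returns a feature-by-feature matrix rather than one output row per input row (standalone sketch):

    from sklearn.covariance import MinCovDet
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=50, n_features=3, random_state=0)
    mcd = MinCovDet(random_state=0).fit(X)
    print(mcd.correct_covariance(X).shape)   # (3, 3): n_features x n_features
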
         if func_name in ["predict", "decision_function"] and label_columns:
             return_types += [(col, data._td_column_names_and_sqlalchemy_types[col.lower()])
                              for col in label_columns]

-        return_types += self._get_return_columns_for_function_(data,
-                                                               feature_columns,
-                                                               label_columns,
-                                                               func_name,
-                                                               kwargs)
+        output_cols_types = self._get_return_columns_for_function_(data,
+                                                                   feature_columns,
+                                                                   label_columns,
+                                                                   new_partition_columns,
+                                                                   func_name,
+                                                                   kwargs)
+        return_types += output_cols_types
+
+        py_exc = UtilFuncs._get_python_execution_path()
+        script_command = f"{py_exc} {script_file_path} {func_name} {len(feature_columns)} "\
+            f"{len(label_columns)} {partition_indices_str} {data_column_types_str} "\
+            f"{self._model_file_name_prefix} {len(output_cols_types)} {self._is_lake_system} " \
+            f"{return_columns_python_types}"

         # Check the trained model installation. If it is not installed,
         # install it and set the flag to True.
@@ -1363,6 +1818,7 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
             return_types += self._get_return_columns_for_function_(data,
                                                                    feature_columns,
                                                                    label_columns,
+                                                                   new_partition_columns,
                                                                    func_name,
                                                                    {})
         else:
@@ -1448,14 +1904,10 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):
                                      skip_either_or_that=True)

         # Remove the data-related kwargs.
-        input_data = kwargs.pop("data", None)
-        partition_cols = kwargs.pop("partition_columns", None)
-        feature_cols = kwargs.pop("feature_columns", None)
-        label_cols = kwargs.pop("label_columns", None)
+        self._remove_data_related_args_from_kwargs(kwargs)

         if partition_columns:
             # kwargs are passed to the kneighbors function, so remove them from kwargs.
-            kwargs.pop("partition_columns")
             self._is_default_partition_value_fit = False

         # Generate a new partition column name.
@@ -1640,161 +2092,69 @@ class _SkLearnObjectWrapper(_OpenSourceObjectWrapper):

         return opt

-    def _get_returning_df(self, script_df, partition_column, returns):
-        """
-        Internal function to return the teradataml DataFrame without the
-        partition_column.
-        """
-        if self._is_default_partition_value_fit:
-            # For the single-model case, the partition column is internally generated,
-            # so there is no point in returning it to the user.
-
-            # Extract columns from the return types.
-            returning_cols = [col[0] for col in returns[len(partition_column):]]
-            return script_df.select(returning_cols)
-        return script_df

-
-class _SKLearnFunctionWrapper(_GenericObjectWrapper):
-    def __init__(self, module_name, func_name):
+class _FunctionWrapper(_GenericObjectWrapper):
+    def __init__(self, module_name, func_name, file_type, template_file):
         super().__init__()
-        self.__module_name = module_name
-        self.__func_name = func_name
-        self.__params = None
-        self.__data_args = OrderedDict()
-        self._model_file_name = _generate_new_name(type="file_function", extension="py")
+        self._module_name = module_name
+        self._func_name = func_name
+        self._params = None
+        self._data_args = OrderedDict()
+        self._template_file = template_file
+        self._script_file_name = _generate_new_name(type=file_type, extension="py")

     def __call__(self, **kwargs):
         """
         Run the function with all the arguments passed from the `td_sklearn.<function_name>` function.
         """
-        __data_columns = []
-
-        partition_cols = self._get_columns_as_list(kwargs.get("partition_columns", None))
-        if partition_cols:
-            kwargs.pop("partition_columns")
-
-        # Separate dataframe-related arguments and their column names from the actual kwargs.
-        for k, v in kwargs.items():
-            if isinstance(v, DataFrame):
-                # All dataframes should be a select of the parent dataframe.
-                _validate_df_query_type(v, "select", k)
-
-                # Save all columns of the dataframe-related arguments.
-                __data_columns.extend(v.columns)
-
-                self.__data_args[k] = v
-
-        # Get the common parent dataframe from all dataframes.
-        self.__tdml_df = DataFrameUtils()._get_common_parent_df_from_dataframes(list(self.__data_args.values()))
+        replace_dict, partition_cols = self._process_data_for_funcs_returning_objects(kwargs)

-        self._validate_existence_of_partition_columns(partition_cols, self.__tdml_df.columns)
-
-        self.__tdml_df = self.__tdml_df.select(__data_columns + partition_cols)
-
-        self.__tdml_df, partition_cols = self._get_data_and_data_partition_columns(self.__tdml_df,
-                                                                                   __data_columns,
-                                                                                   [],
-                                                                                   partition_cols)
-
-        # Prepare a string of data arguments with the name, the indices where the columns of
-        # that argument reside, and the types of each of the columns.
-        data_args_str = self._prepare_data_args_string(kwargs)
-
-        self.__params = kwargs
-
-        # Get the indices of partition_columns and the types of all columns.
-        data_column_types_str, partition_indices_str, _, partition_cols = \
-            self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df, partition_cols)
-
-        script_file_path = f"{self._model_file_name}" if self._is_lake_system \
-            else f"./{self._db_name}/{self._model_file_name}"
+        script_file_path = f"{self._script_file_name}" if self._is_lake_system \
+            else f"./{self._db_name}/{self._script_file_name}"

         model_file_prefix = None
         if self._is_lake_system:
-            model_file_prefix = self._model_file_name.replace(".py", "")
+            model_file_prefix = self._script_file_name.replace(".py", "")

         py_exc = UtilFuncs._get_python_execution_path()
-        script_command = (f"{py_exc} {script_file_path} {partition_indices_str} "\
-                          f"{data_column_types_str} {data_args_str} {self._is_lake_system}"\
-                          f" {model_file_prefix}")
+        script_command = f"{py_exc} {script_file_path} {model_file_prefix} {self._is_lake_system}"

         model_type = BLOB() if self._is_lake_system else CLOB()
-        return_types = [(col, self.__tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
-                        for col in partition_cols] + [(self.__func_name, model_type)]
-
-        # Generate a new file in the .teradataml directory and install it in Vantage.
-        self._prepare_and_install_file()
-
-        self._model_data = self._run_script(self.__tdml_df, script_command, partition_cols, return_types)
-        self._model_data._index_label = None

-        fit_partition_unique_values = self.__tdml_df.drop_duplicate(partition_cols).get_values()
+        return_types = [(col, self._tdml_df._td_column_names_and_sqlalchemy_types[col.lower()])
+                        for col in partition_cols] + [(self._func_name, model_type)]

-        self.extract_sklearn_obj(n_unique_partitions=len(fit_partition_unique_values),
-                                 n_partition_cols=len(partition_cols))
-
-        # File cleanup after processing.
-        os.remove(self._model_file_local)
-        self._remove_script_file(self._model_file_name)
+        replace_dict.update({"<module_name>": self._module_name,
+                             "<func_name>": self._func_name,
+                             "<params>": json.dumps(kwargs)})

-        return self.modelObj
+        # Generate a new file in the .teradataml directory and install it in Vantage.
+        self._prepare_and_install_file(replace_dict=replace_dict)

-    def _prepare_data_args_string(self, kwargs):
-        """
-        Get the column indices and types of each data-related argument in the format:
-        "{<arg_name>-<comma separated indices>-<comma separated types>}--
-         {<arg_name>-<comma separated indices>-<comma separated types>}"
-        """
-        data_args_str = []
-        for arg_name in list(self.__data_args.keys()):
-            # Remove DataFrame arguments from kwargs, which will be passed to Script.
-            kwargs.pop(arg_name)
+        try:
+            self._model_data = self._run_script(self._tdml_df, script_command, partition_cols, return_types)
+            self._model_data._index_label = None

-            # Get the column indices and their types for each dataframe from the parent dataframe.
-            _, partition_indices_str, partition_types_str, _ = \
-                self._get_data_col_types_and_partition_col_indices_and_types(self.__tdml_df,
-                                                                             self.__data_args[arg_name].columns,
-                                                                             idx_delim=",",
-                                                                             types_delim=",")
-
-            # Format: "<arg_name>-<comma separated indices>-<comma separated types>"
-            data_args_str.append(f"{arg_name}-{partition_indices_str}-{partition_types_str}")
-
-        # Format: "{<arg_name>-<comma separated indices>-<comma separated types>}--
-        #          {<arg_name>-<comma separated indices>-<comma separated types>}"
-        return "--".join(data_args_str)
+            fit_partition_unique_values = self._tdml_df.drop_duplicate(partition_cols).get_values()

-    def _validate_existence_of_partition_columns(self, partition_columns, all_columns):
-        """
-        Validate that the columns in the "partition_columns" argument are present in one of
-        the given dataframes.
-        """
-        invalid_part_cols = [c for c in partition_columns if c not in all_columns]
+            self._extract_model_objs(n_unique_partitions=len(fit_partition_unique_values),
+                                     n_partition_cols=len(partition_cols))

-        if invalid_part_cols:
-            raise ValueError(Messages.get_message(MessageCodes.INVALID_PARTITIONING_COLS,
-                                                  ", ".join(invalid_part_cols),
-                                                  "', '".join(list(self.__data_args.keys()))))
+        except Exception:
+            # File cleanup if the script execution fails or the model object cannot be fetched.
+            os.remove(self._script_file_local)
+            self._remove_script_file(self._script_file_name)
+            raise

-    def _prepare_and_install_file(self):
-        """
-        Prepare the function script file from the template file and install it in Vantage.
-        """
-        with open(os.path.join(self._scripts_path, "sklearn_function.template")) as fp:
-            script_data = fp.read()
-        script_data = script_data.replace("<module_name>", self.__module_name).\
-            replace("<func_name>", self.__func_name).replace("<params>", json.dumps(self.__params))
+        # File cleanup after processing.
+        os.remove(self._script_file_local)
+        self._remove_script_file(self._script_file_name)

-        self._model_file_local = os.path.join(self._tdml_tmp_dir, self._model_file_name)
+        return self.modelObj
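
The cleanup flow above boils down to a remove-on-error, remove-on-success pattern; a generic, self-contained sketch:

    import os
    import tempfile

    def run_with_cleanup(action):
        fd, path = tempfile.mkstemp(suffix=".py")
        os.close(fd)
        try:
            result = action(path)   # e.g. run the installed script
        except Exception:
            os.remove(path)         # clean up, then surface the error
            raise
        os.remove(path)             # normal-path cleanup
        return result

    print(run_with_cleanup(os.path.basename))
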

-        with open(self._model_file_local, "w") as fp:
-            fp.write(script_data)
-
-        self._install_script_file(file_identifier=self._model_file_name.split(".")[0],
-                                  file_name=self._model_file_name,
-                                  file_location=self._tdml_tmp_dir)


+class _SKLearnFunctionWrapper(_FunctionWrapper):
+    def __init__(self, module_name, func_name):
+        file_type = "file_fn_sklearn"
+        template_file = "sklearn_function.template"
+        super().__init__(module_name, func_name, file_type=file_type, template_file=template_file)