teradataml-20.0.0.2-py3-none-any.whl → teradataml-20.0.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml might be problematic.

Files changed (88)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +196 -2
  3. teradataml/__init__.py +4 -0
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +79 -4
  6. teradataml/analytics/json_parser/metadata.py +12 -3
  7. teradataml/analytics/json_parser/utils.py +7 -2
  8. teradataml/analytics/sqle/__init__.py +1 -0
  9. teradataml/analytics/table_operator/__init__.py +1 -1
  10. teradataml/analytics/uaf/__init__.py +1 -1
  11. teradataml/analytics/utils.py +4 -0
  12. teradataml/automl/data_preparation.py +3 -2
  13. teradataml/automl/feature_engineering.py +15 -7
  14. teradataml/automl/model_training.py +39 -33
  15. teradataml/common/__init__.py +2 -1
  16. teradataml/common/constants.py +35 -0
  17. teradataml/common/garbagecollector.py +2 -1
  18. teradataml/common/messagecodes.py +8 -2
  19. teradataml/common/messages.py +3 -1
  20. teradataml/common/sqlbundle.py +25 -3
  21. teradataml/common/utils.py +134 -9
  22. teradataml/context/context.py +20 -10
  23. teradataml/data/SQL_Fundamentals.pdf +0 -0
  24. teradataml/data/dataframe_example.json +18 -2
  25. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
  26. teradataml/data/docs/sqle/docs_17_20/Shap.py +7 -1
  27. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
  28. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  29. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  30. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  31. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  32. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  33. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  34. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  35. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  36. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  37. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  38. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  39. teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
  40. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
  41. teradataml/data/medical_readings.csv +101 -0
  42. teradataml/data/patient_profile.csv +101 -0
  43. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  44. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  45. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  46. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  47. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
  48. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  49. teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
  50. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  51. teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
  52. teradataml/data/target_udt_data.csv +8 -0
  53. teradataml/data/templates/open_source_ml.json +3 -2
  54. teradataml/data/vectordistance_example.json +4 -0
  55. teradataml/dataframe/dataframe.py +543 -175
  56. teradataml/dataframe/functions.py +553 -25
  57. teradataml/dataframe/sql.py +184 -15
  58. teradataml/dbutils/dbutils.py +556 -18
  59. teradataml/dbutils/filemgr.py +48 -1
  60. teradataml/lib/aed_0_1.dll +0 -0
  61. teradataml/opensource/__init__.py +1 -1
  62. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  63. teradataml/opensource/_lightgbm.py +950 -0
  64. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  65. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  66. teradataml/opensource/sklearn/__init__.py +0 -1
  67. teradataml/opensource/sklearn/_sklearn_wrapper.py +798 -438
  68. teradataml/options/__init__.py +7 -23
  69. teradataml/options/configure.py +29 -3
  70. teradataml/scriptmgmt/UserEnv.py +3 -3
  71. teradataml/scriptmgmt/lls_utils.py +74 -21
  72. teradataml/store/__init__.py +13 -0
  73. teradataml/store/feature_store/__init__.py +0 -0
  74. teradataml/store/feature_store/constants.py +291 -0
  75. teradataml/store/feature_store/feature_store.py +2223 -0
  76. teradataml/store/feature_store/models.py +1505 -0
  77. teradataml/store/vector_store/__init__.py +1586 -0
  78. teradataml/table_operators/query_generator.py +3 -0
  79. teradataml/table_operators/table_operator_query_generator.py +3 -1
  80. teradataml/table_operators/table_operator_util.py +37 -38
  81. teradataml/table_operators/templates/dataframe_register.template +69 -0
  82. teradataml/utils/dtypes.py +4 -2
  83. teradataml/utils/validators.py +33 -1
  84. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +200 -5
  85. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +88 -65
  86. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  87. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  88. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
@@ -28,28 +28,22 @@ def splitter(strr, delim=",", convert_to="str"):
  return [convert_to_type(i, convert_to) for i in strr.split(delim)]

  # Arguments to the Script.
- if len(sys.argv) != 6:
- # 5 arguments command line arguments should be passed to this file.
+ if len(sys.argv) != 3:
+ # 3 command line arguments should be passed to this file.
  # 1: file to be run
- # 2. Comma separated indices of partition columns.
- # 3. Comma separated types of all the data columns.
- # 4. Data columns information separted by "--" where each data column information is in the form
- # "<arg_name>-<comma separated data indices>-<comma separated data types>".
- # 5. Flag to check the system type. True, means Lake, Enterprise otherwise.
- # 6. Model file prefix for lake system, None otherwise.
- sys.exit("5 arguments command line arguments should be passed: file to be run,"
- " comma separated indices of partition columns, comma separated types of all columns,"
- " data columns information separated by '--' where each data column information is"
- " in the form '<arg_name>-<comma separated data indices>-<comma separated data types>',"
- " flag to check lake or enterprise and model file prefix used only for lake system.")
-
- is_lake_system = eval(sys.argv[4])
+ # 2. Model file prefix for lake system, None otherwise.
+ # 3. Flag to check the system type. True, means Lake, Enterprise otherwise.
+ sys.exit("3 arguments command line arguments should be passed: file to be run,"
+ " model file prefix used only for lake system and flag to check lake or enterprise.")
+
+ is_lake_system = eval(sys.argv[2])
  if not is_lake_system:
  db = sys.argv[0].split("/")[1]
  else:
- model_file_prefix = sys.argv[5]
- data_partition_column_indices = splitter(sys.argv[1], convert_to="int") # indices are integers.
- data_column_types = splitter(sys.argv[2], delim="--")
+ model_file_prefix = sys.argv[1]
+
+ data_partition_column_indices = <partition_cols_indices>
+ data_column_types = <types_of_data_cols>

  data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]

@@ -59,7 +53,8 @@ data_args_indices_types = OrderedDict()
  # Data related arguments values - prepare dictionary and populate data later.
  data_args_values = {}

- for data_arg in sys.argv[3].split("--"):
+ data_args_info_str = <data_args_info_str>
+ for data_arg in data_args_info_str.split("--"):
  arg_name, indices, types = data_arg.split("-")
  indices = splitter(indices, convert_to="int")
  types = splitter(types)
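
The two hunks above appear to come from a script template: the 20.0.0.3 release drops most of the command-line arguments and instead bakes the partition-column indices, column types, and data-column information into the script through the <partition_cols_indices>, <types_of_data_cols> and <data_args_info_str> placeholders. As a rough illustration of what that substitution step could look like, a minimal sketch follows; the render_template() helper and the sample values are assumptions for the sketch, not teradataml's actual internals:

    # Hypothetical sketch of filling the placeholders before the script is deployed.
    # render_template() and the example values are illustrative assumptions; the real
    # substitution happens inside teradataml and is not part of this diff.
    def render_template(template_text, partition_cols_indices, types_of_data_cols, data_args_info_str):
        replacements = {
            "<partition_cols_indices>": repr(partition_cols_indices),   # e.g. "[0, 1]"
            "<types_of_data_cols>": repr(types_of_data_cols),           # e.g. "['int', 'float', 'str']"
            "<data_args_info_str>": repr(data_args_info_str),           # e.g. "'data-0,1-int,float'"
        }
        for placeholder, value in replacements.items():
            template_text = template_text.replace(placeholder, value)
        return template_text
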
@@ -3,146 +3,164 @@ import math
  import sys
  import numpy as np
  import base64
+ from contextlib import contextmanager
+ import os

  DELIMITER = '\t'

- def get_values_list(values, types):
- ret_vals = []
- for i, val in enumerate(values):
- ret_vals.append(convert_to_type(val, types[i]))
- return ret_vals
-
- def convert_to_type(val, typee):
- if typee == 'int':
- return int(val) if val != "" else np.nan
- if typee == 'float':
- if isinstance(val, str):
- val = val.replace(' ', '')
- return float(val) if val != "" else np.nan
- if typee == 'bool':
- return eval(val) if val != "" else None
- return str(val) if val != "" else None
-
- def splitter(strr, delim=",", convert_to="str"):
+ @contextmanager
+ def suppress_stderr():
  """
- Split the string based on delimiter and convert to the type specified.
+ Function to suppress the warnings(lake systems treats warnings as errors).
  """
- if strr == "None":
- return []
- return [convert_to_type(i, convert_to) for i in strr.split(delim)]
-
-
- # Arguments to the Script
- if len(sys.argv) != 9:
- # 9 arguments command line arguments should be passed to this file.
- # 1: file to be run
- # 2. function name
- # 3. No of feature columns.
- # 4. No of class labels.
- # 5. No of group columns.
- # 6. Comma separated indices of partition columns.
- # 7. Comma separated types of all the data columns.
- # 8. Model file prefix to generated model file using partition columns.
- # 9. Flag to check the system type. True, means Lake, Enterprise otherwise.
- sys.exit("9 arguments command line arguments should be passed: file to be run,"
- " function name, no of feature columns, no of class labels, no of group columns,"
- " comma separated indices of partition columns, comma separated types of all columns,"
- " model file prefix to generated model file using partition columns and flag to check"
- " lake or enterprise.")
-
-
- is_lake_system = eval(sys.argv[8])
- if not is_lake_system:
- db = sys.argv[0].split("/")[1]
- function_name = sys.argv[1]
- n_f_cols = int(sys.argv[2])
- n_c_labels = int(sys.argv[3])
- n_g_cols = int(sys.argv[4])
- data_column_types = splitter(sys.argv[6], delim="--")
- data_partition_column_indices = splitter(sys.argv[5], convert_to="int") # indices are integers.
- model_file_prefix = sys.argv[7]
-
- data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
-
- model = None
- data_partition_column_values = []
-
- # Data Format (n_features, k_labels, one data_partition_column):
- # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
- # data_partition_columnn.
- # labels are optional.
-
- features = []
- labels = []
- groups = []
- while 1:
- try:
- line = input()
- if line == '': # Exit if user provides blank line
+ with open(os.devnull, "w") as devnull:
+ old_stderr = sys.stderr
+ sys.stderr = devnull
+ try:
+ yield
+ finally:
+ sys.stderr = old_stderr
+
+ ## On Lake system warnings raised by script are treated as a errors.
+ ## Hence, to suppress it putting the under suppress_stderr().
+ with suppress_stderr():
+ def get_values_list(values, types):
+ ret_vals = []
+ for i, val in enumerate(values):
+ ret_vals.append(convert_to_type(val, types[i]))
+ return ret_vals
+
+ def convert_to_type(val, typee):
+ if typee == 'int':
+ return int(val) if val != "" else np.nan
+ if typee == 'float':
+ if isinstance(val, str):
+ val = val.replace(' ', '')
+ return float(val) if val != "" else np.nan
+ if typee == 'bool':
+ return eval(val) if val != "" else None
+ return str(val) if val != "" else None
+
+ def splitter(strr, delim=",", convert_to="str"):
+ """
+ Split the string based on delimiter and convert to the type specified.
+ """
+ if strr == "None":
+ return []
+ return [convert_to_type(i, convert_to) for i in strr.split(delim)]
+
+
+ # Arguments to the Script
+ if len(sys.argv) != 9:
+ # 9 arguments command line arguments should be passed to this file.
+ # 1: file to be run
+ # 2. function name
+ # 3. No of feature columns.
+ # 4. No of class labels.
+ # 5. No of group columns.
+ # 6. Comma separated indices of partition columns.
+ # 7. Comma separated types of all the data columns.
+ # 8. Model file prefix to generated model file using partition columns.
+ # 9. Flag to check the system type. True, means Lake, Enterprise otherwise.
+ sys.exit("9 arguments command line arguments should be passed: file to be run,"
+ " function name, no of feature columns, no of class labels, no of group columns,"
+ " comma separated indices of partition columns, comma separated types of all columns,"
+ " model file prefix to generated model file using partition columns and flag to check"
+ " lake or enterprise.")
+
+
+ is_lake_system = eval(sys.argv[8])
+ if not is_lake_system:
+ db = sys.argv[0].split("/")[1]
+ function_name = sys.argv[1]
+ n_f_cols = int(sys.argv[2])
+ n_c_labels = int(sys.argv[3])
+ n_g_cols = int(sys.argv[4])
+ data_column_types = splitter(sys.argv[6], delim="--")
+ data_partition_column_indices = splitter(sys.argv[5], convert_to="int") # indices are integers.
+ model_file_prefix = sys.argv[7]
+
+ data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
+
+ model = None
+ data_partition_column_values = []
+
+ # Data Format (n_features, k_labels, one data_partition_column):
+ # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
+ # data_partition_columnn.
+ # labels are optional.
+
+ features = []
+ labels = []
+ groups = []
+ while 1:
+ try:
+ line = input()
+ if line == '': # Exit if user provides blank line
+ break
+ else:
+ values = line.split(DELIMITER)
+ values = get_values_list(values, data_column_types)
+ if not data_partition_column_values:
+ # Partition column values is same for all rows. Hence, only read once.
+ for i, val in enumerate(data_partition_column_indices):
+ data_partition_column_values.append(
+ convert_to_type(values[val], typee=data_partition_column_types[i])
+ )
+
+ # Prepare the corresponding model file name and extract model.
+ partition_join = "_".join([str(x) for x in data_partition_column_values])
+ # Replace '-' with '_' as '-' because partition_columns can be negative.
+ partition_join = partition_join.replace("-", "_")
+
+ model_file_path = f"{model_file_prefix}_{partition_join}" \
+ if is_lake_system else \
+ f"./{db}/{model_file_prefix}_{partition_join}"
+
+ with open(model_file_path, "rb") as fp:
+ model = pickle.loads(fp.read())
+
+ if not model:
+ sys.exit("Model file is not installed in Vantage.")
+
+ start = 0
+ if n_f_cols > 0:
+ features.append(values[:n_f_cols])
+ start = start + n_f_cols
+ if n_c_labels > 0:
+ labels.append(values[start:(start+n_c_labels)])
+ start = start + n_c_labels
+ if n_g_cols > 0:
+ groups.append(values[start:(start+n_g_cols)])
+
+ except EOFError: # Exit if reached EOF or CTRL-D
  break
- else:
- values = line.split(DELIMITER)
- values = get_values_list(values, data_column_types)
- if not data_partition_column_values:
- # Partition column values is same for all rows. Hence, only read once.
- for i, val in enumerate(data_partition_column_indices):
- data_partition_column_values.append(
- convert_to_type(values[val], typee=data_partition_column_types[i])
- )
-
- # Prepare the corresponding model file name and extract model.
- partition_join = "_".join([str(x) for x in data_partition_column_values])
- # Replace '-' with '_' as '-' because partition_columns can be negative.
- partition_join = partition_join.replace("-", "_")
-
- model_file_path = f"{model_file_prefix}_{partition_join}" \
- if is_lake_system else \
- f"./{db}/{model_file_prefix}_{partition_join}"
-
- with open(model_file_path, "rb") as fp:
- model = pickle.loads(fp.read())
-
- if not model:
- sys.exit("Model file is not installed in Vantage.")
-
- start = 0
- if n_f_cols > 0:
- features.append(values[:n_f_cols])
- start = start + n_f_cols
- if n_c_labels > 0:
- labels.append(values[start:(start+n_c_labels)])
- start = start + n_c_labels
- if n_g_cols > 0:
- groups.append(values[start:(start+n_g_cols)])
-
- except EOFError: # Exit if reached EOF or CTRL-D
- break
-
- if len(features) == 0:
- sys.exit(0)
-
- features = np.array(features) if len(features) > 0 else None
- labels = np.array(labels).flatten() if len(labels) > 0 else None
- groups = np.array(groups).flatten() if len(groups) > 0 else None
-
- if function_name == "split":
- # Printing both train and test data instead of just indices unlike sklearn.
- # Generator is created based on split_id and type of split (train/test) in client.
- split_id = 1
- for train_idx, test_idx in model.split(features, labels, groups):
- X_train, X_test = features[train_idx], features[test_idx]
- y_train, y_test = labels[train_idx], labels[test_idx]
- for X, y in zip(X_train, y_train):
- print(*(data_partition_column_values + [split_id, "train"] +
- ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
- for val in X] + [y]
- ), sep=DELIMITER)
- for X, y in zip(X_test, y_test):
- print(*(data_partition_column_values + [split_id, "test"] +
- ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
- for val in X] + [y]
- ), sep=DELIMITER)
- split_id += 1
- else:
- val = getattr(model, function_name)(features, labels, groups)
- print(*(data_partition_column_values + [val]), sep=DELIMITER)
+
+ if len(features) == 0:
+ sys.exit(0)
+
+ features = np.array(features) if len(features) > 0 else None
+ labels = np.array(labels).flatten() if len(labels) > 0 else None
+ groups = np.array(groups).flatten() if len(groups) > 0 else None
+
+ if function_name == "split":
+ # Printing both train and test data instead of just indices unlike sklearn.
+ # Generator is created based on split_id and type of split (train/test) in client.
+ split_id = 1
+ for train_idx, test_idx in model.split(features, labels, groups):
+ X_train, X_test = features[train_idx], features[test_idx]
+ y_train, y_test = labels[train_idx], labels[test_idx]
+ for X, y in zip(X_train, y_train):
+ print(*(data_partition_column_values + [split_id, "train"] +
+ ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
+ for val in X] + [y]
+ ), sep=DELIMITER)
+ for X, y in zip(X_test, y_test):
+ print(*(data_partition_column_values + [split_id, "test"] +
+ ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
+ for val in X] + [y]
+ ), sep=DELIMITER)
+ split_id += 1
+ else:
+ val = getattr(model, function_name)(features, labels, groups)
+ print(*(data_partition_column_values + [val]), sep=DELIMITER)
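
The net effect of the hunk above is that the whole body of the script now runs inside the new suppress_stderr() context manager, because warnings written to stderr are treated as errors on Lake systems. The helper is the standard "point sys.stderr at os.devnull" pattern; a self-contained sketch of the same idea using contextlib.redirect_stderr from the standard library (illustrative only, the shipped script keeps its own hand-rolled manager):

    # Sketch of the stderr-suppression pattern, built on the standard library's
    # contextlib.redirect_stderr instead of the hand-written context manager above.
    import os
    import warnings
    from contextlib import redirect_stderr

    with open(os.devnull, "w") as devnull, redirect_stderr(devnull):
        # Anything written to sys.stderr in this block (for example, warnings
        # emitted by scikit-learn) is discarded rather than reaching the caller.
        warnings.warn("this warning is swallowed")
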
@@ -31,39 +31,83 @@ def splitter(strr, delim=",", convert_to="str"):
  return []
  return [convert_to_type(i, convert_to) for i in strr.split(delim)]

+ def should_convert(t_val, py_type):
+ """
+ Function to check type of value and whether value is nan and infinity.
+ """
+ return not isinstance(t_val, eval(py_type)) and not math.isinf(t_val) and not math.isnan(t_val)
+
+ def convert_value(t_val, py_type):
+ """
+ Function to convert value to specified python type.
+ """
+ return convert_to_type(t_val, py_type) if should_convert(t_val, py_type) else t_val
+
  # Process output returned by sklearn function.
- def get_output_data(trans_values, func_name, model_obj, n_c_labels):
- # Converting sparse matrix to dense array as sparse matrices are NOT
+ def get_output_data(trans_values, func_name, model_obj, n_c_labels, n_out_columns):
+ # Converting sparse matrix to dense array as sparse matrices are NOT
  # supported in Vantage.
  module_name = model_obj.__module__.split("._")[0]

- if type(trans_values).__name__ in ["csr_matrix", "csc_matrix"]:
- trans_values = trans_values.toarray()
+ # Converting the translated values into corresponding the return column's
+ # python type.
+ if (func_name == "decision_path" or return_columns_python_types is None \
+ or not isinstance(trans_values, np.ndarray)):
+ trans_values_list = trans_values
+ else:
+ # Conversion.....
+ trans_values_list = []
+ for trans_value in trans_values.tolist():
+ if not isinstance(trans_value, list):
+ trans_value = [trans_value]
+
+ converted_list = []
+ if len(return_columns_python_types) == len(trans_value):
+ for t_val, py_type in zip(trans_value, return_columns_python_types):
+ converted_list.append(convert_value(t_val, py_type))
+ ## transform() is having only 1 python return type, But it actually returns more than 1 column
+ else:
+ for t_val in trans_value:
+ converted_list.append(convert_value(t_val, "".join(return_columns_python_types)))
+
+ trans_values_list.append(converted_list)
+
+ if type(trans_values_list).__name__ in ["csr_matrix", "csc_matrix"]:
+ trans_values_list = trans_values_list.toarray()

  if module_name == "sklearn.cross_decomposition" and n_c_labels > 0 and func_name == "transform":
  # For cross_decomposition, output is a tuple of arrays when label columns are provided
  # along with feature columns for transform function. In this case, concatenate the
  # arrays and return the combined values.
- if isinstance(trans_values, tuple):
- return np.concatenate(trans_values, axis=1).tolist()[0]
+ if isinstance(trans_values_list, tuple):
+ return np.concatenate(trans_values_list, axis=1).tolist()[0]

- if isinstance(trans_values[0], np.ndarray) \
- or isinstance(trans_values[0], list) \
- or isinstance(trans_values[0], tuple):
+ if isinstance(trans_values_list[0], np.ndarray) \
+ or isinstance(trans_values_list[0], list) \
+ or isinstance(trans_values_list[0], tuple):
  # Here, the value returned by sklearn function is list type.
- opt_list = list(trans_values[0])
+ opt_list = list(trans_values_list[0])
+
+ if len(opt_list) < n_out_columns:
+ # If the output list is less than the required number of columns, append
+ # empty strings to the list.
+ opt_list += [""] * (n_out_columns - len(opt_list))
+
  if func_name == "inverse_transform" and type(model_obj).__name__ == "MultiLabelBinarizer":
  # output array "trans_values[0]" may not be of same size. It should be of
  # maximum size of `model.classes_`
  # Append None to last elements.
  if len(opt_list) < len(model_obj.classes_):
  opt_list += [""] * (len(model_obj.classes_) - len(opt_list))
+
  return opt_list
- return [trans_values[0]]
+
+ # Only one element is returned by the function.
+ return [trans_values_list[0]]

  # Arguments to the Script
- if len(sys.argv) != 8:
- # 8 arguments command line arguments should be passed to this file.
+ if len(sys.argv) != 10:
+ # 10 arguments command line arguments should be passed to this file.
  # 1: file to be run
  # 2. function name (Eg. predict, fit etc)
  # 3. No of feature columns.
@@ -71,13 +115,17 @@ if len(sys.argv) != 8:
  # 5. Comma separated indices of partition columns.
  # 6. Comma separated types of all the data columns.
  # 7. Model file prefix to generated model file using partition columns.
- # 8. Flag to check the system type. True, means Lake, Enterprise otherwise.
- sys.exit("8 arguments should be passed to this file - file to be run, function name, "\
- "no of feature columns, no of class labels, comma separated indices of partition "\
- "columns, comma separated types of all columns, model file prefix to generate model "\
- "file using partition columns and flag to check lake or enterprise.")
-
- is_lake_system = eval(sys.argv[7])
+ # 8. Number of columns to be returned by the sklearn's transform function.
+ # 9. Flag to check the system type. True, means Lake, Enterprise otherwise.
+ # 10. Python types of returned/transfromed columns.
+ sys.exit("10 arguments should be passed to this file - file to be run, function name, "\
+ "no of feature columns, no of class labels, comma separated indices of partition "\
+ "columns, comma separated types of all columns, model file prefix to generate model "\
+ "file using partition columns, number of columns to be returnd by sklearn's "\
+ "transform function, flag to check lake or enterprise and Python types of "\
+ "returned/transfromed columns.")
+
+ is_lake_system = eval(sys.argv[8])
  if not is_lake_system:
  db = sys.argv[0].split("/")[1]
  func_name = sys.argv[1]
@@ -86,13 +134,22 @@ n_c_labels = int(sys.argv[3])
  data_column_types = splitter(sys.argv[5], delim="--")
  data_partition_column_indices = splitter(sys.argv[4], convert_to="int") # indices are integers.
  model_file_prefix = sys.argv[6]
+ # sys.argv[9] will contain a string of python datatypes with '--'
+ # separator OR a single datatype OR None in string format.
+ ret_col_argv = sys.argv[9]
+ if ret_col_argv == "None":
+ return_columns_python_types = eval(ret_col_argv)
+ else:
+ return_columns_python_types = splitter(ret_col_argv, delim="--")
+
+ no_of_output_columns = int(sys.argv[7])

  data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]

  model = None
  data_partition_column_values = []

- missing_indicator_input = []
+ all_rows_input = []

  # Data Format:
  # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
@@ -134,30 +191,45 @@ while 1:

  model_name = model.__class__.__name__
  np_func_list = ["ClassifierChain", "EllipticEnvelope", "MinCovDet",
- "FeatureAgglomeration", "LabelBinarizer", "MultiLabelBinarizer"]
+ "FeatureAgglomeration", "LabelBinarizer", "MultiLabelBinarizer",
+ "BernoulliRBM"]

- # MissingIndicator requires processing the entire dataset simultaneously,
- # rather than on a row-by-row basis.
+ # MissingIndicator's transform() and SimpleImputer's inverse_transform() requires processing
+ # the entire dataset simultaneously, rather than on a row-by-row basis.

- # Error getting during row-by-row processing -
+ # Error getting during row-by-row processing of MissingIndicator -
  # "ValueError: MissingIndicator does not support data with dtype <U13.
  # Please provide either a numeric array (with a floating point or
- i# integer dtype) or categorical data represented ei
- if model_name == "MissingIndicator" and func_name == "transform":
- missing_indicator_input.append(f_)
+ # integer dtype) or categorical data represented ei
+
+ # Error getting during row-by-row processing of SimpleImputer -
+ # "IndexError: index 3 is out of bounds for axis 1 with size 3".
+ if ((model_name == "MissingIndicator" and func_name == "transform") or \
+ (model_name == "SimpleImputer" and func_name == "inverse_transform") or \
+ (model_name in ["EllipticEnvelope", "MinCovDet"]
+ and func_name == "correct_covariance")):
+ all_rows_input.append(f_)
  continue

- f__ = np.array([f_]) if model_name in np_func_list or \
- (model_name == "SimpleImputer" and func_name == "inverse_transform")\
- else [f_]
+ f__ = np.array([f_]) if model_name in np_func_list else [f_]
+
+ # transform() function in these functions generate different number of output columns and
+ # NULLS/NaNs are appended to the end of the output.
+ # If we run inverse_transform() on these models, it will take same number of input columns
+ # with NULLs/NaNs but those NULLs/NaNs should be ignored while reading the input to
+ # inverse_transform() function.
+ models_with_all_null_in_last_cols = ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV"]
+ if model_name in models_with_all_null_in_last_cols and func_name == "inverse_transform":
+ # Remove NULLs/NaNs from the end of one input row.
+ _f = np.array([f_])
+ _f = _f[~np.isnan(_f)]
+ f__ = [_f.tolist()]

  if n_c_labels > 0:
  # Labels are present in last column.
  l_ = values[n_f_cols:n_f_cols+n_c_labels]

- l__ = np.array([l_]) if model_name in np_func_list or \
- (model_name == "SimpleImputer" and func_name == "inverse_transform")\
- else [l_]
+ l__ = np.array([l_]) if model_name in np_func_list else [l_]
  # predict() now takes 'y' also for it to return the labels from script. Skipping 'y'
  # in function call. Generally, 'y' is passed to return y along with actual output.
  try:
@@ -181,7 +253,8 @@ while 1:
  if n_c_labels > 0 and func_name in ["predict", "decision_function"]:
  result_list += l_
  result_list += get_output_data(trans_values=trans_values, func_name=func_name,
- model_obj=model, n_c_labels=n_c_labels)
+ model_obj=model, n_c_labels=n_c_labels,
+ n_out_columns=no_of_output_columns)

  for i, val in enumerate(result_list):
  if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))):
@@ -198,17 +271,23 @@ while 1:
  break


- # MissingIndicator needs processing of all the dataset at the same time, instead of row by row.
+ # MissingIndicator and SimpleImputer needs processing of all the dataset at the same time, instead of row by row.
  # Hence, handling it outside of the while loop
- if model_name == "MissingIndicator" and func_name == "transform":
- m_out = model.transform(missing_indicator_input)
+ if model_name == "MissingIndicator" and func_name == "transform" or \
+ (model_name == "SimpleImputer" and func_name == "inverse_transform"):
+ if model_name == "SimpleImputer":
+ all_rows_input = np.array(all_rows_input)
+ m_out = getattr(model, func_name)(all_rows_input)

- for j, vals in enumerate(missing_indicator_input):
+ if type(m_out).__name__ in ["csr_matrix", "csc_matrix"]:
+ m_out = m_out.toarray()

- m_out_list = get_output_data(trans_values=m_out[j], func_name=func_name,
- model_obj=model, n_c_labels=n_c_labels)
+ for j in range(len(all_rows_input)):
+ m_out_list = get_output_data(trans_values=[m_out[j]], func_name=func_name,
+ model_obj=model, n_c_labels=n_c_labels,
+ n_out_columns=no_of_output_columns)

- result_list = missing_indicator_input[j] + m_out_list
+ result_list = list(all_rows_input[j]) + list(m_out_list)

  for i, val in enumerate(result_list):
  if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))):
@@ -220,3 +299,11 @@ if model_name == "MissingIndicator" and func_name == "transform":
  result_list[i] = 1

  print(*(data_partition_column_values + result_list), sep=DELIMITER)
+
+ ## correct_covariance() requires processing of all the input rows at the same time.
+ ## It returns the output dataset in (n_features, n_features) shape, i.e., based on
+ ## no. of columns.
+ if model_name in ["EllipticEnvelope", "MinCovDet"] and func_name == "correct_covariance":
+ result_list = model.correct_covariance(np.array(all_rows_input))
+ for l, vals in enumerate(result_list):
+ print(*(data_partition_column_values + vals.tolist()), sep=DELIMITER)
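
Among the changes above, rows fed to inverse_transform() for the feature-selection models (SelectFpr, SelectFdr, SelectFwe, SelectFromModel, RFECV) are stripped of their NaN padding before the model is called. The stripping is plain numpy boolean masking; an isolated sketch with made-up values:

    # Isolated sketch of the NaN-stripping step added for the feature-selection
    # models. The sample row is made up; in the script, f_ holds the feature
    # values read for one input row.
    import numpy as np

    f_ = [0.5, 1.2, float("nan"), float("nan")]   # row padded with NaNs at the end
    _f = np.array([f_])
    _f = _f[~np.isnan(_f)]                        # boolean mask drops the NaN padding
    f__ = [_f.tolist()]
    print(f__)                                    # [[0.5, 1.2]]
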
@@ -0,0 +1,8 @@
+ id,array_col
+ 1,"3.33e-05,0.2,0.1"
+ 2,"0.5,0.4,0.42"
+ 3,"1,0.8,0.9"
+ 4,"0.01,0.4,0.2"
+ 5,"0.93,0.4,0.7"
+ 6,"0.83,0.3,0.6"
+ 7,"0.73,0.5,0.7"
@@ -2,8 +2,9 @@
  "env_specs": [
  {
  "env_name": "openml_env",
- "libs": ["scikit-learn==1.5.1", "joblib==1.4.2", "numpy==2.0.0",
- "scipy==1.14.0", "threadpoolctl==3.5.0"],
+ "libs": ["scikit-learn==1.5.1", "joblib==1.4.2", "numpy==1.23.5",
+ "scipy==1.14.0", "threadpoolctl==3.5.0", "lightgbm==3.3.3",
+ "pandas==2.2.3"],
  "desc": "DONT DELETE: OpenML environment"
  }
  ]
@@ -22,5 +22,9 @@
  "CallDuration": "REAL",
  "DataCounter": "REAL",
  "SMS": "REAL"
+ },
+ "target_udt_data":{
+ "id": "INTEGER",
+ "array_col":"AIVector"
  }
  }