teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +196 -2
- teradataml/__init__.py +4 -0
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +79 -4
- teradataml/analytics/json_parser/metadata.py +12 -3
- teradataml/analytics/json_parser/utils.py +7 -2
- teradataml/analytics/sqle/__init__.py +1 -0
- teradataml/analytics/table_operator/__init__.py +1 -1
- teradataml/analytics/uaf/__init__.py +1 -1
- teradataml/analytics/utils.py +4 -0
- teradataml/automl/data_preparation.py +3 -2
- teradataml/automl/feature_engineering.py +15 -7
- teradataml/automl/model_training.py +39 -33
- teradataml/common/__init__.py +2 -1
- teradataml/common/constants.py +35 -0
- teradataml/common/garbagecollector.py +2 -1
- teradataml/common/messagecodes.py +8 -2
- teradataml/common/messages.py +3 -1
- teradataml/common/sqlbundle.py +25 -3
- teradataml/common/utils.py +134 -9
- teradataml/context/context.py +20 -10
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/data/dataframe_example.json +18 -2
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -2
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/dataframe/dataframe.py +543 -175
- teradataml/dataframe/functions.py +553 -25
- teradataml/dataframe/sql.py +184 -15
- teradataml/dbutils/dbutils.py +556 -18
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
- teradataml/opensource/_lightgbm.py +950 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
- teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_sklearn_wrapper.py +798 -438
- teradataml/options/__init__.py +7 -23
- teradataml/options/configure.py +29 -3
- teradataml/scriptmgmt/UserEnv.py +3 -3
- teradataml/scriptmgmt/lls_utils.py +74 -21
- teradataml/store/__init__.py +13 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2223 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/store/vector_store/__init__.py +1586 -0
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +37 -38
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/utils/dtypes.py +4 -2
- teradataml/utils/validators.py +33 -1
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +200 -5
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +88 -65
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
--- a/teradataml/data/scripts/sklearn/sklearn_function.template
+++ b/teradataml/data/scripts/sklearn/sklearn_function.template
@@ -28,28 +28,22 @@ def splitter(strr, delim=",", convert_to="str"):
     return [convert_to_type(i, convert_to) for i in strr.split(delim)]
 
 # Arguments to the Script.
-if len(sys.argv) !=
-    #
+if len(sys.argv) != 3:
+    # 3 command line arguments should be passed to this file.
     # 1: file to be run
-    # 2.
-    # 3.
-
-
-
-
-    sys.exit("5 arguments command line arguments should be passed: file to be run,"
-             " comma separated indices of partition columns, comma separated types of all columns,"
-             " data columns information separated by '--' where each data column information is"
-             " in the form '<arg_name>-<comma separated data indices>-<comma separated data types>',"
-             " flag to check lake or enterprise and model file prefix used only for lake system.")
-
-is_lake_system = eval(sys.argv[4])
+    # 2. Model file prefix for lake system, None otherwise.
+    # 3. Flag to check the system type. True, means Lake, Enterprise otherwise.
+    sys.exit("3 arguments command line arguments should be passed: file to be run,"
+             " model file prefix used only for lake system and flag to check lake or enterprise.")
+
+is_lake_system = eval(sys.argv[2])
 if not is_lake_system:
     db = sys.argv[0].split("/")[1]
 else:
-    model_file_prefix = sys.argv[
-
-
+    model_file_prefix = sys.argv[1]
+
+data_partition_column_indices = <partition_cols_indices>
+data_column_types = <types_of_data_cols>
 
 data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
 
@@ -59,7 +53,8 @@ data_args_indices_types = OrderedDict()
 # Data related arguments values - prepare dictionary and populate data later.
 data_args_values = {}
 
-
+data_args_info_str = <data_args_info_str>
+for data_arg in data_args_info_str.split("--"):
     arg_name, indices, types = data_arg.split("-")
     indices = splitter(indices, convert_to="int")
     types = splitter(types)
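Note: in 20.0.0.3 the template receives only the model file prefix and the lake/enterprise flag on the command line; the partition column indices, the column types and the data-argument layout are substituted into the script through the <partition_cols_indices>, <types_of_data_cols> and <data_args_info_str> placeholders. A rough sketch of what such a substitution amounts to (illustrative only; the file names and placeholder values below are made up, and the real substitution is performed inside teradataml when the script is generated):

# Hypothetical illustration of filling the template placeholders before installing the script.
template_text = open("sklearn_function.template").read()
script_text = (
    template_text
    .replace("<partition_cols_indices>", "[0]")                   # Python literal: indices of partition columns
    .replace("<types_of_data_cols>", "['int', 'float', 'float']") # Python literal: types of all data columns
    # '--' separates data arguments; each is '<arg_name>-<indices>-<types>'
    # (format taken from the usage text removed in the hunk above).
    .replace("<data_args_info_str>", "'X-1,2-float,float--y-0-int'")
)
with open("sklearn_function.py", "w") as out:
    out.write(script_text)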
--- a/teradataml/data/scripts/sklearn/sklearn_model_selection_split.py
+++ b/teradataml/data/scripts/sklearn/sklearn_model_selection_split.py
@@ -3,146 +3,164 @@ import math
 import sys
 import numpy as np
 import base64
+from contextlib import contextmanager
+import os
 
 DELIMITER = '\t'
 
-
-
-    for i, val in enumerate(values):
-        ret_vals.append(convert_to_type(val, types[i]))
-    return ret_vals
-
-def convert_to_type(val, typee):
-    if typee == 'int':
-        return int(val) if val != "" else np.nan
-    if typee == 'float':
-        if isinstance(val, str):
-            val = val.replace(' ', '')
-        return float(val) if val != "" else np.nan
-    if typee == 'bool':
-        return eval(val) if val != "" else None
-    return str(val) if val != "" else None
-
-def splitter(strr, delim=",", convert_to="str"):
+@contextmanager
+def suppress_stderr():
     """
-
+    Function to suppress the warnings(lake systems treats warnings as errors).
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    #
-    #
-    #
-    #
-
-
-
-
-
-
-
-
+    with open(os.devnull, "w") as devnull:
+        old_stderr = sys.stderr
+        sys.stderr = devnull
+        try:
+            yield
+        finally:
+            sys.stderr = old_stderr
+
+## On Lake system warnings raised by script are treated as a errors.
+## Hence, to suppress it putting the under suppress_stderr().
+with suppress_stderr():
+    def get_values_list(values, types):
+        ret_vals = []
+        for i, val in enumerate(values):
+            ret_vals.append(convert_to_type(val, types[i]))
+        return ret_vals
+
+    def convert_to_type(val, typee):
+        if typee == 'int':
+            return int(val) if val != "" else np.nan
+        if typee == 'float':
+            if isinstance(val, str):
+                val = val.replace(' ', '')
+            return float(val) if val != "" else np.nan
+        if typee == 'bool':
+            return eval(val) if val != "" else None
+        return str(val) if val != "" else None
+
+    def splitter(strr, delim=",", convert_to="str"):
+        """
+        Split the string based on delimiter and convert to the type specified.
+        """
+        if strr == "None":
+            return []
+        return [convert_to_type(i, convert_to) for i in strr.split(delim)]
+
+
+    # Arguments to the Script
+    if len(sys.argv) != 9:
+        # 9 arguments command line arguments should be passed to this file.
+        # 1: file to be run
+        # 2. function name
+        # 3. No of feature columns.
+        # 4. No of class labels.
+        # 5. No of group columns.
+        # 6. Comma separated indices of partition columns.
+        # 7. Comma separated types of all the data columns.
+        # 8. Model file prefix to generated model file using partition columns.
+        # 9. Flag to check the system type. True, means Lake, Enterprise otherwise.
+        sys.exit("9 arguments command line arguments should be passed: file to be run,"
+                 " function name, no of feature columns, no of class labels, no of group columns,"
+                 " comma separated indices of partition columns, comma separated types of all columns,"
+                 " model file prefix to generated model file using partition columns and flag to check"
+                 " lake or enterprise.")
+
+
+    is_lake_system = eval(sys.argv[8])
+    if not is_lake_system:
+        db = sys.argv[0].split("/")[1]
+    function_name = sys.argv[1]
+    n_f_cols = int(sys.argv[2])
+    n_c_labels = int(sys.argv[3])
+    n_g_cols = int(sys.argv[4])
+    data_column_types = splitter(sys.argv[6], delim="--")
+    data_partition_column_indices = splitter(sys.argv[5], convert_to="int") # indices are integers.
+    model_file_prefix = sys.argv[7]
+
+    data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
+
+    model = None
+    data_partition_column_values = []
+
+    # Data Format (n_features, k_labels, one data_partition_column):
+    # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
+    # data_partition_columnn.
+    # labels are optional.
+
+    features = []
+    labels = []
+    groups = []
+    while 1:
+        try:
+            line = input()
+            if line == '': # Exit if user provides blank line
+                break
+            else:
+                values = line.split(DELIMITER)
+                values = get_values_list(values, data_column_types)
+                if not data_partition_column_values:
+                    # Partition column values is same for all rows. Hence, only read once.
+                    for i, val in enumerate(data_partition_column_indices):
+                        data_partition_column_values.append(
+                            convert_to_type(values[val], typee=data_partition_column_types[i])
+                        )
+
+                    # Prepare the corresponding model file name and extract model.
+                    partition_join = "_".join([str(x) for x in data_partition_column_values])
+                    # Replace '-' with '_' as '-' because partition_columns can be negative.
+                    partition_join = partition_join.replace("-", "_")
+
+                    model_file_path = f"{model_file_prefix}_{partition_join}" \
+                        if is_lake_system else \
+                        f"./{db}/{model_file_prefix}_{partition_join}"
+
+                    with open(model_file_path, "rb") as fp:
+                        model = pickle.loads(fp.read())
+
+                    if not model:
+                        sys.exit("Model file is not installed in Vantage.")
+
+                start = 0
+                if n_f_cols > 0:
+                    features.append(values[:n_f_cols])
+                    start = start + n_f_cols
+                if n_c_labels > 0:
+                    labels.append(values[start:(start+n_c_labels)])
+                    start = start + n_c_labels
+                if n_g_cols > 0:
+                    groups.append(values[start:(start+n_g_cols)])
+
+        except EOFError: # Exit if reached EOF or CTRL-D
             break
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            if n_c_labels > 0:
-                labels.append(values[start:(start+n_c_labels)])
-                start = start + n_c_labels
-            if n_g_cols > 0:
-                groups.append(values[start:(start+n_g_cols)])
-
-    except EOFError: # Exit if reached EOF or CTRL-D
-        break
-
-if len(features) == 0:
-    sys.exit(0)
-
-features = np.array(features) if len(features) > 0 else None
-labels = np.array(labels).flatten() if len(labels) > 0 else None
-groups = np.array(groups).flatten() if len(groups) > 0 else None
-
-if function_name == "split":
-    # Printing both train and test data instead of just indices unlike sklearn.
-    # Generator is created based on split_id and type of split (train/test) in client.
-    split_id = 1
-    for train_idx, test_idx in model.split(features, labels, groups):
-        X_train, X_test = features[train_idx], features[test_idx]
-        y_train, y_test = labels[train_idx], labels[test_idx]
-        for X, y in zip(X_train, y_train):
-            print(*(data_partition_column_values + [split_id, "train"] +
-                    ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
-                     for val in X] + [y]
-                    ), sep=DELIMITER)
-        for X, y in zip(X_test, y_test):
-            print(*(data_partition_column_values + [split_id, "test"] +
-                    ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
-                     for val in X] + [y]
-                    ), sep=DELIMITER)
-        split_id += 1
-else:
-    val = getattr(model, function_name)(features, labels, groups)
-    print(*(data_partition_column_values + [val]), sep=DELIMITER)
+
+    if len(features) == 0:
+        sys.exit(0)
+
+    features = np.array(features) if len(features) > 0 else None
+    labels = np.array(labels).flatten() if len(labels) > 0 else None
+    groups = np.array(groups).flatten() if len(groups) > 0 else None
+
+    if function_name == "split":
+        # Printing both train and test data instead of just indices unlike sklearn.
+        # Generator is created based on split_id and type of split (train/test) in client.
+        split_id = 1
+        for train_idx, test_idx in model.split(features, labels, groups):
+            X_train, X_test = features[train_idx], features[test_idx]
+            y_train, y_test = labels[train_idx], labels[test_idx]
+            for X, y in zip(X_train, y_train):
+                print(*(data_partition_column_values + [split_id, "train"] +
+                        ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
+                         for val in X] + [y]
+                        ), sep=DELIMITER)
+            for X, y in zip(X_test, y_test):
+                print(*(data_partition_column_values + [split_id, "test"] +
+                        ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
+                         for val in X] + [y]
+                        ), sep=DELIMITER)
+            split_id += 1
+    else:
+        val = getattr(model, function_name)(features, labels, groups)
+        print(*(data_partition_column_values + [val]), sep=DELIMITER)
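The suppress_stderr() helper added above simply points sys.stderr at os.devnull for the duration of the block, because the Lake side treats anything the script writes to stderr (including warnings) as an error. Outside the template the same effect can be had with the standard library alone; a minimal standalone sketch, not part of the package:

import contextlib, os, sys, warnings

# Send anything written to stderr (e.g. warnings) to the null device for the
# duration of the block, then restore the original stream.
with open(os.devnull, "w") as devnull, contextlib.redirect_stderr(devnull):
    warnings.warn("swallowed - never reaches stderr")

print("stderr restored:", sys.stderr is sys.__stderr__)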
--- a/teradataml/data/scripts/sklearn/sklearn_transform.py
+++ b/teradataml/data/scripts/sklearn/sklearn_transform.py
@@ -31,39 +31,83 @@ def splitter(strr, delim=",", convert_to="str"):
         return []
     return [convert_to_type(i, convert_to) for i in strr.split(delim)]
 
+def should_convert(t_val, py_type):
+    """
+    Function to check type of value and whether value is nan and infinity.
+    """
+    return not isinstance(t_val, eval(py_type)) and not math.isinf(t_val) and not math.isnan(t_val)
+
+def convert_value(t_val, py_type):
+    """
+    Function to convert value to specified python type.
+    """
+    return convert_to_type(t_val, py_type) if should_convert(t_val, py_type) else t_val
+
 # Process output returned by sklearn function.
-def get_output_data(trans_values, func_name, model_obj, n_c_labels):
-    # Converting
+def get_output_data(trans_values, func_name, model_obj, n_c_labels, n_out_columns):
+    # Converting sparse matrix to dense array as sparse matrices are NOT
     # supported in Vantage.
     module_name = model_obj.__module__.split("._")[0]
 
-
-
+    # Converting the translated values into corresponding the return column's
+    # python type.
+    if (func_name == "decision_path" or return_columns_python_types is None \
+        or not isinstance(trans_values, np.ndarray)):
+        trans_values_list = trans_values
+    else:
+        # Conversion.....
+        trans_values_list = []
+        for trans_value in trans_values.tolist():
+            if not isinstance(trans_value, list):
+                trans_value = [trans_value]
+
+            converted_list = []
+            if len(return_columns_python_types) == len(trans_value):
+                for t_val, py_type in zip(trans_value, return_columns_python_types):
+                    converted_list.append(convert_value(t_val, py_type))
+            ## transform() is having only 1 python return type, But it actually returns more than 1 column
+            else:
+                for t_val in trans_value:
+                    converted_list.append(convert_value(t_val, "".join(return_columns_python_types)))
+
+            trans_values_list.append(converted_list)
+
+    if type(trans_values_list).__name__ in ["csr_matrix", "csc_matrix"]:
+        trans_values_list = trans_values_list.toarray()
 
     if module_name == "sklearn.cross_decomposition" and n_c_labels > 0 and func_name == "transform":
         # For cross_decomposition, output is a tuple of arrays when label columns are provided
         # along with feature columns for transform function. In this case, concatenate the
         # arrays and return the combined values.
-        if isinstance(
-            return np.concatenate(
+        if isinstance(trans_values_list, tuple):
+            return np.concatenate(trans_values_list, axis=1).tolist()[0]
 
-    if isinstance(
-        or isinstance(
-        or isinstance(
+    if isinstance(trans_values_list[0], np.ndarray) \
+        or isinstance(trans_values_list[0], list) \
+        or isinstance(trans_values_list[0], tuple):
         # Here, the value returned by sklearn function is list type.
-        opt_list = list(
+        opt_list = list(trans_values_list[0])
+
+        if len(opt_list) < n_out_columns:
+            # If the output list is less than the required number of columns, append
+            # empty strings to the list.
+            opt_list += [""] * (n_out_columns - len(opt_list))
+
         if func_name == "inverse_transform" and type(model_obj).__name__ == "MultiLabelBinarizer":
             # output array "trans_values[0]" may not be of same size. It should be of
             # maximum size of `model.classes_`
             # Append None to last elements.
            if len(opt_list) < len(model_obj.classes_):
                opt_list += [""] * (len(model_obj.classes_) - len(opt_list))
+
        return opt_list
-
+
+    # Only one element is returned by the function.
+    return [trans_values_list[0]]
 
 # Arguments to the Script
-if len(sys.argv) !=
-    #
+if len(sys.argv) != 10:
+    # 10 arguments command line arguments should be passed to this file.
     # 1: file to be run
     # 2. function name (Eg. predict, fit etc)
     # 3. No of feature columns.
@@ -71,13 +115,17 @@ if len(sys.argv) != 8:
     # 5. Comma separated indices of partition columns.
     # 6. Comma separated types of all the data columns.
     # 7. Model file prefix to generated model file using partition columns.
-    # 8.
-
-
-
-
-
-
+    # 8. Number of columns to be returned by the sklearn's transform function.
+    # 9. Flag to check the system type. True, means Lake, Enterprise otherwise.
+    # 10. Python types of returned/transfromed columns.
+    sys.exit("10 arguments should be passed to this file - file to be run, function name, "\
+             "no of feature columns, no of class labels, comma separated indices of partition "\
+             "columns, comma separated types of all columns, model file prefix to generate model "\
+             "file using partition columns, number of columns to be returnd by sklearn's "\
+             "transform function, flag to check lake or enterprise and Python types of "\
+             "returned/transfromed columns.")
+
+is_lake_system = eval(sys.argv[8])
 if not is_lake_system:
     db = sys.argv[0].split("/")[1]
 func_name = sys.argv[1]
@@ -86,13 +134,22 @@ n_c_labels = int(sys.argv[3])
 data_column_types = splitter(sys.argv[5], delim="--")
 data_partition_column_indices = splitter(sys.argv[4], convert_to="int") # indices are integers.
 model_file_prefix = sys.argv[6]
+# sys.argv[9] will contain a string of python datatypes with '--'
+# separator OR a single datatype OR None in string format.
+ret_col_argv = sys.argv[9]
+if ret_col_argv == "None":
+    return_columns_python_types = eval(ret_col_argv)
+else:
+    return_columns_python_types = splitter(ret_col_argv, delim="--")
+
+no_of_output_columns = int(sys.argv[7])
 
 data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
 
 model = None
 data_partition_column_values = []
 
-
+all_rows_input = []
 
 # Data Format:
 # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
@@ -134,30 +191,45 @@ while 1:
 
             model_name = model.__class__.__name__
             np_func_list = ["ClassifierChain", "EllipticEnvelope", "MinCovDet",
-                            "FeatureAgglomeration", "LabelBinarizer", "MultiLabelBinarizer"
+                            "FeatureAgglomeration", "LabelBinarizer", "MultiLabelBinarizer",
+                            "BernoulliRBM"]
 
-            # MissingIndicator
-            # rather than on a row-by-row basis.
+            # MissingIndicator's transform() and SimpleImputer's inverse_transform() requires processing
+            # the entire dataset simultaneously, rather than on a row-by-row basis.
 
-            # Error getting during row-by-row processing -
+            # Error getting during row-by-row processing of MissingIndicator -
             # "ValueError: MissingIndicator does not support data with dtype <U13.
             # Please provide either a numeric array (with a floating point or
-
-
-
+            # integer dtype) or categorical data represented ei
+
+            # Error getting during row-by-row processing of SimpleImputer -
+            # "IndexError: index 3 is out of bounds for axis 1 with size 3".
+            if ((model_name == "MissingIndicator" and func_name == "transform") or \
+                (model_name == "SimpleImputer" and func_name == "inverse_transform") or \
+                (model_name in ["EllipticEnvelope", "MinCovDet"]
+                 and func_name == "correct_covariance")):
+                all_rows_input.append(f_)
                 continue
 
-            f__ = np.array([f_]) if model_name in np_func_list
-
-
+            f__ = np.array([f_]) if model_name in np_func_list else [f_]
+
+            # transform() function in these functions generate different number of output columns and
+            # NULLS/NaNs are appended to the end of the output.
+            # If we run inverse_transform() on these models, it will take same number of input columns
+            # with NULLs/NaNs but those NULLs/NaNs should be ignored while reading the input to
+            # inverse_transform() function.
+            models_with_all_null_in_last_cols = ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV"]
+            if model_name in models_with_all_null_in_last_cols and func_name == "inverse_transform":
+                # Remove NULLs/NaNs from the end of one input row.
+                _f = np.array([f_])
+                _f = _f[~np.isnan(_f)]
+                f__ = [_f.tolist()]
 
             if n_c_labels > 0:
                 # Labels are present in last column.
                 l_ = values[n_f_cols:n_f_cols+n_c_labels]
 
-                l__ = np.array([l_]) if model_name in np_func_list
-                    (model_name == "SimpleImputer" and func_name == "inverse_transform")\
-                    else [l_]
+                l__ = np.array([l_]) if model_name in np_func_list else [l_]
            # predict() now takes 'y' also for it to return the labels from script. Skipping 'y'
            # in function call. Generally, 'y' is passed to return y along with actual output.
            try:
@@ -181,7 +253,8 @@ while 1:
             if n_c_labels > 0 and func_name in ["predict", "decision_function"]:
                 result_list += l_
             result_list += get_output_data(trans_values=trans_values, func_name=func_name,
-                                           model_obj=model, n_c_labels=n_c_labels
+                                           model_obj=model, n_c_labels=n_c_labels,
+                                           n_out_columns=no_of_output_columns)
 
             for i, val in enumerate(result_list):
                 if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))):
@@ -198,17 +271,23 @@ while 1:
         break
 
 
-# MissingIndicator needs processing of all the dataset at the same time, instead of row by row.
+# MissingIndicator and SimpleImputer needs processing of all the dataset at the same time, instead of row by row.
 # Hence, handling it outside of the while loop
-if model_name == "MissingIndicator" and func_name == "transform"
-
+if model_name == "MissingIndicator" and func_name == "transform" or \
+    (model_name == "SimpleImputer" and func_name == "inverse_transform"):
+    if model_name == "SimpleImputer":
+        all_rows_input = np.array(all_rows_input)
+    m_out = getattr(model, func_name)(all_rows_input)
 
-
+    if type(m_out).__name__ in ["csr_matrix", "csc_matrix"]:
+        m_out = m_out.toarray()
 
-
-
+    for j in range(len(all_rows_input)):
+        m_out_list = get_output_data(trans_values=[m_out[j]], func_name=func_name,
+                                     model_obj=model, n_c_labels=n_c_labels,
+                                     n_out_columns=no_of_output_columns)
 
-        result_list =
+        result_list = list(all_rows_input[j]) + list(m_out_list)
 
         for i, val in enumerate(result_list):
             if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))):
@@ -220,3 +299,11 @@ if model_name == "MissingIndicator" and func_name == "transform":
                 result_list[i] = 1
 
         print(*(data_partition_column_values + result_list), sep=DELIMITER)
+
+## correct_covariance() requires processing of all the input rows at the same time.
+## It returns the output dataset in (n_features, n_features) shape, i.e., based on
+## no. of columns.
+if model_name in ["EllipticEnvelope", "MinCovDet"] and func_name == "correct_covariance":
+    result_list = model.correct_covariance(np.array(all_rows_input))
+    for l, vals in enumerate(result_list):
+        print(*(data_partition_column_values + vals.tolist()), sep=DELIMITER)
--- a/teradataml/data/templates/open_source_ml.json
+++ b/teradataml/data/templates/open_source_ml.json
@@ -2,8 +2,9 @@
     "env_specs": [
         {
             "env_name": "openml_env",
-            "libs": ["scikit-learn==1.5.1", "joblib==1.4.2", "numpy==
-                     "scipy==1.14.0", "threadpoolctl==3.5.0"
+            "libs": ["scikit-learn==1.5.1", "joblib==1.4.2", "numpy==1.23.5",
+                     "scipy==1.14.0", "threadpoolctl==3.5.0", "lightgbm==3.3.3",
+                     "pandas==2.2.3"],
             "desc": "DONT DELETE: OpenML environment"
         }
     ]