teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml might be problematic.

Files changed (88)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +196 -2
  3. teradataml/__init__.py +4 -0
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +79 -4
  6. teradataml/analytics/json_parser/metadata.py +12 -3
  7. teradataml/analytics/json_parser/utils.py +7 -2
  8. teradataml/analytics/sqle/__init__.py +1 -0
  9. teradataml/analytics/table_operator/__init__.py +1 -1
  10. teradataml/analytics/uaf/__init__.py +1 -1
  11. teradataml/analytics/utils.py +4 -0
  12. teradataml/automl/data_preparation.py +3 -2
  13. teradataml/automl/feature_engineering.py +15 -7
  14. teradataml/automl/model_training.py +39 -33
  15. teradataml/common/__init__.py +2 -1
  16. teradataml/common/constants.py +35 -0
  17. teradataml/common/garbagecollector.py +2 -1
  18. teradataml/common/messagecodes.py +8 -2
  19. teradataml/common/messages.py +3 -1
  20. teradataml/common/sqlbundle.py +25 -3
  21. teradataml/common/utils.py +134 -9
  22. teradataml/context/context.py +20 -10
  23. teradataml/data/SQL_Fundamentals.pdf +0 -0
  24. teradataml/data/dataframe_example.json +18 -2
  25. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
  26. teradataml/data/docs/sqle/docs_17_20/Shap.py +7 -1
  27. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
  28. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  29. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  30. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  31. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  32. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  33. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  34. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  35. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  36. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  37. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  38. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  39. teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
  40. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
  41. teradataml/data/medical_readings.csv +101 -0
  42. teradataml/data/patient_profile.csv +101 -0
  43. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  44. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  45. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  46. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  47. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
  48. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  49. teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
  50. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  51. teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
  52. teradataml/data/target_udt_data.csv +8 -0
  53. teradataml/data/templates/open_source_ml.json +3 -2
  54. teradataml/data/vectordistance_example.json +4 -0
  55. teradataml/dataframe/dataframe.py +543 -175
  56. teradataml/dataframe/functions.py +553 -25
  57. teradataml/dataframe/sql.py +184 -15
  58. teradataml/dbutils/dbutils.py +556 -18
  59. teradataml/dbutils/filemgr.py +48 -1
  60. teradataml/lib/aed_0_1.dll +0 -0
  61. teradataml/opensource/__init__.py +1 -1
  62. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  63. teradataml/opensource/_lightgbm.py +950 -0
  64. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  65. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  66. teradataml/opensource/sklearn/__init__.py +0 -1
  67. teradataml/opensource/sklearn/_sklearn_wrapper.py +798 -438
  68. teradataml/options/__init__.py +7 -23
  69. teradataml/options/configure.py +29 -3
  70. teradataml/scriptmgmt/UserEnv.py +3 -3
  71. teradataml/scriptmgmt/lls_utils.py +74 -21
  72. teradataml/store/__init__.py +13 -0
  73. teradataml/store/feature_store/__init__.py +0 -0
  74. teradataml/store/feature_store/constants.py +291 -0
  75. teradataml/store/feature_store/feature_store.py +2223 -0
  76. teradataml/store/feature_store/models.py +1505 -0
  77. teradataml/store/vector_store/__init__.py +1586 -0
  78. teradataml/table_operators/query_generator.py +3 -0
  79. teradataml/table_operators/table_operator_query_generator.py +3 -1
  80. teradataml/table_operators/table_operator_util.py +37 -38
  81. teradataml/table_operators/templates/dataframe_register.template +69 -0
  82. teradataml/utils/dtypes.py +4 -2
  83. teradataml/utils/validators.py +33 -1
  84. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +200 -5
  85. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +88 -65
  86. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  87. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  88. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
@@ -0,0 +1,247 @@
+ import base64
+ import io
+ import math
+ import os
+ import pickle
+ import sys
+
+ import numpy as np
+
+ DELIMITER = '\t'
+
+ def get_values_list(values, types):
+     ret_vals = []
+     for i, val in enumerate(values):
+         ret_vals.append(convert_to_type(val, types[i]))
+     return ret_vals
+
+ def convert_to_type(val, typee):
+     if typee == 'int':
+         return int(val) if val != "" else np.nan
+     if typee == 'float':
+         if isinstance(val, str):
+             val = val.replace(' ', '')
+         return float(val) if val != "" else np.nan
+     if typee == 'bool':
+         return eval(val) if val != "" else None
+     return str(val) if val != "" else None
+
+ def splitter(strr, delim=",", convert_to="str"):
+     """
+     Split the string based on delimiter and convert to the type specified.
+     """
+     if strr == "None":
+         return []
+     return [convert_to_type(i, convert_to) for i in strr.split(delim)]
+
+ def should_convert(t_val, py_type):
+     """
+     Function to check type of value and whether value is nan and infinity.
+     """
+     return not isinstance(t_val, eval(py_type)) and not math.isinf(t_val) and not math.isnan(t_val)
+
+ def convert_value(t_val, py_type):
+     """
+     Function to convert value to specified python type.
+     """
+     return convert_to_type(t_val, py_type) if should_convert(t_val, py_type) else t_val
+
+ # Process output returned by sklearn function.
+ def get_output_data(trans_values, func_name, n_c_labels, n_out_columns):
+     # Converting sparse matrix to dense array as sparse matrices are NOT
+     # supported in Vantage.
+     # module_name = model_obj.__module__.split("._")[0]
+
+     # Converting the translated values into the corresponding return column's
+     # python type.
+     if (return_columns_python_types is None or not isinstance(trans_values, np.ndarray)):
+         trans_values_list = trans_values
+     else:
+         # Conversion.
+         trans_values_list = []
+         for trans_value in trans_values.tolist():
+             if not isinstance(trans_value, list):
+                 trans_value = [trans_value]
+
+             converted_list = []
+             if len(return_columns_python_types) == len(trans_value):
+                 for t_val, py_type in zip(trans_value, return_columns_python_types):
+                     converted_list.append(convert_value(t_val, py_type))
+             # transform() has only one Python return type, but it can actually return more than one column.
+             else:
+                 for t_val in trans_value:
+                     converted_list.append(convert_value(t_val, return_columns_python_types[0]))
+
+             trans_values_list.append(converted_list)
+
+     if type(trans_values_list).__name__ in ["csr_matrix", "csc_matrix"]:
+         trans_values_list = trans_values_list.toarray()
+
+     if isinstance(trans_values_list[0], np.ndarray) \
+             or isinstance(trans_values_list[0], list) \
+             or isinstance(trans_values_list[0], tuple):
+         # Here, the value returned by sklearn function is list type.
+         opt_list = list(trans_values_list[0])
+
+         if len(opt_list) < n_out_columns:
+             # If the output list is less than the required number of columns, append
+             # empty strings to the list.
+             opt_list += [""] * (n_out_columns - len(opt_list))
+
+         return opt_list
+
+     # Only one element is returned by the function.
+     return [trans_values_list[0]]
+
+ # Arguments to the Script
+ if len(sys.argv) != 10:
+     # 10 command line arguments should be passed to this file.
+     # 1. File to be run.
+     # 2. Function name (e.g. predict, fit etc).
+     # 3. No of feature columns.
+     # 4. No of class labels.
+     # 5. Comma separated indices of partition columns.
+     # 6. Comma separated types of all the data columns.
+     # 7. Model file prefix to generate model file using partition columns.
+     # 8. Number of columns to be returned by the sklearn's transform function.
+     # 9. Flag to check the system type. True means Lake, Enterprise otherwise.
+     # 10. Python types of returned/transformed columns.
+     sys.exit("10 arguments should be passed to this file - file to be run, function name, "\
+              "no of feature columns, no of class labels, comma separated indices of partition "\
+              "columns, comma separated types of all columns, model file prefix to generate model "\
+              "file using partition columns, number of columns to be returned by sklearn's "\
+              "transform function, flag to check lake or enterprise and Python types of "\
+              "returned/transformed columns.")
+
+ is_lake_system = eval(sys.argv[8])
+ if not is_lake_system:
+     db = sys.argv[0].split("/")[1]
+ func_name = sys.argv[1]
+ n_f_cols = int(sys.argv[2])
+ n_c_labels = int(sys.argv[3])
+ data_column_types = splitter(sys.argv[5], delim="--")
+ data_partition_column_indices = splitter(sys.argv[4], convert_to="int")  # indices are integers.
+ model_file_prefix = sys.argv[6]
+ # sys.argv[9] will contain a string of python datatypes with '--'
+ # separator OR a single datatype OR None in string format.
+ ret_col_argv = sys.argv[9]
+ if ret_col_argv == "None":
+     return_columns_python_types = eval(ret_col_argv)
+ else:
+     return_columns_python_types = splitter(ret_col_argv, delim="--")
+
+ no_of_output_columns = int(sys.argv[7])
+
+ data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
+
+ model = None
+ data_partition_column_values = []
+
+ all_x_rows = []
+ all_y_rows = []
+
+ # Data Format:
+ # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
+ # data_partition_columnn.
+ # label is optional (it is present when label_exists is not "None")
+
+ model_name = ""
+ while 1:
+     try:
+         line = input()
+         if line == '':  # Exit if user provides blank line
+             break
+         else:
+             values = line.split(DELIMITER)
+             values = get_values_list(values, data_column_types)
+             if not data_partition_column_values:
+                 # Partition column values are the same for all rows. Hence, only read once.
+                 for i, val in enumerate(data_partition_column_indices):
+                     data_partition_column_values.append(
+                         convert_to_type(values[val], typee=data_partition_column_types[i])
+                     )
+
+                 # Prepare the corresponding model file name and extract model.
+                 partition_join = "_".join([str(x) for x in data_partition_column_values])
+                 # Replace '-' with '_' because partition column values can be negative.
+                 partition_join = partition_join.replace("-", "_")
+
+                 model_file_path = f"{model_file_prefix}_{partition_join}" \
+                     if is_lake_system else \
+                     f"./{db}/{model_file_prefix}_{partition_join}"
+
+                 with open(model_file_path, "rb") as fp:
+                     model = pickle.loads(fp.read())
+
+             if not model:
+                 sys.exit("Model file is not installed in Vantage.")
+
+             f_ = values[:n_f_cols]
+             f__ = np.array([f_])
+
+             if n_c_labels > 0:
+                 l_ = values[n_f_cols:n_f_cols+n_c_labels]
+                 l__ = np.array([l_])
+
+             if func_name == "refit":
+                 # refit() needs all data at once. Hence, read all data at once and call refit().
+                 all_x_rows.append(f_)
+                 all_y_rows.append(l_)
+                 continue
+
+             # Because `predict` function does not accept 'y' as input, we need to handle it separately.
+             if n_c_labels > 0 and func_name not in ["predict"]:
+                 # Labels are present in last column.
+                 trans_values = getattr(model, func_name)(f__, l__, **params)
+             else:
+                 # If class labels do not exist in data, don't read labels, read just features.
+                 trans_values = getattr(model, func_name)(f__, **params)
+
+             result_list = f_
+             if n_c_labels > 0 and func_name in ["predict", "decision_function"]:
+                 result_list += l_
+             result_list += get_output_data(trans_values=trans_values, func_name=func_name,
+                                            n_c_labels=n_c_labels, n_out_columns=no_of_output_columns)
+
+             for i, val in enumerate(result_list):
+                 if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))):
+                     result_list[i] = ""
+                 elif val == False:
+                     result_list[i] = 0
+                 elif val == True:
+                     result_list[i] = 1
+
+             print(*(data_partition_column_values + result_list), sep=DELIMITER)
+
+     except EOFError:  # Exit if reached EOF or CTRL-D
+         break
+
+
+ if func_name == "refit":
+     result = ""
+     stdout = None
+     try:
+         stdout = sys.stdout
+         new_stdout = io.StringIO()
+         sys.stdout = new_stdout
+         trained_model = getattr(model, func_name)(all_x_rows, all_y_rows, **params)
+         result = new_stdout.getvalue()
+     except Exception:
+         raise
+     finally:
+         sys.stdout = stdout
+
+     model_str = pickle.dumps(trained_model)
+
+
+     if is_lake_system:
+         model_file_path = f"/tmp/{model_file_prefix}_{partition_join}.pickle"
+
+         # Write the trained model file in Vantage.
+         with open(model_file_path, "wb") as fp:
+             fp.write(model_str)
+
+     model_data = model_file_path if is_lake_system else base64.b64encode(model_str)
+     console_output = base64.b64encode(result.encode())
+
+     print(*(data_partition_column_values + [model_data, console_output]), sep="..")
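For reference, a minimal standalone sketch (not part of the package) of the per-partition model hand-off the script above relies on: the file suffix is built from the partition column values, and on non-Lake (Enterprise) systems the pickled model is base64-encoded for transport. The prefix and values below are hypothetical.

    import base64
    import pickle

    partition_values = [7, -3]                       # hypothetical partition column values
    suffix = "_".join(str(v) for v in partition_values).replace("-", "_")
    model_file_name = f"my_model_prefix_{suffix}"    # hypothetical prefix -> "my_model_prefix_7__3"

    model = {"weights": [0.1, 0.2]}                  # stand-in for a fitted estimator
    encoded = base64.b64encode(pickle.dumps(model))  # Enterprise path: base64-encoded pickle bytes

    # Assumed reader side: reverse the same two steps to recover the object.
    restored = pickle.loads(base64.b64decode(encoded))
    assert restored == model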
@@ -0,0 +1,216 @@
+ import sys, json, io
+ import pickle, base64, importlib, numpy as np
+ from collections import OrderedDict
+
+ func_name = "<func_name>"
+ module_name = "<module_name>"
+ is_lake_system = <is_lake_system>
+ params = json.loads('<params>')
+ data_partition_column_indices = <partition_cols_indices>
+ data_partition_column_types = <partition_cols_types>
+ model_file_prefix = "<model_file_prefix>"  # Needed in case of lake system for writing model to /tmp.
+
+ DELIMITER = '\t'
+
+ def convert_to_type(val, typee):
+     if typee == 'int':
+         return int(val) if val != "" else np.nan
+     if typee == 'float':
+         if isinstance(val, str):
+             val = val.replace(' ', '')
+         return float(val) if val != "" else np.nan
+     if typee == 'bool':
+         return eval(val) if val != "" else None
+     return str(val) if val != "" else None
+
+ if not is_lake_system:
+     db = sys.argv[0].split("/")[1]
+
+ data_present = False
+ data_partition_column_values = []
+
+ while 1:
+     try:
+         line = input()
+         if line == '':  # Exit if user provides blank line
+             break
+         else:
+             data_present = True
+             values = line.split(DELIMITER)
+             if not data_partition_column_values:
+                 # Partition column values are the same for all rows. Hence, only read once.
+                 for i, val in enumerate(data_partition_column_indices):  # Only partition columns are
+                     data_partition_column_values.append(
+                         convert_to_type(values[val], typee=data_partition_column_types[i])
+                     )
+
+                 # Prepare the corresponding model file name and extract model.
+                 partition_join = "_".join([str(x) for x in data_partition_column_values])
+                 # Replace '-' with '_' because partition column values can be negative and contain '-'.
+                 partition_join = partition_join.replace("-", "_")
+
+                 train_set = params.get("train_set")  # Gets file name prefix.
+                 model_file_path = f"{train_set}_{partition_join}"\
+                     if is_lake_system else \
+                     f"./{db}/{train_set}_{partition_join}"
+
+                 with open(model_file_path, "rb") as fp:
+                     params["train_set"] = pickle.loads(fp.read())
+
+                 valid_sets = params.get("valid_sets", None)  # Gets file names prefix.
+                 if valid_sets:
+                     params["valid_sets"] = []
+                     for valid_set in valid_sets:
+                         model_file_path = f"{valid_set}_{partition_join}"\
+                             if is_lake_system else \
+                             f"./{db}/{valid_set}_{partition_join}"
+                         with open(model_file_path, "rb") as fp:
+                             params["valid_sets"].append(pickle.load(fp))
+
+     except EOFError:  # Exit if reached EOF or CTRL-D
+         break
+
+ if not data_present:
+     sys.exit(0)
+
+ # Handle callbacks.
+ rec_eval = None
+ if "callbacks" in params and params["callbacks"] is not None:
+     callbacks = params["callbacks"]
+     callbacks = [callbacks] if not isinstance(callbacks, list) else callbacks
+     for i, callback in enumerate(callbacks):
+         c_module_name = callback["module"]
+         c_func_name = callback["func_name"]
+         c_kwargs = callback["kwargs"]
+         c_module = importlib.import_module(c_module_name)
+         if c_func_name == "record_evaluation":
+             # record_evaluation function takes empty dict. If the argument has elements in the
+             # dict, they will be deleted as per the documentation from lightgbm as described below:
+             # eval_result (dict) -
+             # Dictionary used to store all evaluation results of all validation sets. This should
+             # be initialized outside of your call to record_evaluation() and should be empty. Any
+             # initial contents of the dictionary will be deleted.
+             rec_eval = {}
+             callbacks[i] = getattr(c_module, c_func_name)(rec_eval)
+         else:
+             callbacks[i] = getattr(c_module, c_func_name)(**c_kwargs)
+
+     params["callbacks"] = callbacks
+
+ module_ = importlib.import_module(module_name)
+
+ ### LightGBM training is giving some meaningful console output like this:
+ ### Hence, capturing it to show to the user.
+
+ # [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000190 seconds.
+ # You can set `force_row_wise=true` to remove the overhead.
+ # And if memory is not enough, you can set `force_col_wise=true`.
+ # [LightGBM] [Info] Total Bins 136
+ # [LightGBM] [Info] Number of data points in the train set: 97, number of used features: 4
+ # [LightGBM] [Info] Start training from score 0.556701
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [1] valid_0's l2: 0.219637 valid_1's l2: 0.219637
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [2] valid_0's l2: 0.196525 valid_1's l2: 0.196525
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [3] valid_0's l2: 0.178462 valid_1's l2: 0.178462
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [4] valid_0's l2: 0.162887 valid_1's l2: 0.162887
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [5] valid_0's l2: 0.150271 valid_1's l2: 0.150271
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [6] valid_0's l2: 0.140219 valid_1's l2: 0.140219
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [7] valid_0's l2: 0.131697 valid_1's l2: 0.131697
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [8] valid_0's l2: 0.124056 valid_1's l2: 0.124056
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [9] valid_0's l2: 0.117944 valid_1's l2: 0.117944
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [10] valid_0's l2: 0.11263 valid_1's l2: 0.11263
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [11] valid_0's l2: 0.105228 valid_1's l2: 0.105228
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [12] valid_0's l2: 0.0981571 valid_1's l2: 0.0981571
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [13] valid_0's l2: 0.0924294 valid_1's l2: 0.0924294
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [14] valid_0's l2: 0.0877899 valid_1's l2: 0.0877899
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [15] valid_0's l2: 0.084032 valid_1's l2: 0.084032
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [16] valid_0's l2: 0.080988 valid_1's l2: 0.080988
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [17] valid_0's l2: 0.0785224 valid_1's l2: 0.0785224
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [18] valid_0's l2: 0.0765253 valid_1's l2: 0.0765253
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [19] valid_0's l2: 0.0750803 valid_1's l2: 0.0750803
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [20] valid_0's l2: 0.0738915 valid_1's l2: 0.0738915
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [21] valid_0's l2: 0.07288 valid_1's l2: 0.07288
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [22] valid_0's l2: 0.0718676 valid_1's l2: 0.0718676
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [23] valid_0's l2: 0.0706037 valid_1's l2: 0.0706037
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [24] valid_0's l2: 0.0695799 valid_1's l2: 0.0695799
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [25] valid_0's l2: 0.0687507 valid_1's l2: 0.0687507
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [26] valid_0's l2: 0.0680819 valid_1's l2: 0.0680819
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [27] valid_0's l2: 0.0674077 valid_1's l2: 0.0674077
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [28] valid_0's l2: 0.0665111 valid_1's l2: 0.0665111
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [29] valid_0's l2: 0.0659656 valid_1's l2: 0.0659656
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [30] valid_0's l2: 0.0652665 valid_1's l2: 0.0652665
+ result = ""
+ stdout = None
+ try:
+     stdout = sys.stdout
+     new_stdout = io.StringIO()
+     sys.stdout = new_stdout
+     trained_model = getattr(module_, func_name)(**params)
+     result = new_stdout.getvalue()
+ except Exception:
+     raise
+ finally:
+     sys.stdout = stdout
+
+ model_str = pickle.dumps(trained_model)
+ console_output_str = result.encode()
+
+ if is_lake_system:
+     model_file_path = f"/tmp/{model_file_prefix}_{partition_join}.pickle"
+     model_console_output_path = f"/tmp/{model_file_prefix}_{partition_join}_console_output.pickle"
+
+     # Write to file in Vantage, to be used in predict/scoring.
+     with open(model_file_path, "wb") as fp:
+         fp.write(model_str)
+
+     with open(model_console_output_path, "wb") as fpc:
+         fpc.write(console_output_str)
+
+
+ model_data = model_file_path if is_lake_system else base64.b64encode(model_str)
+ console_output = model_console_output_path if is_lake_system else base64.b64encode(console_output_str)
+
+ output_data = [model_data, console_output]
+
+ if rec_eval is not None:
+     rec_eval = pickle.dumps(rec_eval)
+     if is_lake_system:
+         rec_eval_file_path = f"/tmp/{model_file_prefix}_{partition_join}_rec_eval.pickle"
+
+         with open(rec_eval_file_path, "wb") as fp:
+             fp.write(rec_eval)
+
+     rec_eval = rec_eval_file_path if is_lake_system else base64.b64encode(rec_eval)
+
+     output_data.append(rec_eval)
+
+ print(*(data_partition_column_values + output_data), sep=DELIMITER)
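For reference, a minimal sketch (not package code) of the callback rehydration used above: each callback arrives as a {"module", "func_name", "kwargs"} dict and is rebuilt with importlib and getattr, with record_evaluation given a fresh dict so the evaluation history can be shipped back afterwards. It assumes lightgbm is installed; the kwargs are illustrative.

    import importlib

    serialized_callbacks = [
        {"module": "lightgbm", "func_name": "record_evaluation", "kwargs": {}},
        {"module": "lightgbm", "func_name": "early_stopping", "kwargs": {"stopping_rounds": 5}},
    ]

    rec_eval = None
    callbacks = []
    for cb in serialized_callbacks:
        factory = getattr(importlib.import_module(cb["module"]), cb["func_name"])
        if cb["func_name"] == "record_evaluation":
            rec_eval = {}                        # must start empty, per the lightgbm docs quoted above
            callbacks.append(factory(rec_eval))
        else:
            callbacks.append(factory(**cb["kwargs"]))

    # callbacks can now be passed to the training call; after training,
    # rec_eval holds the recorded metrics for each validation set.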
@@ -0,0 +1,159 @@
+ import sys, json
+ import pickle, base64, importlib, numpy as np
+ from collections import OrderedDict
+
+ DELIMITER = '\t'
+
+ func_name = <func_name>
+ params = json.loads('<params>')
+ is_lake_system = <is_lake_system>
+ model_file_prefix = <model_file_prefix>
+
+ def convert_to_type(val, typee):
+     if typee == 'int':
+         return int(val) if val != "" else np.nan
+     if typee == 'float':
+         if isinstance(val, str):
+             val = val.replace(' ', '')
+         return float(val) if val != "" else np.nan
+     if typee == 'bool':
+         return eval(val) if val != "" else None
+     return str(val) if val != "" else None
+
+ def splitter(strr, delim=",", convert_to="str"):
+     """
+     Split the string based on delimiter and convert to the type specified.
+     """
+     if strr == "None":
+         return []
+     return [convert_to_type(i, convert_to) for i in strr.split(delim)]
+
+
+ if not is_lake_system:
+     db = sys.argv[0].split("/")[1]
+
+ data_partition_column_indices = <partition_cols_indices>
+ data_column_types = <types_of_data_cols>
+
+ data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
+
+ # Data related arguments information of indices and types.
+ data_args_indices_types = OrderedDict()
+
+ # Data related arguments values - prepare dictionary and populate data later.
+ data_args_values = {}
+
+ data_args_info_str = <data_args_info_str>
+ for data_arg in data_args_info_str.split("--"):
+     arg_name, indices, types = data_arg.split("-")
+     indices = splitter(indices, convert_to="int")
+     types = splitter(types)
+
+     data_args_indices_types[arg_name] = {"indices": indices, "types": types}
+     data_args_values[arg_name] = []  # Keeping empty for each data arg name and populate data later.
+
+ data_partition_column_values = []
+ data_present = False
+
+ model = None
+
+ # Read data - columns information is passed as command line argument and stored in
+ # data_args_indices_types dictionary.
+ while 1:
+     try:
+         line = input()
+         if line == '':  # Exit if user provides blank line
+             break
+         else:
+             data_present = True
+             values = line.split(DELIMITER)
+             if not data_partition_column_values:
+                 # Partition column values are the same for all rows. Hence, only read once.
+                 for i, val in enumerate(data_partition_column_indices):
+                     data_partition_column_values.append(
+                         convert_to_type(values[val], typee=data_partition_column_types[i])
+                     )
+
+                 # Prepare the corresponding model file name and extract model.
+                 partition_join = "_".join([str(x) for x in data_partition_column_values])
+                 # Replace '-' with '_' because partition column values can be negative.
+                 partition_join = partition_join.replace("-", "_")
+
+
+                 model_file_path = f"{model_file_prefix}_{partition_join}"\
+                     if is_lake_system else \
+                     f"./{db}/{model_file_prefix}_{partition_join}"
+
+                 with open(model_file_path, "rb") as fp:
+                     model = pickle.loads(fp.read())
+
+             if model is None:
+                 sys.exit("Model file is not installed in Vantage.")
+
+             # Prepare data dictionary containing only arguments related to data.
+             for arg_name in data_args_values:
+                 data_indices = data_args_indices_types[arg_name]["indices"]
+                 types = data_args_indices_types[arg_name]["types"]
+                 cur_row = []
+                 for idx, data_idx in enumerate(data_indices):
+                     cur_row.append(convert_to_type(values[data_idx], types[idx]))
+                 data_args_values[arg_name].append(cur_row)
+     except EOFError:  # Exit if reached EOF or CTRL-D
+         break
+
+ if not data_present:
+     sys.exit(0)
+
+ # Handle callbacks.
+ rec_eval = None
+ if "callbacks" in params and params["callbacks"] is not None:
+     callbacks = params["callbacks"]
+     callbacks = [callbacks] if not isinstance(callbacks, list) else callbacks
+     for i, callback in enumerate(callbacks):
+         c_module_name = callback["module"]
+         c_func_name = callback["func_name"]
+         c_kwargs = callback["kwargs"]
+         c_module = importlib.import_module(c_module_name)
+         if c_func_name == "record_evaluation":
+             # record_evaluation function takes empty dict. If the argument has elements in the
+             # dict, they will be deleted as per the documentation from lightgbm as described below:
+             # eval_result (dict) -
+             # Dictionary used to store all evaluation results of all validation sets. This should
+             # be initialized outside of your call to record_evaluation() and should be empty. Any
+             # initial contents of the dictionary will be deleted.
+             rec_eval = {}
+             callbacks[i] = getattr(c_module, c_func_name)(rec_eval)
+         else:
+             callbacks[i] = getattr(c_module, c_func_name)(**c_kwargs)
+
+     params["callbacks"] = callbacks
+
+ # Update data as numpy arrays.
+ for arg_name in data_args_values:
+     np_values = np.array(data_args_values[arg_name])
+     data_args_values[arg_name] = np_values
+     if arg_name == "sample_weight":
+         data_args_values[arg_name] = np_values.ravel()
+
+ # Combine all arguments.
+ all_args = {**data_args_values, **params}
+
+ trained_model = getattr(model, func_name)(**all_args)
+
+ model_data = 0
+ if func_name == "fit":
+     model_str = pickle.dumps(trained_model)
+
+     if is_lake_system:
+         model_file_path = f"/tmp/{model_file_prefix}_{partition_join}.pickle"
+
+         # Write to file in Vantage, to be used in predict/scoring.
+         with open(model_file_path, "wb") as fp:
+             fp.write(model_str)
+
+     model_data = model_file_path if is_lake_system else base64.b64encode(model_str)
+
+ elif func_name == "score":
+     model_data = trained_model
+
+ print(*(data_partition_column_values + [model_data]), sep=DELIMITER)
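All three scripts above share the same row protocol: rows arrive on stdin as tab-delimited text, empty fields are mapped to missing values, and results are printed back tab-delimited for the database to consume. A minimal standalone sketch of that loop, with a hypothetical column layout:

    import math
    import sys

    DELIMITER = "\t"
    COLUMN_TYPES = ["float", "float", "int"]   # hypothetical column layout

    def convert(val, typee):
        # Mirrors convert_to_type() above: empty string -> missing value.
        if typee in ("int", "float"):
            return float(val) if val != "" else math.nan
        return val if val != "" else None

    for line in sys.stdin:
        line = line.rstrip("\n")
        if line == "":
            break
        values = [convert(v, t) for v, t in zip(line.split(DELIMITER), COLUMN_TYPES)]
        # ... apply the model / transformation here ...
        print(*values, sep=DELIMITER)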