teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (126)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +315 -2
  3. teradataml/__init__.py +4 -0
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +95 -8
  6. teradataml/analytics/byom/__init__.py +1 -1
  7. teradataml/analytics/json_parser/metadata.py +12 -3
  8. teradataml/analytics/json_parser/utils.py +7 -2
  9. teradataml/analytics/sqle/__init__.py +5 -1
  10. teradataml/analytics/table_operator/__init__.py +1 -1
  11. teradataml/analytics/uaf/__init__.py +1 -1
  12. teradataml/analytics/utils.py +4 -0
  13. teradataml/analytics/valib.py +18 -4
  14. teradataml/automl/__init__.py +51 -6
  15. teradataml/automl/data_preparation.py +59 -35
  16. teradataml/automl/data_transformation.py +58 -33
  17. teradataml/automl/feature_engineering.py +27 -12
  18. teradataml/automl/model_training.py +73 -46
  19. teradataml/common/constants.py +88 -29
  20. teradataml/common/garbagecollector.py +2 -1
  21. teradataml/common/messagecodes.py +19 -3
  22. teradataml/common/messages.py +6 -1
  23. teradataml/common/sqlbundle.py +64 -12
  24. teradataml/common/utils.py +246 -47
  25. teradataml/common/warnings.py +11 -0
  26. teradataml/context/context.py +161 -27
  27. teradataml/data/amazon_reviews_25.csv +26 -0
  28. teradataml/data/byom_example.json +11 -0
  29. teradataml/data/dataframe_example.json +18 -2
  30. teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
  31. teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
  32. teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
  33. teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
  34. teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
  35. teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
  36. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
  37. teradataml/data/docs/sqle/docs_17_20/Shap.py +34 -6
  38. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
  39. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  40. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  41. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  42. teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
  43. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  44. teradataml/data/hnsw_alter_data.csv +5 -0
  45. teradataml/data/hnsw_data.csv +10 -0
  46. teradataml/data/jsons/byom/h2opredict.json +1 -1
  47. teradataml/data/jsons/byom/onnxembeddings.json +266 -0
  48. teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
  49. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  50. teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
  51. teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
  52. teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
  53. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  54. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  55. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  56. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  57. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  58. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  59. teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
  60. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
  61. teradataml/data/medical_readings.csv +101 -0
  62. teradataml/data/patient_profile.csv +101 -0
  63. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  64. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  65. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  66. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  67. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
  68. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  69. teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
  70. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  71. teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
  72. teradataml/data/target_udt_data.csv +8 -0
  73. teradataml/data/templates/open_source_ml.json +3 -2
  74. teradataml/data/teradataml_example.json +8 -0
  75. teradataml/data/vectordistance_example.json +4 -0
  76. teradataml/dataframe/copy_to.py +8 -3
  77. teradataml/dataframe/data_transfer.py +11 -1
  78. teradataml/dataframe/dataframe.py +1049 -285
  79. teradataml/dataframe/dataframe_utils.py +152 -20
  80. teradataml/dataframe/functions.py +578 -35
  81. teradataml/dataframe/setop.py +11 -6
  82. teradataml/dataframe/sql.py +185 -16
  83. teradataml/dbutils/dbutils.py +1049 -115
  84. teradataml/dbutils/filemgr.py +48 -1
  85. teradataml/hyperparameter_tuner/optimizer.py +12 -1
  86. teradataml/lib/aed_0_1.dll +0 -0
  87. teradataml/opensource/__init__.py +1 -1
  88. teradataml/opensource/_base.py +1466 -0
  89. teradataml/opensource/_class.py +464 -0
  90. teradataml/opensource/{sklearn/constants.py → _constants.py} +21 -14
  91. teradataml/opensource/_lightgbm.py +949 -0
  92. teradataml/opensource/_sklearn.py +1008 -0
  93. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +5 -6
  94. teradataml/options/__init__.py +54 -38
  95. teradataml/options/configure.py +131 -27
  96. teradataml/options/display.py +13 -2
  97. teradataml/plot/axis.py +47 -8
  98. teradataml/plot/figure.py +33 -0
  99. teradataml/plot/plot.py +63 -13
  100. teradataml/scriptmgmt/UserEnv.py +5 -5
  101. teradataml/scriptmgmt/lls_utils.py +130 -40
  102. teradataml/store/__init__.py +12 -0
  103. teradataml/store/feature_store/__init__.py +0 -0
  104. teradataml/store/feature_store/constants.py +291 -0
  105. teradataml/store/feature_store/feature_store.py +2318 -0
  106. teradataml/store/feature_store/models.py +1505 -0
  107. teradataml/table_operators/Apply.py +32 -18
  108. teradataml/table_operators/Script.py +3 -1
  109. teradataml/table_operators/TableOperator.py +3 -1
  110. teradataml/table_operators/query_generator.py +3 -0
  111. teradataml/table_operators/table_operator_query_generator.py +3 -1
  112. teradataml/table_operators/table_operator_util.py +37 -38
  113. teradataml/table_operators/templates/dataframe_register.template +69 -0
  114. teradataml/utils/dtypes.py +51 -2
  115. teradataml/utils/internal_buffer.py +18 -0
  116. teradataml/utils/validators.py +99 -8
  117. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/METADATA +321 -5
  118. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/RECORD +121 -94
  119. teradataml/libaed_0_1.dylib +0 -0
  120. teradataml/libaed_0_1.so +0 -0
  121. teradataml/opensource/sklearn/__init__.py +0 -1
  122. teradataml/opensource/sklearn/_class.py +0 -255
  123. teradataml/opensource/sklearn/_sklearn_wrapper.py +0 -1800
  124. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/WHEEL +0 -0
  125. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/top_level.txt +0 -0
  126. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.4.dist-info}/zip-safe +0 -0
teradataml/data/scripts/sklearn/sklearn_fit.py

@@ -2,177 +2,204 @@ import sys
 import numpy as np
 import pickle
 import base64
+import os
+from contextlib import contextmanager
 
 DELIMITER = '\t'
 
-def get_values_list(values, types, model_obj):
-    ret_vals = []
-    for i, val in enumerate(values):
-        if type(model_obj).__name__ == "MultiLabelBinarizer" and val == "":
-            continue
-        ret_vals.append(convert_to_type(val, types[i]))
-    return ret_vals
-
-def convert_to_type(val, typee):
-    if typee == 'int':
-        return int(val) if val != "" else np.nan
-    if typee == 'float':
-        if isinstance(val, str):
-            val = val.replace(' ', '')
-        return float(val) if val != "" else np.nan
-    if typee == 'bool':
-        return eval(val) if val != "" else None
-    return str(val) if val != "" else None
-
-def get_classes_as_list(classes, actual_type):
-    if classes == "None":
-        return None
-    if actual_type == "None":
-        sys.exit("type of class elements is None where class elements exists.")
-
-    # separated by '--'
-    classes = classes.split("--")
-
-    for idx, cls in enumerate(classes):
-        classes[idx] = convert_to_type(cls, actual_type)
-
-    return classes
-
-
-def splitter(strr, delim=",", convert_to="str"):
+
+@contextmanager
+def suppress_stderr():
     """
-    Split the string based on delimiter and convert to the type specified.
+    Function to suppress the warnings(lake systems treats warnings as errors).
     """
-    if strr == "None":
-        return []
-    return [convert_to_type(i, convert_to) for i in strr.split(delim)]
-
-# Arguments to the Script
-if len(sys.argv) != 10:
-    # 10 arguments command line arguments should be passed to this file.
-    # 1: file to be run
-    # 2. function name
-    # 3. No of feature columns.
-    # 4. No of class labels.
-    # 5. Comma separated indices of partition columns.
-    # 6. Comma separated types of all the data columns.
-    # 7. Model file prefix to generated model file using partition columns.
-    # 8. classes (separated by '--') - should be converted to list. "None" if no classes exists.
-    # 9. type of elements in passed in classes. "None" if no classes exists.
-    # 10. Flag to check the system type. True, means Lake, Enterprise otherwise
-    sys.exit("10 arguments command line arguments should be passed: file to be run,"
-             " function name, no of feature columns, no of class labels, comma separated indices"
-             " of partition columns, comma separated types of all columns, model file prefix ,"
-             " classes, type of elements in classes and flag to check lake or enterprise.")
-
-is_lake_system = eval(sys.argv[9])
-if not is_lake_system:
-    db = sys.argv[0].split("/")[1]
-function_name = sys.argv[1]
-n_f_cols = int(sys.argv[2])
-n_c_labels = int(sys.argv[3])
-data_column_types = splitter(sys.argv[5], delim="--")
-data_partition_column_indices = splitter(sys.argv[4], convert_to="int") # indices are integers.
-model_file_prefix = sys.argv[6]
-class_type = sys.argv[8]
-classes = get_classes_as_list(sys.argv[7], class_type)
-
-data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
-
-model = None
-
-# Data Format (n_features, k_labels, one data_partition_column):
-# feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
-# data_partition_columnn
-# There can be no labels also.
-
-# Read data from table through STO and build features and labels.
-features = []
-labels = []
-data_partition_column_values = []
-
-
-while 1:
-    try:
-        line = input()
-        if line == '': # Exit if user provides blank line
+    with open(os.devnull, "w") as devnull:
+        old_stderr = sys.stderr
+        sys.stderr = devnull
+        try:
+            yield
+        finally:
+            sys.stderr = old_stderr
+
+## On Lake system warnings raised by script are treated as a errors.
+## Hence, to suppress it putting the under suppress_stderr().
+with suppress_stderr():
+    def get_values_list(values, types, model_obj):
+        ret_vals = []
+        for i, val in enumerate(values):
+            if type(model_obj).__name__ == "MultiLabelBinarizer" and val == "":
+                continue
+            ret_vals.append(convert_to_type(val, types[i]))
+        return ret_vals
+
+    def convert_to_type(val, typee):
+        if typee == 'int':
+            return int(val) if val != "" else np.nan
+        if typee == 'float':
+            if isinstance(val, str):
+                val = val.replace(' ', '')
+            return float(val) if val != "" else np.nan
+        if typee == 'bool':
+            return eval(val) if val != "" else None
+        return str(val) if val != "" else None
+
+    def get_classes_as_list(classes, actual_type):
+        if classes == "None":
+            return None
+        if actual_type == "None":
+            sys.exit("type of class elements is None where class elements exists.")
+
+        # separated by '--'
+        classes = classes.split("--")
+
+        for idx, cls in enumerate(classes):
+            classes[idx] = convert_to_type(cls, actual_type)
+
+        return classes
+
+
+    def splitter(strr, delim=",", convert_to="str"):
+        """
+        Split the string based on delimiter and convert to the type specified.
+        """
+        if strr == "None":
+            return []
+        return [convert_to_type(i, convert_to) for i in strr.split(delim)]
+
+    # Arguments to the Script
+    if len(sys.argv) != 10:
+        # 10 arguments command line arguments should be passed to this file.
+        # 1: file to be run
+        # 2. function name
+        # 3. No of feature columns.
+        # 4. No of class labels.
+        # 5. Comma separated indices of partition columns.
+        # 6. Comma separated types of all the data columns.
+        # 7. Model file prefix to generated model file using partition columns.
+        # 8. classes (separated by '--') - should be converted to list. "None" if no classes exists.
+        # 9. type of elements in passed in classes. "None" if no classes exists.
+        # 10. Flag to check the system type. True, means Lake, Enterprise otherwise
+        sys.exit("10 arguments command line arguments should be passed: file to be run,"
+                 " function name, no of feature columns, no of class labels, comma separated indices"
+                 " of partition columns, comma separated types of all columns, model file prefix ,"
+                 " classes, type of elements in classes and flag to check lake or enterprise.")
+
+    is_lake_system = eval(sys.argv[9])
+    if not is_lake_system:
+        db = sys.argv[0].split("/")[1]
+    function_name = sys.argv[1]
+    n_f_cols = int(sys.argv[2])
+    n_c_labels = int(sys.argv[3])
+    data_column_types = splitter(sys.argv[5], delim="--")
+    data_partition_column_indices = splitter(sys.argv[4], convert_to="int") # indices are integers.
+    model_file_prefix = sys.argv[6]
+    class_type = sys.argv[8]
+    classes = get_classes_as_list(sys.argv[7], class_type)
+
+    data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
+
+    model = None
+
+    # Data Format (n_features, k_labels, one data_partition_column):
+    # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
+    # data_partition_columnn
+    # There can be no labels also.
+
+    # Read data from table through STO and build features and labels.
+    features = []
+    labels = []
+    data_partition_column_values = []
+
+
+    while 1:
+        try:
+            line = input()
+            if line == '': # Exit if user provides blank line
+                break
+            else:
+                values = line.split(DELIMITER)
+
+                if not data_partition_column_values:
+                    # Partition column values is same for all rows. Hence, only read once.
+                    for i, val in enumerate(data_partition_column_indices):
+                        data_partition_column_values.append(
+                            convert_to_type(values[val], typee=data_partition_column_types[i])
+                        )
+
+                    # Prepare the corresponding model file name and extract model.
+                    partition_join = "_".join([str(x) for x in data_partition_column_values])
+                    # Replace '-' with '_' as '-' because partition_columns can be negative.
+                    partition_join = partition_join.replace("-", "_")
+
+                    model_file_path = f"{model_file_prefix}_{partition_join}"\
+                        if is_lake_system else \
+                        f"./{db}/{model_file_prefix}_{partition_join}"
+
+                    with open(model_file_path, "rb") as fp:
+                        model = pickle.loads(fp.read())
+
+                    if model is None:
+                        sys.exit("Model file is not installed in Vantage.")
+
+                values = get_values_list(values, data_column_types, model)
+                values = values[:-len(data_partition_column_indices)] # Already processed partition columns.
+                features.append(values[:n_f_cols])
+                if n_c_labels > 0:
+                    labels.append(values[n_f_cols:(n_f_cols+n_c_labels)])
+
+
+        except EOFError: # Exit if reached EOF or CTRL-D
             break
-        else:
-            values = line.split(DELIMITER)
-
-            if not data_partition_column_values:
-                # Partition column values is same for all rows. Hence, only read once.
-                for i, val in enumerate(data_partition_column_indices):
-                    data_partition_column_values.append(
-                        convert_to_type(values[val], typee=data_partition_column_types[i])
-                    )
-
-                # Prepare the corresponding model file name and extract model.
-                partition_join = "_".join([str(x) for x in data_partition_column_values])
-                # Replace '-' with '_' as '-' because partition_columns can be negative.
-                partition_join = partition_join.replace("-", "_")
-
-                model_file_path = f"{model_file_prefix}_{partition_join}"\
-                    if is_lake_system else \
-                    f"./{db}/{model_file_prefix}_{partition_join}"
-
-                with open(model_file_path, "rb") as fp:
-                    model = pickle.loads(fp.read())
-
-                if model is None:
-                    sys.exit("Model file is not installed in Vantage.")
-
-            values = get_values_list(values, data_column_types, model)
-            values = values[:-len(data_partition_column_indices)] # Already processed partition columns.
-            features.append(values[:n_f_cols])
-            if n_c_labels > 0:
-                labels.append(values[n_f_cols:(n_f_cols+n_c_labels)])
-
-
-    except EOFError: # Exit if reached EOF or CTRL-D
-        break
-
-if not len(features):
-    sys.exit(0)
-
-# Fit/partial_fit the model to the data.
-if function_name == "partial_fit":
-    if labels and classes:
-        model.partial_fit(features, labels, classes=classes)
-    elif labels:
-        model.partial_fit(features, labels)
-    elif classes:
-        model.partial_fit(features, classes=classes)
-    else:
-        model.partial_fit(features)
-elif function_name == "fit":
+
+    if not len(features):
+        sys.exit(0)
+
+    # Fit/partial_fit the model to the data.
     model_name = model.__class__.__name__
-    np_func_list = ["OneVsRestClassifier", "LabelBinarizer", "TSNE"]
-    if labels:
-        # For IsotonicRegression, fit() accepts training target as
-        # y: array-like of shape (n_samples,).
-        if model_name in ["IsotonicRegression", "LinearSVC"]:
-            labels = np.array(labels).reshape(-1)
-        if model_name in np_func_list:
-            labels = np.array(labels)
-            features = np.array(features)
-        model.fit(features, labels)
-    else:
-        if model_name in np_func_list:
-            features = np.array(features)
-        model.fit(features)
-
-model_str = pickle.dumps(model)
-
-if is_lake_system:
-    model_file_path = f"/tmp/{model_file_prefix}_{partition_join}.pickle"
-
-    # Write to file in Vantage, to be used in predict/scoring.
-    with open(model_file_path, "wb") as fp:
-        fp.write(model_str)
-
-model_data = model_file_path if is_lake_system \
-    else base64.b64encode(model_str)
-
-# Print the model to be read from script.
-print(*(data_partition_column_values + [model_data]), sep=DELIMITER)
+    if function_name == "partial_fit":
+        if labels and classes:
+            if model_name == "SelectFromModel":
+                features = np.array(features)
+                classes = np.array(classes)
+                labels = np.array(labels).ravel()
+            model.partial_fit(features, labels, classes=classes)
+        elif labels:
+            model.partial_fit(features, labels)
+        elif classes:
+            model.partial_fit(features, classes=classes)
+        else:
+            model.partial_fit(features)
+    elif function_name == "fit":
+        np_func_list = ["OneVsRestClassifier", "LabelBinarizer", "TSNE"]
+        if labels:
+            # For IsotonicRegression, fit() accepts training target as
+            # y: array-like of shape (n_samples,).
+            if model_name in ["CalibratedClassifierCV", "GaussianProcessClassifier", "GenericUnivariateSelect",
+                              "IsotonicRegression", "LinearSVC", "GridSearchCV", "LinearDiscriminantAnalysis", "RFECV",
+                              "RFE", "RandomizedSearchCV", "SelectFdr", "SelectFpr", "SelectFromModel", "SelectFwe",
+                              "SelectKBest", "SelectPercentile", "SequentialFeatureSelector", "GaussianNB",
+                              "QuadraticDiscriminantAnalysis"]:
+                labels = np.array(labels).reshape(-1)
+            if model_name in np_func_list:
+                labels = np.array(labels)
+                features = np.array(features)
+            model.fit(features, labels)
+        else:
+            if model_name in np_func_list:
+                features = np.array(features)
+            model.fit(features)
+
+    model_str = pickle.dumps(model)
+
+    if is_lake_system:
+        model_file_path = f"/tmp/{model_file_prefix}_{partition_join}.pickle"
+
+        # Write to file in Vantage, to be used in predict/scoring.
+        with open(model_file_path, "wb") as fp:
+            fp.write(model_str)
+
+    model_data = model_file_path if is_lake_system \
+        else base64.b64encode(model_str)
+
+    # Print the model to be read from script.
+    print(*(data_partition_column_values + [model_data]), sep=DELIMITER)
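The dominant change in this hunk is that the whole script body now runs inside the new suppress_stderr() context manager, since lake systems treat anything a script writes to stderr (for example scikit-learn convergence warnings) as a script error. A minimal standalone sketch of the same pattern, outside the package (the print calls are illustrative only):

import os
import sys
from contextlib import contextmanager

@contextmanager
def suppress_stderr():
    # Swap sys.stderr for os.devnull while the block runs, and restore
    # the original stream afterwards even if the body raises.
    with open(os.devnull, "w") as devnull:
        old_stderr = sys.stderr
        sys.stderr = devnull
        try:
            yield
        finally:
            sys.stderr = old_stderr

with suppress_stderr():
    print("discarded", file=sys.stderr)    # silenced inside the block
print("visible again", file=sys.stderr)    # stderr is restored here

Note the trade-off: any genuine diagnostics written to stderr inside the block are silenced as well; the script's sys.exit() calls still terminate it with a message because SystemExit propagates out of the block after stderr is restored.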
teradataml/data/scripts/sklearn/sklearn_fit_predict.py

@@ -2,126 +2,147 @@ import sys
 import numpy as np
 import pickle
 import math
+import os
+from contextlib import contextmanager
 
 DELIMITER = '\t'
 
-def get_values_list(values, types):
-    ret_vals = []
-    for i, val in enumerate(values):
-        ret_vals.append(convert_to_type(val, types[i]))
-    return ret_vals
-
-def convert_to_type(val, typee):
-    if typee == 'int':
-        return int(val) if val != "" else np.nan
-    if typee == 'float':
-        if isinstance(val, str):
-            val = val.replace(' ', '')
-        return float(val) if val != "" else np.nan
-    if typee == 'bool':
-        return eval(val) if val != "" else None
-    return str(val) if val != "" else None
-
-def splitter(strr, delim=",", convert_to="str"):
+@contextmanager
+def suppress_stderr():
     """
-    Split the string based on delimiter and convert to the type specified.
+    Function to suppress the warnings(lake systems treats warnings as errors).
     """
-    if strr == "None":
-        return []
-    return [convert_to_type(i, convert_to) for i in strr.split(delim)]
-
-# Arguments to the Script
-if len(sys.argv) != 7:
-    # 6 arguments command line arguments should be passed to this file.
-    # 1: file to be run
-    # 2. No of feature columns.
-    # 3. No of class labels.
-    # 4. Comma separated indices of partition columns.
-    # 5. Comma separated types of all the data columns.
-    # 6. Model file prefix to generated model file using partition columns.
-    # 7. Flag to check the system type. True, means Lake, Enterprise otherwise.
-    sys.exit("7 arguments should be passed to this file - file to be run, "\
-             "no of feature columns, no of class labels, comma separated indices of partition "
-             "columns, comma separated types of all columns, model file prefix to generate model "
-             "file using partition columns and flag to check lake or enterprise.")
-
-is_lake_system = eval(sys.argv[6])
-if not is_lake_system:
-    db = sys.argv[0].split("/")[1]
-n_f_cols = int(sys.argv[1])
-n_c_labels = int(sys.argv[2])
-model_file_prefix = sys.argv[5]
-data_column_types = splitter(sys.argv[4], delim="--")
-data_partition_column_indices = splitter(sys.argv[3], convert_to="int") # indices are integers.
-
-data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
-
-model = None
-
-# Data Format (n_features, k_labels, one data_partition_columns):
-# feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
-# data_partition_columnn.
-# There can be no labels also.
-
-# Read data from table through STO and build features and labels.
-features = []
-labels = []
-data_partition_column_values = []
-
-
-while 1:
-    try:
-        line = input()
-        if line == '': # Exit if user provides blank line
+    with open(os.devnull, "w") as devnull:
+        old_stderr = sys.stderr
+        sys.stderr = devnull
+        try:
+            yield
+        finally:
+            sys.stderr = old_stderr
+
+## On Lake system warnings raised by script are treated as a errors.
+## Hence, to suppress it putting the under suppress_stderr().
+with suppress_stderr():
+    def get_values_list(values, types):
+        ret_vals = []
+        for i, val in enumerate(values):
+            ret_vals.append(convert_to_type(val, types[i]))
+        return ret_vals
+
+    def convert_to_type(val, typee):
+        if typee == 'int':
+            return int(val) if val != "" else np.nan
+        if typee == 'float':
+            if isinstance(val, str):
+                val = val.replace(' ', '')
+            return float(val) if val != "" else np.nan
+        if typee == 'bool':
+            return eval(val) if val != "" else None
+        return str(val) if val != "" else None
+
+    def splitter(strr, delim=",", convert_to="str"):
+        """
+        Split the string based on delimiter and convert to the type specified.
+        """
+        if strr == "None":
+            return []
+        return [convert_to_type(i, convert_to) for i in strr.split(delim)]
+
+    # Arguments to the Script
+    if len(sys.argv) != 7:
+        # 6 arguments command line arguments should be passed to this file.
+        # 1: file to be run
+        # 2. No of feature columns.
+        # 3. No of class labels.
+        # 4. Comma separated indices of partition columns.
+        # 5. Comma separated types of all the data columns.
+        # 6. Model file prefix to generated model file using partition columns.
+        # 7. Flag to check the system type. True, means Lake, Enterprise otherwise.
+        sys.exit("7 arguments should be passed to this file - file to be run, "\
+                 "no of feature columns, no of class labels, comma separated indices of partition "
+                 "columns, comma separated types of all columns, model file prefix to generate model "
+                 "file using partition columns and flag to check lake or enterprise.")
+
+    is_lake_system = eval(sys.argv[6])
+    if not is_lake_system:
+        db = sys.argv[0].split("/")[1]
+    n_f_cols = int(sys.argv[1])
+    n_c_labels = int(sys.argv[2])
+    model_file_prefix = sys.argv[5]
+    data_column_types = splitter(sys.argv[4], delim="--")
+    data_partition_column_indices = splitter(sys.argv[3], convert_to="int") # indices are integers.
+
+    data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
+
+    model = None
+
+    # Data Format (n_features, k_labels, one data_partition_columns):
+    # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
+    # data_partition_columnn.
+    # There can be no labels also.
+
+    # Read data from table through STO and build features and labels.
+    features = []
+    labels = []
+    data_partition_column_values = []
+
+
+    while 1:
+        try:
+            line = input()
+            if line == '': # Exit if user provides blank line
+                break
+            else:
+                values = line.split(DELIMITER)
+                values = get_values_list(values, data_column_types)
+                features.append(values[:n_f_cols])
+                if n_c_labels > 0:
+                    labels.append(values[n_f_cols:(n_f_cols+n_c_labels)])
+                if not data_partition_column_values:
+                    # Partition column values is same for all rows. Hence, only read once.
+                    for i, val in enumerate(data_partition_column_indices):
+                        data_partition_column_values.append(
+                            convert_to_type(values[val], typee=data_partition_column_types[i])
+                        )
+
+                    # Prepare the corresponding model file name and extract model.
+                    partition_join = "_".join([str(x) for x in data_partition_column_values])
+                    # Replace '-' with '_' as '-' because partition_columns can be negative.
+                    partition_join = partition_join.replace("-", "_")
+
+                    model_file_path = f"{model_file_prefix}_{partition_join}" \
+                        if is_lake_system else \
+                        f"./{db}/{model_file_prefix}_{partition_join}"
+
+                    with open(model_file_path, "rb") as fp:
+                        model = pickle.loads(fp.read())
+
+                    if model is None:
+                        sys.exit("Model file is not installed in Vantage.")
+
+        except EOFError: # Exit if reached EOF or CTRL-D
             break
-        else:
-            values = line.split(DELIMITER)
-            values = get_values_list(values, data_column_types)
-            features.append(values[:n_f_cols])
-            if n_c_labels > 0:
-                labels.append(values[n_f_cols:(n_f_cols+n_c_labels)])
-            if not data_partition_column_values:
-                # Partition column values is same for all rows. Hence, only read once.
-                for i, val in enumerate(data_partition_column_indices):
-                    data_partition_column_values.append(
-                        convert_to_type(values[val], typee=data_partition_column_types[i])
-                    )
-
-                # Prepare the corresponding model file name and extract model.
-                partition_join = "_".join([str(x) for x in data_partition_column_values])
-                # Replace '-' with '_' as '-' because partition_columns can be negative.
-                partition_join = partition_join.replace("-", "_")
-
-                model_file_path = f"{model_file_prefix}_{partition_join}" \
-                    if is_lake_system else \
-                    f"./{db}/{model_file_prefix}_{partition_join}"
-
-                with open(model_file_path, "rb") as fp:
-                    model = pickle.loads(fp.read())
-
-                if model is None:
-                    sys.exit("Model file is not installed in Vantage.")
-
-    except EOFError: # Exit if reached EOF or CTRL-D
-        break
-
-if not len(features):
-    sys.exit(0)
-
-# write code to call fit_predict with features and labels when n_c_labels > 0
-if n_c_labels > 0:
-    predictions = model.fit_predict(features, labels)
-else:
-    predictions = model.fit_predict(features)
-
-# Export results to to the Databse through standard output
-for i in range(len(predictions)):
+
+    if not len(features):
+        sys.exit(0)
+
+    # write code to call fit_predict with features and labels when n_c_labels > 0
+    model_name = model.__class__.__name__
     if n_c_labels > 0:
-        # Add labels into output, if user passes it.
-        result_list = features[i] + labels[i] + [predictions[i]]
+        if model_name in ["SelectFromModel"]:
+            labels = np.array(labels).ravel()
+        predictions = model.fit_predict(features, labels)
     else:
-        result_list = features[i] + [predictions[i]]
-    print(*(data_partition_column_values +
-            ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val))))
-             else val for val in result_list]),
-          sep= DELIMITER)
+        predictions = model.fit_predict(features)
+
+    # Export results to to the Databse through standard output
+    for i in range(len(predictions)):
+        if n_c_labels > 0:
+            # Add labels into output, if user passes it.
+            result_list = features[i] + labels[i] + [predictions[i]]
+        else:
+            result_list = features[i] + [predictions[i]]
+        print(*(data_partition_column_values +
+                ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val))))
+                 else val for val in result_list]),
+              sep= DELIMITER)
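The other recurring change in both hunks is label-shape handling: rows are read one line at a time, so labels accumulate as a list of single-element lists, while scikit-learn estimators generally expect y as a 1-D array of shape (n_samples,). That is what the new np.array(labels).ravel() calls for SelectFromModel and the widened reshape(-1) model list in the fit branch address. A small illustrative snippet, with made-up sample values:

import numpy as np

# Labels are collected row by row, one single-element list per input line.
labels = [[0], [1], [0], [1]]           # becomes shape (4, 1) as an array

y = np.array(labels).ravel()            # -> array([0, 1, 0, 1]), shape (4,)
assert y.shape == (4,)

# reshape(-1), used in the sklearn_fit.py "fit" branch, flattens the same way.
assert np.array_equal(np.array(labels).reshape(-1), y)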