teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml might be problematic.

Files changed (88)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +196 -2
  3. teradataml/__init__.py +4 -0
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +79 -4
  6. teradataml/analytics/json_parser/metadata.py +12 -3
  7. teradataml/analytics/json_parser/utils.py +7 -2
  8. teradataml/analytics/sqle/__init__.py +1 -0
  9. teradataml/analytics/table_operator/__init__.py +1 -1
  10. teradataml/analytics/uaf/__init__.py +1 -1
  11. teradataml/analytics/utils.py +4 -0
  12. teradataml/automl/data_preparation.py +3 -2
  13. teradataml/automl/feature_engineering.py +15 -7
  14. teradataml/automl/model_training.py +39 -33
  15. teradataml/common/__init__.py +2 -1
  16. teradataml/common/constants.py +35 -0
  17. teradataml/common/garbagecollector.py +2 -1
  18. teradataml/common/messagecodes.py +8 -2
  19. teradataml/common/messages.py +3 -1
  20. teradataml/common/sqlbundle.py +25 -3
  21. teradataml/common/utils.py +134 -9
  22. teradataml/context/context.py +20 -10
  23. teradataml/data/SQL_Fundamentals.pdf +0 -0
  24. teradataml/data/dataframe_example.json +18 -2
  25. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
  26. teradataml/data/docs/sqle/docs_17_20/Shap.py +7 -1
  27. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
  28. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  29. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  30. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  31. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  32. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  33. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  34. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  35. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  36. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  37. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  38. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  39. teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
  40. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
  41. teradataml/data/medical_readings.csv +101 -0
  42. teradataml/data/patient_profile.csv +101 -0
  43. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  44. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  45. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  46. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  47. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
  48. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  49. teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
  50. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  51. teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
  52. teradataml/data/target_udt_data.csv +8 -0
  53. teradataml/data/templates/open_source_ml.json +3 -2
  54. teradataml/data/vectordistance_example.json +4 -0
  55. teradataml/dataframe/dataframe.py +543 -175
  56. teradataml/dataframe/functions.py +553 -25
  57. teradataml/dataframe/sql.py +184 -15
  58. teradataml/dbutils/dbutils.py +556 -18
  59. teradataml/dbutils/filemgr.py +48 -1
  60. teradataml/lib/aed_0_1.dll +0 -0
  61. teradataml/opensource/__init__.py +1 -1
  62. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  63. teradataml/opensource/_lightgbm.py +950 -0
  64. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  65. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  66. teradataml/opensource/sklearn/__init__.py +0 -1
  67. teradataml/opensource/sklearn/_sklearn_wrapper.py +798 -438
  68. teradataml/options/__init__.py +7 -23
  69. teradataml/options/configure.py +29 -3
  70. teradataml/scriptmgmt/UserEnv.py +3 -3
  71. teradataml/scriptmgmt/lls_utils.py +74 -21
  72. teradataml/store/__init__.py +13 -0
  73. teradataml/store/feature_store/__init__.py +0 -0
  74. teradataml/store/feature_store/constants.py +291 -0
  75. teradataml/store/feature_store/feature_store.py +2223 -0
  76. teradataml/store/feature_store/models.py +1505 -0
  77. teradataml/store/vector_store/__init__.py +1586 -0
  78. teradataml/table_operators/query_generator.py +3 -0
  79. teradataml/table_operators/table_operator_query_generator.py +3 -1
  80. teradataml/table_operators/table_operator_util.py +37 -38
  81. teradataml/table_operators/templates/dataframe_register.template +69 -0
  82. teradataml/utils/dtypes.py +4 -2
  83. teradataml/utils/validators.py +33 -1
  84. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +200 -5
  85. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +88 -65
  86. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  87. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  88. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
@@ -0,0 +1,247 @@
+ import base64
+ import io
+ import math
+ import os
+ import pickle
+ import sys
+
+ import numpy as np
+
+ DELIMITER = '\t'
+
+ def get_values_list(values, types):
+     ret_vals = []
+     for i, val in enumerate(values):
+         ret_vals.append(convert_to_type(val, types[i]))
+     return ret_vals
+
+ def convert_to_type(val, typee):
+     if typee == 'int':
+         return int(val) if val != "" else np.nan
+     if typee == 'float':
+         if isinstance(val, str):
+             val = val.replace(' ', '')
+         return float(val) if val != "" else np.nan
+     if typee == 'bool':
+         return eval(val) if val != "" else None
+     return str(val) if val != "" else None
+
+ def splitter(strr, delim=",", convert_to="str"):
+     """
+     Split the string based on delimiter and convert to the type specified.
+     """
+     if strr == "None":
+         return []
+     return [convert_to_type(i, convert_to) for i in strr.split(delim)]
+
+ def should_convert(t_val, py_type):
+     """
+     Function to check type of value and whether value is nan and infinity.
+     """
+     return not isinstance(t_val, eval(py_type)) and not math.isinf(t_val) and not math.isnan(t_val)
+
+ def convert_value(t_val, py_type):
+     """
+     Function to convert value to specified python type.
+     """
+     return convert_to_type(t_val, py_type) if should_convert(t_val, py_type) else t_val
+
+ # Process output returned by sklearn function.
+ def get_output_data(trans_values, func_name, n_c_labels, n_out_columns):
+     # Converting sparse matrix to dense array as sparse matrices are NOT
+     # supported in Vantage.
+     # module_name = model_obj.__module__.split("._")[0]
+
+     # Converting the translated values into the corresponding return column's
+     # python type.
+     if (return_columns_python_types is None or not isinstance(trans_values, np.ndarray)):
+         trans_values_list = trans_values
+     else:
+         # Conversion.
+         trans_values_list = []
+         for trans_value in trans_values.tolist():
+             if not isinstance(trans_value, list):
+                 trans_value = [trans_value]
+
+             converted_list = []
+             if len(return_columns_python_types) == len(trans_value):
+                 for t_val, py_type in zip(trans_value, return_columns_python_types):
+                     converted_list.append(convert_value(t_val, py_type))
+             # transform() has only one Python return type, but it can actually return more than one column.
+             else:
+                 for t_val in trans_value:
+                     converted_list.append(convert_value(t_val, return_columns_python_types[0]))
+
+             trans_values_list.append(converted_list)
+
+     if type(trans_values_list).__name__ in ["csr_matrix", "csc_matrix"]:
+         trans_values_list = trans_values_list.toarray()
+
+     if isinstance(trans_values_list[0], np.ndarray) \
+             or isinstance(trans_values_list[0], list) \
+             or isinstance(trans_values_list[0], tuple):
+         # Here, the value returned by sklearn function is list type.
+         opt_list = list(trans_values_list[0])
+
+         if len(opt_list) < n_out_columns:
+             # If the output list is less than the required number of columns, append
+             # empty strings to the list.
+             opt_list += [""] * (n_out_columns - len(opt_list))
+
+         return opt_list
+
+     # Only one element is returned by the function.
+     return [trans_values_list[0]]
+
+ # Arguments to the Script
+ if len(sys.argv) != 10:
+     # 10 command line arguments should be passed to this file.
+     # 1. File to be run.
+     # 2. Function name (e.g. predict, fit etc).
+     # 3. No of feature columns.
+     # 4. No of class labels.
+     # 5. Comma separated indices of partition columns.
+     # 6. Comma separated types of all the data columns.
+     # 7. Model file prefix to generate model file using partition columns.
+     # 8. Number of columns to be returned by the sklearn's transform function.
+     # 9. Flag to check the system type. True means Lake, Enterprise otherwise.
+     # 10. Python types of returned/transformed columns.
+     sys.exit("10 arguments should be passed to this file - file to be run, function name, "\
+              "no of feature columns, no of class labels, comma separated indices of partition "\
+              "columns, comma separated types of all columns, model file prefix to generate model "\
+              "file using partition columns, number of columns to be returned by sklearn's "\
+              "transform function, flag to check lake or enterprise and Python types of "\
+              "returned/transformed columns.")
+
+ is_lake_system = eval(sys.argv[8])
+ if not is_lake_system:
+     db = sys.argv[0].split("/")[1]
+ func_name = sys.argv[1]
+ n_f_cols = int(sys.argv[2])
+ n_c_labels = int(sys.argv[3])
+ data_column_types = splitter(sys.argv[5], delim="--")
+ data_partition_column_indices = splitter(sys.argv[4], convert_to="int")  # indices are integers.
+ model_file_prefix = sys.argv[6]
+ # sys.argv[9] will contain a string of python datatypes with '--'
+ # separator OR a single datatype OR None in string format.
+ ret_col_argv = sys.argv[9]
+ if ret_col_argv == "None":
+     return_columns_python_types = eval(ret_col_argv)
+ else:
+     return_columns_python_types = splitter(ret_col_argv, delim="--")
+
+ no_of_output_columns = int(sys.argv[7])
+
+ data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
+
+ model = None
+ data_partition_column_values = []
+
+ all_x_rows = []
+ all_y_rows = []
+
+ # Data Format:
+ # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
+ # data_partition_columnn.
+ # label is optional (it is present when label_exists is not "None")
+
+ model_name = ""
+ while 1:
+     try:
+         line = input()
+         if line == '':  # Exit if user provides blank line
+             break
+         else:
+             values = line.split(DELIMITER)
+             values = get_values_list(values, data_column_types)
+             if not data_partition_column_values:
+                 # Partition column values are the same for all rows. Hence, only read once.
+                 for i, val in enumerate(data_partition_column_indices):
+                     data_partition_column_values.append(
+                         convert_to_type(values[val], typee=data_partition_column_types[i])
+                     )
+
+                 # Prepare the corresponding model file name and extract model.
+                 partition_join = "_".join([str(x) for x in data_partition_column_values])
+                 # Replace '-' with '_' because partition column values can be negative.
+                 partition_join = partition_join.replace("-", "_")
+
+                 model_file_path = f"{model_file_prefix}_{partition_join}" \
+                     if is_lake_system else \
+                     f"./{db}/{model_file_prefix}_{partition_join}"
+
+                 with open(model_file_path, "rb") as fp:
+                     model = pickle.loads(fp.read())
+
+             if not model:
+                 sys.exit("Model file is not installed in Vantage.")
+
+             f_ = values[:n_f_cols]
+             f__ = np.array([f_])
+
+             if n_c_labels > 0:
+                 l_ = values[n_f_cols:n_f_cols+n_c_labels]
+                 l__ = np.array([l_])
+
+             if func_name == "refit":
+                 # refit() needs all data at once. Hence, read all data at once and call refit().
+                 all_x_rows.append(f_)
+                 all_y_rows.append(l_)
+                 continue
+
+             # Because `predict` function does not accept 'y' as input, we need to handle it separately.
+             if n_c_labels > 0 and func_name not in ["predict"]:
+                 # Labels are present in last column.
+                 trans_values = getattr(model, func_name)(f__, l__, **params)
+             else:
+                 # If class labels do not exist in data, don't read labels, read just features.
+                 trans_values = getattr(model, func_name)(f__, **params)
+
+             result_list = f_
+             if n_c_labels > 0 and func_name in ["predict", "decision_function"]:
+                 result_list += l_
+             result_list += get_output_data(trans_values=trans_values, func_name=func_name,
+                                            n_c_labels=n_c_labels, n_out_columns=no_of_output_columns)
+
+             for i, val in enumerate(result_list):
+                 if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))):
+                     result_list[i] = ""
+                 elif val == False:
+                     result_list[i] = 0
+                 elif val == True:
+                     result_list[i] = 1
+
+             print(*(data_partition_column_values + result_list), sep=DELIMITER)
+
+     except EOFError:  # Exit if reached EOF or CTRL-D
+         break
+
+
+ if func_name == "refit":
+     result = ""
+     stdout = None
+     try:
+         stdout = sys.stdout
+         new_stdout = io.StringIO()
+         sys.stdout = new_stdout
+         trained_model = getattr(model, func_name)(all_x_rows, all_y_rows, **params)
+         result = new_stdout.getvalue()
+     except Exception:
+         raise
+     finally:
+         sys.stdout = stdout
+
+     model_str = pickle.dumps(trained_model)
+
+
+     if is_lake_system:
+         model_file_path = f"/tmp/{model_file_prefix}_{partition_join}.pickle"
+
+         # Write the trained model file in Vantage.
+         with open(model_file_path, "wb") as fp:
+             fp.write(model_str)
+
+     model_data = model_file_path if is_lake_system else base64.b64encode(model_str)
+     console_output = base64.b64encode(result.encode())
+
+     print(*(data_partition_column_values + [model_data, console_output]), sep="..")
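For reference, a minimal standalone sketch (not part of the package) of the per-partition model hand-off the script above relies on: the file suffix is built from the partition column values, and on non-Lake (Enterprise) systems the pickled model is base64-encoded for transport. The prefix and values below are hypothetical.

    import base64
    import pickle

    partition_values = [7, -3]                       # hypothetical partition column values
    suffix = "_".join(str(v) for v in partition_values).replace("-", "_")
    model_file_name = f"my_model_prefix_{suffix}"    # hypothetical prefix -> "my_model_prefix_7__3"

    model = {"weights": [0.1, 0.2]}                  # stand-in for a fitted estimator
    encoded = base64.b64encode(pickle.dumps(model))  # Enterprise path: base64-encoded pickle bytes

    # Assumed reader side: reverse the same two steps to recover the object.
    restored = pickle.loads(base64.b64decode(encoded))
    assert restored == model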
@@ -0,0 +1,216 @@
+ import sys, json, io
+ import pickle, base64, importlib, numpy as np
+ from collections import OrderedDict
+
+ func_name = "<func_name>"
+ module_name = "<module_name>"
+ is_lake_system = <is_lake_system>
+ params = json.loads('<params>')
+ data_partition_column_indices = <partition_cols_indices>
+ data_partition_column_types = <partition_cols_types>
+ model_file_prefix = "<model_file_prefix>"  # Needed in case of lake system for writing model to /tmp.
+
+ DELIMITER = '\t'
+
+ def convert_to_type(val, typee):
+     if typee == 'int':
+         return int(val) if val != "" else np.nan
+     if typee == 'float':
+         if isinstance(val, str):
+             val = val.replace(' ', '')
+         return float(val) if val != "" else np.nan
+     if typee == 'bool':
+         return eval(val) if val != "" else None
+     return str(val) if val != "" else None
+
+ if not is_lake_system:
+     db = sys.argv[0].split("/")[1]
+
+ data_present = False
+ data_partition_column_values = []
+
+ while 1:
+     try:
+         line = input()
+         if line == '':  # Exit if user provides blank line
+             break
+         else:
+             data_present = True
+             values = line.split(DELIMITER)
+             if not data_partition_column_values:
+                 # Partition column values are the same for all rows. Hence, only read once.
+                 for i, val in enumerate(data_partition_column_indices):  # Only partition columns are
+                     data_partition_column_values.append(
+                         convert_to_type(values[val], typee=data_partition_column_types[i])
+                     )
+
+                 # Prepare the corresponding model file name and extract model.
+                 partition_join = "_".join([str(x) for x in data_partition_column_values])
+                 # Replace '-' with '_' because partition column values can be negative and contain '-'.
+                 partition_join = partition_join.replace("-", "_")
+
+                 train_set = params.get("train_set")  # Gets file name prefix.
+                 model_file_path = f"{train_set}_{partition_join}"\
+                     if is_lake_system else \
+                     f"./{db}/{train_set}_{partition_join}"
+
+                 with open(model_file_path, "rb") as fp:
+                     params["train_set"] = pickle.loads(fp.read())
+
+                 valid_sets = params.get("valid_sets", None)  # Gets file names prefix.
+                 if valid_sets:
+                     params["valid_sets"] = []
+                     for valid_set in valid_sets:
+                         model_file_path = f"{valid_set}_{partition_join}"\
+                             if is_lake_system else \
+                             f"./{db}/{valid_set}_{partition_join}"
+                         with open(model_file_path, "rb") as fp:
+                             params["valid_sets"].append(pickle.load(fp))
+
+     except EOFError:  # Exit if reached EOF or CTRL-D
+         break
+
+ if not data_present:
+     sys.exit(0)
+
+ # Handle callbacks.
+ rec_eval = None
+ if "callbacks" in params and params["callbacks"] is not None:
+     callbacks = params["callbacks"]
+     callbacks = [callbacks] if not isinstance(callbacks, list) else callbacks
+     for i, callback in enumerate(callbacks):
+         c_module_name = callback["module"]
+         c_func_name = callback["func_name"]
+         c_kwargs = callback["kwargs"]
+         c_module = importlib.import_module(c_module_name)
+         if c_func_name == "record_evaluation":
+             # record_evaluation function takes empty dict. If the argument has elements in the
+             # dict, they will be deleted as per the documentation from lightgbm as described below:
+             # eval_result (dict) -
+             # Dictionary used to store all evaluation results of all validation sets. This should
+             # be initialized outside of your call to record_evaluation() and should be empty. Any
+             # initial contents of the dictionary will be deleted.
+             rec_eval = {}
+             callbacks[i] = getattr(c_module, c_func_name)(rec_eval)
+         else:
+             callbacks[i] = getattr(c_module, c_func_name)(**c_kwargs)
+
+     params["callbacks"] = callbacks
+
+ module_ = importlib.import_module(module_name)
+
+ ### LightGBM training is giving some meaningful console output like this:
+ ### Hence, capturing it to show to the user.
+
+ # [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000190 seconds.
+ # You can set `force_row_wise=true` to remove the overhead.
+ # And if memory is not enough, you can set `force_col_wise=true`.
+ # [LightGBM] [Info] Total Bins 136
+ # [LightGBM] [Info] Number of data points in the train set: 97, number of used features: 4
+ # [LightGBM] [Info] Start training from score 0.556701
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [1] valid_0's l2: 0.219637 valid_1's l2: 0.219637
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [2] valid_0's l2: 0.196525 valid_1's l2: 0.196525
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [3] valid_0's l2: 0.178462 valid_1's l2: 0.178462
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [4] valid_0's l2: 0.162887 valid_1's l2: 0.162887
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [5] valid_0's l2: 0.150271 valid_1's l2: 0.150271
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [6] valid_0's l2: 0.140219 valid_1's l2: 0.140219
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [7] valid_0's l2: 0.131697 valid_1's l2: 0.131697
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [8] valid_0's l2: 0.124056 valid_1's l2: 0.124056
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [9] valid_0's l2: 0.117944 valid_1's l2: 0.117944
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [10] valid_0's l2: 0.11263 valid_1's l2: 0.11263
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [11] valid_0's l2: 0.105228 valid_1's l2: 0.105228
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [12] valid_0's l2: 0.0981571 valid_1's l2: 0.0981571
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [13] valid_0's l2: 0.0924294 valid_1's l2: 0.0924294
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [14] valid_0's l2: 0.0877899 valid_1's l2: 0.0877899
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [15] valid_0's l2: 0.084032 valid_1's l2: 0.084032
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [16] valid_0's l2: 0.080988 valid_1's l2: 0.080988
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [17] valid_0's l2: 0.0785224 valid_1's l2: 0.0785224
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [18] valid_0's l2: 0.0765253 valid_1's l2: 0.0765253
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [19] valid_0's l2: 0.0750803 valid_1's l2: 0.0750803
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [20] valid_0's l2: 0.0738915 valid_1's l2: 0.0738915
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [21] valid_0's l2: 0.07288 valid_1's l2: 0.07288
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [22] valid_0's l2: 0.0718676 valid_1's l2: 0.0718676
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [23] valid_0's l2: 0.0706037 valid_1's l2: 0.0706037
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [24] valid_0's l2: 0.0695799 valid_1's l2: 0.0695799
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [25] valid_0's l2: 0.0687507 valid_1's l2: 0.0687507
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [26] valid_0's l2: 0.0680819 valid_1's l2: 0.0680819
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [27] valid_0's l2: 0.0674077 valid_1's l2: 0.0674077
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [28] valid_0's l2: 0.0665111 valid_1's l2: 0.0665111
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [29] valid_0's l2: 0.0659656 valid_1's l2: 0.0659656
+ # [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
+ # [30] valid_0's l2: 0.0652665 valid_1's l2: 0.0652665
+ result = ""
+ stdout = None
+ try:
+     stdout = sys.stdout
+     new_stdout = io.StringIO()
+     sys.stdout = new_stdout
+     trained_model = getattr(module_, func_name)(**params)
+     result = new_stdout.getvalue()
+ except Exception:
+     raise
+ finally:
+     sys.stdout = stdout
+
+ model_str = pickle.dumps(trained_model)
+ console_output_str = result.encode()
+
+ if is_lake_system:
+     model_file_path = f"/tmp/{model_file_prefix}_{partition_join}.pickle"
+     model_console_output_path = f"/tmp/{model_file_prefix}_{partition_join}_console_output.pickle"
+
+     # Write to file in Vantage, to be used in predict/scoring.
+     with open(model_file_path, "wb") as fp:
+         fp.write(model_str)
+
+     with open(model_console_output_path, "wb") as fpc:
+         fpc.write(console_output_str)
+
+
+ model_data = model_file_path if is_lake_system else base64.b64encode(model_str)
+ console_output = model_console_output_path if is_lake_system else base64.b64encode(console_output_str)
+
+ output_data = [model_data, console_output]
+
+ if rec_eval is not None:
+     rec_eval = pickle.dumps(rec_eval)
+     if is_lake_system:
+         rec_eval_file_path = f"/tmp/{model_file_prefix}_{partition_join}_rec_eval.pickle"
+
+         with open(rec_eval_file_path, "wb") as fp:
+             fp.write(rec_eval)
+
+     rec_eval = rec_eval_file_path if is_lake_system else base64.b64encode(rec_eval)
+
+     output_data.append(rec_eval)
+
+ print(*(data_partition_column_values + output_data), sep=DELIMITER)
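For reference, a minimal sketch (not package code) of the callback rehydration used above: each callback arrives as a {"module", "func_name", "kwargs"} dict and is rebuilt with importlib and getattr, with record_evaluation given a fresh dict so the evaluation history can be shipped back afterwards. It assumes lightgbm is installed; the kwargs are illustrative.

    import importlib

    serialized_callbacks = [
        {"module": "lightgbm", "func_name": "record_evaluation", "kwargs": {}},
        {"module": "lightgbm", "func_name": "early_stopping", "kwargs": {"stopping_rounds": 5}},
    ]

    rec_eval = None
    callbacks = []
    for cb in serialized_callbacks:
        factory = getattr(importlib.import_module(cb["module"]), cb["func_name"])
        if cb["func_name"] == "record_evaluation":
            rec_eval = {}                        # must start empty, per the lightgbm docs quoted above
            callbacks.append(factory(rec_eval))
        else:
            callbacks.append(factory(**cb["kwargs"]))

    # callbacks can now be passed to the training call; after training,
    # rec_eval holds the recorded metrics for each validation set.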
@@ -0,0 +1,159 @@
+ import sys, json
+ import pickle, base64, importlib, numpy as np
+ from collections import OrderedDict
+
+ DELIMITER = '\t'
+
+ func_name = <func_name>
+ params = json.loads('<params>')
+ is_lake_system = <is_lake_system>
+ model_file_prefix = <model_file_prefix>
+
+ def convert_to_type(val, typee):
+     if typee == 'int':
+         return int(val) if val != "" else np.nan
+     if typee == 'float':
+         if isinstance(val, str):
+             val = val.replace(' ', '')
+         return float(val) if val != "" else np.nan
+     if typee == 'bool':
+         return eval(val) if val != "" else None
+     return str(val) if val != "" else None
+
+ def splitter(strr, delim=",", convert_to="str"):
+     """
+     Split the string based on delimiter and convert to the type specified.
+     """
+     if strr == "None":
+         return []
+     return [convert_to_type(i, convert_to) for i in strr.split(delim)]
+
+
+ if not is_lake_system:
+     db = sys.argv[0].split("/")[1]
+
+ data_partition_column_indices = <partition_cols_indices>
+ data_column_types = <types_of_data_cols>
+
+ data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
+
+ # Data related arguments information of indices and types.
+ data_args_indices_types = OrderedDict()
+
+ # Data related arguments values - prepare dictionary and populate data later.
+ data_args_values = {}
+
+ data_args_info_str = <data_args_info_str>
+ for data_arg in data_args_info_str.split("--"):
+     arg_name, indices, types = data_arg.split("-")
+     indices = splitter(indices, convert_to="int")
+     types = splitter(types)
+
+     data_args_indices_types[arg_name] = {"indices": indices, "types": types}
+     data_args_values[arg_name] = []  # Keeping empty for each data arg name and populate data later.
+
+ data_partition_column_values = []
+ data_present = False
+
+ model = None
+
+ # Read data - columns information is passed as command line argument and stored in
+ # data_args_indices_types dictionary.
+ while 1:
+     try:
+         line = input()
+         if line == '':  # Exit if user provides blank line
+             break
+         else:
+             data_present = True
+             values = line.split(DELIMITER)
+             if not data_partition_column_values:
+                 # Partition column values are the same for all rows. Hence, only read once.
+                 for i, val in enumerate(data_partition_column_indices):
+                     data_partition_column_values.append(
+                         convert_to_type(values[val], typee=data_partition_column_types[i])
+                     )
+
+                 # Prepare the corresponding model file name and extract model.
+                 partition_join = "_".join([str(x) for x in data_partition_column_values])
+                 # Replace '-' with '_' because partition column values can be negative.
+                 partition_join = partition_join.replace("-", "_")
+
+
+                 model_file_path = f"{model_file_prefix}_{partition_join}"\
+                     if is_lake_system else \
+                     f"./{db}/{model_file_prefix}_{partition_join}"
+
+                 with open(model_file_path, "rb") as fp:
+                     model = pickle.loads(fp.read())
+
+             if model is None:
+                 sys.exit("Model file is not installed in Vantage.")
+
+             # Prepare data dictionary containing only arguments related to data.
+             for arg_name in data_args_values:
+                 data_indices = data_args_indices_types[arg_name]["indices"]
+                 types = data_args_indices_types[arg_name]["types"]
+                 cur_row = []
+                 for idx, data_idx in enumerate(data_indices):
+                     cur_row.append(convert_to_type(values[data_idx], types[idx]))
+                 data_args_values[arg_name].append(cur_row)
+     except EOFError:  # Exit if reached EOF or CTRL-D
+         break
+
+ if not data_present:
+     sys.exit(0)
+
+ # Handle callbacks.
+ rec_eval = None
+ if "callbacks" in params and params["callbacks"] is not None:
+     callbacks = params["callbacks"]
+     callbacks = [callbacks] if not isinstance(callbacks, list) else callbacks
+     for i, callback in enumerate(callbacks):
+         c_module_name = callback["module"]
+         c_func_name = callback["func_name"]
+         c_kwargs = callback["kwargs"]
+         c_module = importlib.import_module(c_module_name)
+         if c_func_name == "record_evaluation":
+             # record_evaluation function takes empty dict. If the argument has elements in the
+             # dict, they will be deleted as per the documentation from lightgbm as described below:
+             # eval_result (dict) -
+             # Dictionary used to store all evaluation results of all validation sets. This should
+             # be initialized outside of your call to record_evaluation() and should be empty. Any
+             # initial contents of the dictionary will be deleted.
+             rec_eval = {}
+             callbacks[i] = getattr(c_module, c_func_name)(rec_eval)
+         else:
+             callbacks[i] = getattr(c_module, c_func_name)(**c_kwargs)
+
+     params["callbacks"] = callbacks
+
+ # Update data as numpy arrays.
+ for arg_name in data_args_values:
+     np_values = np.array(data_args_values[arg_name])
+     data_args_values[arg_name] = np_values
+     if arg_name == "sample_weight":
+         data_args_values[arg_name] = np_values.ravel()
+
+ # Combine all arguments.
+ all_args = {**data_args_values, **params}
+
+ trained_model = getattr(model, func_name)(**all_args)
+
+ model_data = 0
+ if func_name == "fit":
+     model_str = pickle.dumps(trained_model)
+
+     if is_lake_system:
+         model_file_path = f"/tmp/{model_file_prefix}_{partition_join}.pickle"
+
+         # Write to file in Vantage, to be used in predict/scoring.
+         with open(model_file_path, "wb") as fp:
+             fp.write(model_str)
+
+     model_data = model_file_path if is_lake_system else base64.b64encode(model_str)
+
+ elif func_name == "score":
+     model_data = trained_model
+
+ print(*(data_partition_column_values + [model_data]), sep=DELIMITER)
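All three scripts above share the same row protocol: rows arrive on stdin as tab-delimited text, empty fields are mapped to missing values, and results are printed back tab-delimited for the database to consume. A minimal standalone sketch of that loop, with a hypothetical column layout:

    import math
    import sys

    DELIMITER = "\t"
    COLUMN_TYPES = ["float", "float", "int"]   # hypothetical column layout

    def convert(val, typee):
        # Mirrors convert_to_type() above: empty string -> missing value.
        if typee in ("int", "float"):
            return float(val) if val != "" else math.nan
        return val if val != "" else None

    for line in sys.stdin:
        line = line.rstrip("\n")
        if line == "":
            break
        values = [convert(v, t) for v, t in zip(line.split(DELIMITER), COLUMN_TYPES)]
        # ... apply the model / transformation here ...
        print(*values, sep=DELIMITER)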