teradataml-20.0.0.2-py3-none-any.whl → teradataml-20.0.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml might be problematic.

Files changed (88)
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +196 -2
  3. teradataml/__init__.py +4 -0
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +79 -4
  6. teradataml/analytics/json_parser/metadata.py +12 -3
  7. teradataml/analytics/json_parser/utils.py +7 -2
  8. teradataml/analytics/sqle/__init__.py +1 -0
  9. teradataml/analytics/table_operator/__init__.py +1 -1
  10. teradataml/analytics/uaf/__init__.py +1 -1
  11. teradataml/analytics/utils.py +4 -0
  12. teradataml/automl/data_preparation.py +3 -2
  13. teradataml/automl/feature_engineering.py +15 -7
  14. teradataml/automl/model_training.py +39 -33
  15. teradataml/common/__init__.py +2 -1
  16. teradataml/common/constants.py +35 -0
  17. teradataml/common/garbagecollector.py +2 -1
  18. teradataml/common/messagecodes.py +8 -2
  19. teradataml/common/messages.py +3 -1
  20. teradataml/common/sqlbundle.py +25 -3
  21. teradataml/common/utils.py +134 -9
  22. teradataml/context/context.py +20 -10
  23. teradataml/data/SQL_Fundamentals.pdf +0 -0
  24. teradataml/data/dataframe_example.json +18 -2
  25. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
  26. teradataml/data/docs/sqle/docs_17_20/Shap.py +7 -1
  27. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
  28. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  29. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  30. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  31. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  32. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  33. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  34. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  35. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  36. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  37. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  38. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  39. teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
  40. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
  41. teradataml/data/medical_readings.csv +101 -0
  42. teradataml/data/patient_profile.csv +101 -0
  43. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  44. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  45. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  46. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  47. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
  48. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  49. teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
  50. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  51. teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
  52. teradataml/data/target_udt_data.csv +8 -0
  53. teradataml/data/templates/open_source_ml.json +3 -2
  54. teradataml/data/vectordistance_example.json +4 -0
  55. teradataml/dataframe/dataframe.py +543 -175
  56. teradataml/dataframe/functions.py +553 -25
  57. teradataml/dataframe/sql.py +184 -15
  58. teradataml/dbutils/dbutils.py +556 -18
  59. teradataml/dbutils/filemgr.py +48 -1
  60. teradataml/lib/aed_0_1.dll +0 -0
  61. teradataml/opensource/__init__.py +1 -1
  62. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  63. teradataml/opensource/_lightgbm.py +950 -0
  64. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  65. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  66. teradataml/opensource/sklearn/__init__.py +0 -1
  67. teradataml/opensource/sklearn/_sklearn_wrapper.py +798 -438
  68. teradataml/options/__init__.py +7 -23
  69. teradataml/options/configure.py +29 -3
  70. teradataml/scriptmgmt/UserEnv.py +3 -3
  71. teradataml/scriptmgmt/lls_utils.py +74 -21
  72. teradataml/store/__init__.py +13 -0
  73. teradataml/store/feature_store/__init__.py +0 -0
  74. teradataml/store/feature_store/constants.py +291 -0
  75. teradataml/store/feature_store/feature_store.py +2223 -0
  76. teradataml/store/feature_store/models.py +1505 -0
  77. teradataml/store/vector_store/__init__.py +1586 -0
  78. teradataml/table_operators/query_generator.py +3 -0
  79. teradataml/table_operators/table_operator_query_generator.py +3 -1
  80. teradataml/table_operators/table_operator_util.py +37 -38
  81. teradataml/table_operators/templates/dataframe_register.template +69 -0
  82. teradataml/utils/dtypes.py +4 -2
  83. teradataml/utils/validators.py +33 -1
  84. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +200 -5
  85. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +88 -65
  86. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  87. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  88. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
@@ -28,28 +28,22 @@ def splitter(strr, delim=",", convert_to="str"):
  return [convert_to_type(i, convert_to) for i in strr.split(delim)]

  # Arguments to the Script.
- if len(sys.argv) != 6:
- # 5 arguments command line arguments should be passed to this file.
+ if len(sys.argv) != 3:
+ # 3 command line arguments should be passed to this file.
  # 1: file to be run
- # 2. Comma separated indices of partition columns.
- # 3. Comma separated types of all the data columns.
- # 4. Data columns information separted by "--" where each data column information is in the form
- # "<arg_name>-<comma separated data indices>-<comma separated data types>".
- # 5. Flag to check the system type. True, means Lake, Enterprise otherwise.
- # 6. Model file prefix for lake system, None otherwise.
- sys.exit("5 arguments command line arguments should be passed: file to be run,"
- " comma separated indices of partition columns, comma separated types of all columns,"
- " data columns information separated by '--' where each data column information is"
- " in the form '<arg_name>-<comma separated data indices>-<comma separated data types>',"
- " flag to check lake or enterprise and model file prefix used only for lake system.")
-
- is_lake_system = eval(sys.argv[4])
+ # 2. Model file prefix for lake system, None otherwise.
+ # 3. Flag to check the system type. True, means Lake, Enterprise otherwise.
+ sys.exit("3 arguments command line arguments should be passed: file to be run,"
+ " model file prefix used only for lake system and flag to check lake or enterprise.")
+
+ is_lake_system = eval(sys.argv[2])
  if not is_lake_system:
  db = sys.argv[0].split("/")[1]
  else:
- model_file_prefix = sys.argv[5]
- data_partition_column_indices = splitter(sys.argv[1], convert_to="int") # indices are integers.
- data_column_types = splitter(sys.argv[2], delim="--")
+ model_file_prefix = sys.argv[1]
+
+ data_partition_column_indices = <partition_cols_indices>
+ data_column_types = <types_of_data_cols>

  data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]

@@ -59,7 +53,8 @@ data_args_indices_types = OrderedDict()
  # Data related arguments values - prepare dictionary and populate data later.
  data_args_values = {}

- for data_arg in sys.argv[3].split("--"):
+ data_args_info_str = <data_args_info_str>
+ for data_arg in data_args_info_str.split("--"):
  arg_name, indices, types = data_arg.split("-")
  indices = splitter(indices, convert_to="int")
  types = splitter(types)
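
The two hunks above appear to come from a script template: the 20.0.0.3 release drops most of the command-line arguments and instead bakes the partition-column indices, column types, and data-column information into the script through the <partition_cols_indices>, <types_of_data_cols> and <data_args_info_str> placeholders. As a rough illustration of what that substitution step could look like, a minimal sketch follows; the render_template() helper and the sample values are assumptions for the sketch, not teradataml's actual internals:

    # Hypothetical sketch of filling the placeholders before the script is deployed.
    # render_template() and the example values are illustrative assumptions; the real
    # substitution happens inside teradataml and is not part of this diff.
    def render_template(template_text, partition_cols_indices, types_of_data_cols, data_args_info_str):
        replacements = {
            "<partition_cols_indices>": repr(partition_cols_indices),   # e.g. "[0, 1]"
            "<types_of_data_cols>": repr(types_of_data_cols),           # e.g. "['int', 'float', 'str']"
            "<data_args_info_str>": repr(data_args_info_str),           # e.g. "'data-0,1-int,float'"
        }
        for placeholder, value in replacements.items():
            template_text = template_text.replace(placeholder, value)
        return template_text
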
@@ -3,146 +3,164 @@ import math
  import sys
  import numpy as np
  import base64
+ from contextlib import contextmanager
+ import os

  DELIMITER = '\t'

- def get_values_list(values, types):
- ret_vals = []
- for i, val in enumerate(values):
- ret_vals.append(convert_to_type(val, types[i]))
- return ret_vals
-
- def convert_to_type(val, typee):
- if typee == 'int':
- return int(val) if val != "" else np.nan
- if typee == 'float':
- if isinstance(val, str):
- val = val.replace(' ', '')
- return float(val) if val != "" else np.nan
- if typee == 'bool':
- return eval(val) if val != "" else None
- return str(val) if val != "" else None
-
- def splitter(strr, delim=",", convert_to="str"):
+ @contextmanager
+ def suppress_stderr():
  """
- Split the string based on delimiter and convert to the type specified.
+ Function to suppress the warnings(lake systems treats warnings as errors).
  """
- if strr == "None":
- return []
- return [convert_to_type(i, convert_to) for i in strr.split(delim)]
-
-
- # Arguments to the Script
- if len(sys.argv) != 9:
- # 9 arguments command line arguments should be passed to this file.
- # 1: file to be run
- # 2. function name
- # 3. No of feature columns.
- # 4. No of class labels.
- # 5. No of group columns.
- # 6. Comma separated indices of partition columns.
- # 7. Comma separated types of all the data columns.
- # 8. Model file prefix to generated model file using partition columns.
- # 9. Flag to check the system type. True, means Lake, Enterprise otherwise.
- sys.exit("9 arguments command line arguments should be passed: file to be run,"
- " function name, no of feature columns, no of class labels, no of group columns,"
- " comma separated indices of partition columns, comma separated types of all columns,"
- " model file prefix to generated model file using partition columns and flag to check"
- " lake or enterprise.")
-
-
- is_lake_system = eval(sys.argv[8])
- if not is_lake_system:
- db = sys.argv[0].split("/")[1]
- function_name = sys.argv[1]
- n_f_cols = int(sys.argv[2])
- n_c_labels = int(sys.argv[3])
- n_g_cols = int(sys.argv[4])
- data_column_types = splitter(sys.argv[6], delim="--")
- data_partition_column_indices = splitter(sys.argv[5], convert_to="int") # indices are integers.
- model_file_prefix = sys.argv[7]
-
- data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
-
- model = None
- data_partition_column_values = []
-
- # Data Format (n_features, k_labels, one data_partition_column):
- # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
- # data_partition_columnn.
- # labels are optional.
-
- features = []
- labels = []
- groups = []
- while 1:
- try:
- line = input()
- if line == '': # Exit if user provides blank line
+ with open(os.devnull, "w") as devnull:
+ old_stderr = sys.stderr
+ sys.stderr = devnull
+ try:
+ yield
+ finally:
+ sys.stderr = old_stderr
+
+ ## On Lake system warnings raised by script are treated as a errors.
+ ## Hence, to suppress it putting the under suppress_stderr().
+ with suppress_stderr():
+ def get_values_list(values, types):
+ ret_vals = []
+ for i, val in enumerate(values):
+ ret_vals.append(convert_to_type(val, types[i]))
+ return ret_vals
+
+ def convert_to_type(val, typee):
+ if typee == 'int':
+ return int(val) if val != "" else np.nan
+ if typee == 'float':
+ if isinstance(val, str):
+ val = val.replace(' ', '')
+ return float(val) if val != "" else np.nan
+ if typee == 'bool':
+ return eval(val) if val != "" else None
+ return str(val) if val != "" else None
+
+ def splitter(strr, delim=",", convert_to="str"):
+ """
+ Split the string based on delimiter and convert to the type specified.
+ """
+ if strr == "None":
+ return []
+ return [convert_to_type(i, convert_to) for i in strr.split(delim)]
+
+
+ # Arguments to the Script
+ if len(sys.argv) != 9:
+ # 9 arguments command line arguments should be passed to this file.
+ # 1: file to be run
+ # 2. function name
+ # 3. No of feature columns.
+ # 4. No of class labels.
+ # 5. No of group columns.
+ # 6. Comma separated indices of partition columns.
+ # 7. Comma separated types of all the data columns.
+ # 8. Model file prefix to generated model file using partition columns.
+ # 9. Flag to check the system type. True, means Lake, Enterprise otherwise.
+ sys.exit("9 arguments command line arguments should be passed: file to be run,"
+ " function name, no of feature columns, no of class labels, no of group columns,"
+ " comma separated indices of partition columns, comma separated types of all columns,"
+ " model file prefix to generated model file using partition columns and flag to check"
+ " lake or enterprise.")
+
+
+ is_lake_system = eval(sys.argv[8])
+ if not is_lake_system:
+ db = sys.argv[0].split("/")[1]
+ function_name = sys.argv[1]
+ n_f_cols = int(sys.argv[2])
+ n_c_labels = int(sys.argv[3])
+ n_g_cols = int(sys.argv[4])
+ data_column_types = splitter(sys.argv[6], delim="--")
+ data_partition_column_indices = splitter(sys.argv[5], convert_to="int") # indices are integers.
+ model_file_prefix = sys.argv[7]
+
+ data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
+
+ model = None
+ data_partition_column_values = []
+
+ # Data Format (n_features, k_labels, one data_partition_column):
+ # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
+ # data_partition_columnn.
+ # labels are optional.
+
+ features = []
+ labels = []
+ groups = []
+ while 1:
+ try:
+ line = input()
+ if line == '': # Exit if user provides blank line
+ break
+ else:
+ values = line.split(DELIMITER)
+ values = get_values_list(values, data_column_types)
+ if not data_partition_column_values:
+ # Partition column values is same for all rows. Hence, only read once.
+ for i, val in enumerate(data_partition_column_indices):
+ data_partition_column_values.append(
+ convert_to_type(values[val], typee=data_partition_column_types[i])
+ )
+
+ # Prepare the corresponding model file name and extract model.
+ partition_join = "_".join([str(x) for x in data_partition_column_values])
+ # Replace '-' with '_' as '-' because partition_columns can be negative.
+ partition_join = partition_join.replace("-", "_")
+
+ model_file_path = f"{model_file_prefix}_{partition_join}" \
+ if is_lake_system else \
+ f"./{db}/{model_file_prefix}_{partition_join}"
+
+ with open(model_file_path, "rb") as fp:
+ model = pickle.loads(fp.read())
+
+ if not model:
+ sys.exit("Model file is not installed in Vantage.")
+
+ start = 0
+ if n_f_cols > 0:
+ features.append(values[:n_f_cols])
+ start = start + n_f_cols
+ if n_c_labels > 0:
+ labels.append(values[start:(start+n_c_labels)])
+ start = start + n_c_labels
+ if n_g_cols > 0:
+ groups.append(values[start:(start+n_g_cols)])
+
+ except EOFError: # Exit if reached EOF or CTRL-D
  break
- else:
- values = line.split(DELIMITER)
- values = get_values_list(values, data_column_types)
- if not data_partition_column_values:
- # Partition column values is same for all rows. Hence, only read once.
- for i, val in enumerate(data_partition_column_indices):
- data_partition_column_values.append(
- convert_to_type(values[val], typee=data_partition_column_types[i])
- )
-
- # Prepare the corresponding model file name and extract model.
- partition_join = "_".join([str(x) for x in data_partition_column_values])
- # Replace '-' with '_' as '-' because partition_columns can be negative.
- partition_join = partition_join.replace("-", "_")
-
- model_file_path = f"{model_file_prefix}_{partition_join}" \
- if is_lake_system else \
- f"./{db}/{model_file_prefix}_{partition_join}"
-
- with open(model_file_path, "rb") as fp:
- model = pickle.loads(fp.read())
-
- if not model:
- sys.exit("Model file is not installed in Vantage.")
-
- start = 0
- if n_f_cols > 0:
- features.append(values[:n_f_cols])
- start = start + n_f_cols
- if n_c_labels > 0:
- labels.append(values[start:(start+n_c_labels)])
- start = start + n_c_labels
- if n_g_cols > 0:
- groups.append(values[start:(start+n_g_cols)])
-
- except EOFError: # Exit if reached EOF or CTRL-D
- break
-
- if len(features) == 0:
- sys.exit(0)
-
- features = np.array(features) if len(features) > 0 else None
- labels = np.array(labels).flatten() if len(labels) > 0 else None
- groups = np.array(groups).flatten() if len(groups) > 0 else None
-
- if function_name == "split":
- # Printing both train and test data instead of just indices unlike sklearn.
- # Generator is created based on split_id and type of split (train/test) in client.
- split_id = 1
- for train_idx, test_idx in model.split(features, labels, groups):
- X_train, X_test = features[train_idx], features[test_idx]
- y_train, y_test = labels[train_idx], labels[test_idx]
- for X, y in zip(X_train, y_train):
- print(*(data_partition_column_values + [split_id, "train"] +
- ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
- for val in X] + [y]
- ), sep=DELIMITER)
- for X, y in zip(X_test, y_test):
- print(*(data_partition_column_values + [split_id, "test"] +
- ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
- for val in X] + [y]
- ), sep=DELIMITER)
- split_id += 1
- else:
- val = getattr(model, function_name)(features, labels, groups)
- print(*(data_partition_column_values + [val]), sep=DELIMITER)
+
+ if len(features) == 0:
+ sys.exit(0)
+
+ features = np.array(features) if len(features) > 0 else None
+ labels = np.array(labels).flatten() if len(labels) > 0 else None
+ groups = np.array(groups).flatten() if len(groups) > 0 else None
+
+ if function_name == "split":
+ # Printing both train and test data instead of just indices unlike sklearn.
+ # Generator is created based on split_id and type of split (train/test) in client.
+ split_id = 1
+ for train_idx, test_idx in model.split(features, labels, groups):
+ X_train, X_test = features[train_idx], features[test_idx]
+ y_train, y_test = labels[train_idx], labels[test_idx]
+ for X, y in zip(X_train, y_train):
+ print(*(data_partition_column_values + [split_id, "train"] +
+ ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
+ for val in X] + [y]
+ ), sep=DELIMITER)
+ for X, y in zip(X_test, y_test):
+ print(*(data_partition_column_values + [split_id, "test"] +
+ ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
+ for val in X] + [y]
+ ), sep=DELIMITER)
+ split_id += 1
+ else:
+ val = getattr(model, function_name)(features, labels, groups)
+ print(*(data_partition_column_values + [val]), sep=DELIMITER)
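
The net effect of the hunk above is that the whole body of the script now runs inside the new suppress_stderr() context manager, because warnings written to stderr are treated as errors on Lake systems. The helper is the standard "point sys.stderr at os.devnull" pattern; a self-contained sketch of the same idea using contextlib.redirect_stderr from the standard library (illustrative only, the shipped script keeps its own hand-rolled manager):

    # Sketch of the stderr-suppression pattern, built on the standard library's
    # contextlib.redirect_stderr instead of the hand-written context manager above.
    import os
    import warnings
    from contextlib import redirect_stderr

    with open(os.devnull, "w") as devnull, redirect_stderr(devnull):
        # Anything written to sys.stderr in this block (for example, warnings
        # emitted by scikit-learn) is discarded rather than reaching the caller.
        warnings.warn("this warning is swallowed")
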
@@ -31,39 +31,83 @@ def splitter(strr, delim=",", convert_to="str"):
  return []
  return [convert_to_type(i, convert_to) for i in strr.split(delim)]

+ def should_convert(t_val, py_type):
+ """
+ Function to check type of value and whether value is nan and infinity.
+ """
+ return not isinstance(t_val, eval(py_type)) and not math.isinf(t_val) and not math.isnan(t_val)
+
+ def convert_value(t_val, py_type):
+ """
+ Function to convert value to specified python type.
+ """
+ return convert_to_type(t_val, py_type) if should_convert(t_val, py_type) else t_val
+
  # Process output returned by sklearn function.
- def get_output_data(trans_values, func_name, model_obj, n_c_labels):
- # Converting sparse matrix to dense array as sparse matrices are NOT
+ def get_output_data(trans_values, func_name, model_obj, n_c_labels, n_out_columns):
+ # Converting sparse matrix to dense array as sparse matrices are NOT
  # supported in Vantage.
  module_name = model_obj.__module__.split("._")[0]

- if type(trans_values).__name__ in ["csr_matrix", "csc_matrix"]:
- trans_values = trans_values.toarray()
+ # Converting the translated values into corresponding the return column's
+ # python type.
+ if (func_name == "decision_path" or return_columns_python_types is None \
+ or not isinstance(trans_values, np.ndarray)):
+ trans_values_list = trans_values
+ else:
+ # Conversion.....
+ trans_values_list = []
+ for trans_value in trans_values.tolist():
+ if not isinstance(trans_value, list):
+ trans_value = [trans_value]
+
+ converted_list = []
+ if len(return_columns_python_types) == len(trans_value):
+ for t_val, py_type in zip(trans_value, return_columns_python_types):
+ converted_list.append(convert_value(t_val, py_type))
+ ## transform() is having only 1 python return type, But it actually returns more than 1 column
+ else:
+ for t_val in trans_value:
+ converted_list.append(convert_value(t_val, "".join(return_columns_python_types)))
+
+ trans_values_list.append(converted_list)
+
+ if type(trans_values_list).__name__ in ["csr_matrix", "csc_matrix"]:
+ trans_values_list = trans_values_list.toarray()

  if module_name == "sklearn.cross_decomposition" and n_c_labels > 0 and func_name == "transform":
  # For cross_decomposition, output is a tuple of arrays when label columns are provided
  # along with feature columns for transform function. In this case, concatenate the
  # arrays and return the combined values.
- if isinstance(trans_values, tuple):
- return np.concatenate(trans_values, axis=1).tolist()[0]
+ if isinstance(trans_values_list, tuple):
+ return np.concatenate(trans_values_list, axis=1).tolist()[0]

- if isinstance(trans_values[0], np.ndarray) \
- or isinstance(trans_values[0], list) \
- or isinstance(trans_values[0], tuple):
+ if isinstance(trans_values_list[0], np.ndarray) \
+ or isinstance(trans_values_list[0], list) \
+ or isinstance(trans_values_list[0], tuple):
  # Here, the value returned by sklearn function is list type.
- opt_list = list(trans_values[0])
+ opt_list = list(trans_values_list[0])
+
+ if len(opt_list) < n_out_columns:
+ # If the output list is less than the required number of columns, append
+ # empty strings to the list.
+ opt_list += [""] * (n_out_columns - len(opt_list))
+
  if func_name == "inverse_transform" and type(model_obj).__name__ == "MultiLabelBinarizer":
  # output array "trans_values[0]" may not be of same size. It should be of
  # maximum size of `model.classes_`
  # Append None to last elements.
  if len(opt_list) < len(model_obj.classes_):
  opt_list += [""] * (len(model_obj.classes_) - len(opt_list))
+
  return opt_list
- return [trans_values[0]]
+
+ # Only one element is returned by the function.
+ return [trans_values_list[0]]

  # Arguments to the Script
- if len(sys.argv) != 8:
- # 8 arguments command line arguments should be passed to this file.
+ if len(sys.argv) != 10:
+ # 10 arguments command line arguments should be passed to this file.
  # 1: file to be run
  # 2. function name (Eg. predict, fit etc)
  # 3. No of feature columns.
@@ -71,13 +115,17 @@ if len(sys.argv) != 8:
  # 5. Comma separated indices of partition columns.
  # 6. Comma separated types of all the data columns.
  # 7. Model file prefix to generated model file using partition columns.
- # 8. Flag to check the system type. True, means Lake, Enterprise otherwise.
- sys.exit("8 arguments should be passed to this file - file to be run, function name, "\
- "no of feature columns, no of class labels, comma separated indices of partition "\
- "columns, comma separated types of all columns, model file prefix to generate model "\
- "file using partition columns and flag to check lake or enterprise.")
-
- is_lake_system = eval(sys.argv[7])
+ # 8. Number of columns to be returned by the sklearn's transform function.
+ # 9. Flag to check the system type. True, means Lake, Enterprise otherwise.
+ # 10. Python types of returned/transfromed columns.
+ sys.exit("10 arguments should be passed to this file - file to be run, function name, "\
+ "no of feature columns, no of class labels, comma separated indices of partition "\
+ "columns, comma separated types of all columns, model file prefix to generate model "\
+ "file using partition columns, number of columns to be returnd by sklearn's "\
+ "transform function, flag to check lake or enterprise and Python types of "\
+ "returned/transfromed columns.")
+
+ is_lake_system = eval(sys.argv[8])
  if not is_lake_system:
  db = sys.argv[0].split("/")[1]
  func_name = sys.argv[1]
@@ -86,13 +134,22 @@ n_c_labels = int(sys.argv[3])
  data_column_types = splitter(sys.argv[5], delim="--")
  data_partition_column_indices = splitter(sys.argv[4], convert_to="int") # indices are integers.
  model_file_prefix = sys.argv[6]
+ # sys.argv[9] will contain a string of python datatypes with '--'
+ # separator OR a single datatype OR None in string format.
+ ret_col_argv = sys.argv[9]
+ if ret_col_argv == "None":
+ return_columns_python_types = eval(ret_col_argv)
+ else:
+ return_columns_python_types = splitter(ret_col_argv, delim="--")
+
+ no_of_output_columns = int(sys.argv[7])

  data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]

  model = None
  data_partition_column_values = []

- missing_indicator_input = []
+ all_rows_input = []

  # Data Format:
  # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
@@ -134,30 +191,45 @@ while 1:

  model_name = model.__class__.__name__
  np_func_list = ["ClassifierChain", "EllipticEnvelope", "MinCovDet",
- "FeatureAgglomeration", "LabelBinarizer", "MultiLabelBinarizer"]
+ "FeatureAgglomeration", "LabelBinarizer", "MultiLabelBinarizer",
+ "BernoulliRBM"]

- # MissingIndicator requires processing the entire dataset simultaneously,
- # rather than on a row-by-row basis.
+ # MissingIndicator's transform() and SimpleImputer's inverse_transform() requires processing
+ # the entire dataset simultaneously, rather than on a row-by-row basis.

- # Error getting during row-by-row processing -
+ # Error getting during row-by-row processing of MissingIndicator -
  # "ValueError: MissingIndicator does not support data with dtype <U13.
  # Please provide either a numeric array (with a floating point or
- i# integer dtype) or categorical data represented ei
- if model_name == "MissingIndicator" and func_name == "transform":
- missing_indicator_input.append(f_)
+ # integer dtype) or categorical data represented ei
+
+ # Error getting during row-by-row processing of SimpleImputer -
+ # "IndexError: index 3 is out of bounds for axis 1 with size 3".
+ if ((model_name == "MissingIndicator" and func_name == "transform") or \
+ (model_name == "SimpleImputer" and func_name == "inverse_transform") or \
+ (model_name in ["EllipticEnvelope", "MinCovDet"]
+ and func_name == "correct_covariance")):
+ all_rows_input.append(f_)
  continue

- f__ = np.array([f_]) if model_name in np_func_list or \
- (model_name == "SimpleImputer" and func_name == "inverse_transform")\
- else [f_]
+ f__ = np.array([f_]) if model_name in np_func_list else [f_]
+
+ # transform() function in these functions generate different number of output columns and
+ # NULLS/NaNs are appended to the end of the output.
+ # If we run inverse_transform() on these models, it will take same number of input columns
+ # with NULLs/NaNs but those NULLs/NaNs should be ignored while reading the input to
+ # inverse_transform() function.
+ models_with_all_null_in_last_cols = ["SelectFpr", "SelectFdr", "SelectFwe", "SelectFromModel", "RFECV"]
+ if model_name in models_with_all_null_in_last_cols and func_name == "inverse_transform":
+ # Remove NULLs/NaNs from the end of one input row.
+ _f = np.array([f_])
+ _f = _f[~np.isnan(_f)]
+ f__ = [_f.tolist()]

  if n_c_labels > 0:
  # Labels are present in last column.
  l_ = values[n_f_cols:n_f_cols+n_c_labels]

- l__ = np.array([l_]) if model_name in np_func_list or \
- (model_name == "SimpleImputer" and func_name == "inverse_transform")\
- else [l_]
+ l__ = np.array([l_]) if model_name in np_func_list else [l_]
  # predict() now takes 'y' also for it to return the labels from script. Skipping 'y'
  # in function call. Generally, 'y' is passed to return y along with actual output.
  try:
@@ -181,7 +253,8 @@ while 1:
  if n_c_labels > 0 and func_name in ["predict", "decision_function"]:
  result_list += l_
  result_list += get_output_data(trans_values=trans_values, func_name=func_name,
- model_obj=model, n_c_labels=n_c_labels)
+ model_obj=model, n_c_labels=n_c_labels,
+ n_out_columns=no_of_output_columns)

  for i, val in enumerate(result_list):
  if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))):
@@ -198,17 +271,23 @@ while 1:
  break


- # MissingIndicator needs processing of all the dataset at the same time, instead of row by row.
+ # MissingIndicator and SimpleImputer needs processing of all the dataset at the same time, instead of row by row.
  # Hence, handling it outside of the while loop
- if model_name == "MissingIndicator" and func_name == "transform":
- m_out = model.transform(missing_indicator_input)
+ if model_name == "MissingIndicator" and func_name == "transform" or \
+ (model_name == "SimpleImputer" and func_name == "inverse_transform"):
+ if model_name == "SimpleImputer":
+ all_rows_input = np.array(all_rows_input)
+ m_out = getattr(model, func_name)(all_rows_input)

- for j, vals in enumerate(missing_indicator_input):
+ if type(m_out).__name__ in ["csr_matrix", "csc_matrix"]:
+ m_out = m_out.toarray()

- m_out_list = get_output_data(trans_values=m_out[j], func_name=func_name,
- model_obj=model, n_c_labels=n_c_labels)
+ for j in range(len(all_rows_input)):
+ m_out_list = get_output_data(trans_values=[m_out[j]], func_name=func_name,
+ model_obj=model, n_c_labels=n_c_labels,
+ n_out_columns=no_of_output_columns)

- result_list = missing_indicator_input[j] + m_out_list
+ result_list = list(all_rows_input[j]) + list(m_out_list)

  for i, val in enumerate(result_list):
  if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))):
@@ -220,3 +299,11 @@ if model_name == "MissingIndicator" and func_name == "transform":
  result_list[i] = 1

  print(*(data_partition_column_values + result_list), sep=DELIMITER)
+
+ ## correct_covariance() requires processing of all the input rows at the same time.
+ ## It returns the output dataset in (n_features, n_features) shape, i.e., based on
+ ## no. of columns.
+ if model_name in ["EllipticEnvelope", "MinCovDet"] and func_name == "correct_covariance":
+ result_list = model.correct_covariance(np.array(all_rows_input))
+ for l, vals in enumerate(result_list):
+ print(*(data_partition_column_values + vals.tolist()), sep=DELIMITER)
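
Among the changes above, rows fed to inverse_transform() for the feature-selection models (SelectFpr, SelectFdr, SelectFwe, SelectFromModel, RFECV) are stripped of their NaN padding before the model is called. The stripping is plain numpy boolean masking; an isolated sketch with made-up values:

    # Isolated sketch of the NaN-stripping step added for the feature-selection
    # models. The sample row is made up; in the script, f_ holds the feature
    # values read for one input row.
    import numpy as np

    f_ = [0.5, 1.2, float("nan"), float("nan")]   # row padded with NaNs at the end
    _f = np.array([f_])
    _f = _f[~np.isnan(_f)]                        # boolean mask drops the NaN padding
    f__ = [_f.tolist()]
    print(f__)                                    # [[0.5, 1.2]]
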
@@ -0,0 +1,8 @@
+ id,array_col
+ 1,"3.33e-05,0.2,0.1"
+ 2,"0.5,0.4,0.42"
+ 3,"1,0.8,0.9"
+ 4,"0.01,0.4,0.2"
+ 5,"0.93,0.4,0.7"
+ 6,"0.83,0.3,0.6"
+ 7,"0.73,0.5,0.7"
@@ -2,8 +2,9 @@
  "env_specs": [
  {
  "env_name": "openml_env",
- "libs": ["scikit-learn==1.5.1", "joblib==1.4.2", "numpy==2.0.0",
- "scipy==1.14.0", "threadpoolctl==3.5.0"],
+ "libs": ["scikit-learn==1.5.1", "joblib==1.4.2", "numpy==1.23.5",
+ "scipy==1.14.0", "threadpoolctl==3.5.0", "lightgbm==3.3.3",
+ "pandas==2.2.3"],
  "desc": "DONT DELETE: OpenML environment"
  }
  ]
@@ -22,5 +22,9 @@
  "CallDuration": "REAL",
  "DataCounter": "REAL",
  "SMS": "REAL"
+ },
+ "target_udt_data":{
+ "id": "INTEGER",
+ "array_col":"AIVector"
  }
  }