PyPI - teradataml - Versions diffs - 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl - Mend

teradataml 20.0.0.0-py3-none-any.whl → 20.0.0.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of teradataml might be problematic. Click here for more details.

Files changed (263) hide show

teradataml/LICENSE-3RD-PARTY.pdf +0 -0
teradataml/LICENSE.pdf +0 -0
teradataml/README.md +183 -0
teradataml/__init__.py +6 -3
teradataml/_version.py +2 -2
teradataml/analytics/__init__.py +3 -2
teradataml/analytics/analytic_function_executor.py +275 -40
teradataml/analytics/analytic_query_generator.py +92 -0
teradataml/analytics/byom/__init__.py +3 -2
teradataml/analytics/json_parser/metadata.py +1 -0
teradataml/analytics/json_parser/utils.py +17 -21
teradataml/analytics/meta_class.py +40 -1
teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
teradataml/analytics/sqle/__init__.py +10 -2
teradataml/analytics/table_operator/__init__.py +3 -2
teradataml/analytics/uaf/__init__.py +21 -2
teradataml/analytics/utils.py +62 -1
teradataml/analytics/valib.py +1 -1
teradataml/automl/__init__.py +1553 -319
teradataml/automl/custom_json_utils.py +139 -61
teradataml/automl/data_preparation.py +276 -319
teradataml/automl/data_transformation.py +163 -81
teradataml/automl/feature_engineering.py +402 -239
teradataml/automl/feature_exploration.py +9 -2
teradataml/automl/model_evaluation.py +48 -51
teradataml/automl/model_training.py +291 -189
teradataml/catalog/byom.py +8 -8
teradataml/catalog/model_cataloging_utils.py +1 -1
teradataml/clients/auth_client.py +133 -0
teradataml/clients/pkce_client.py +1 -1
teradataml/common/aed_utils.py +3 -2
teradataml/common/constants.py +48 -6
teradataml/common/deprecations.py +13 -7
teradataml/common/garbagecollector.py +156 -120
teradataml/common/messagecodes.py +6 -1
teradataml/common/messages.py +3 -1
teradataml/common/sqlbundle.py +1 -1
teradataml/common/utils.py +103 -11
teradataml/common/wrapper_utils.py +1 -1
teradataml/context/context.py +121 -31
teradataml/data/advertising.csv +201 -0
teradataml/data/bank_marketing.csv +11163 -0
teradataml/data/bike_sharing.csv +732 -0
teradataml/data/boston2cols.csv +721 -0
teradataml/data/breast_cancer.csv +570 -0
teradataml/data/complaints_test_tokenized.csv +353 -0
teradataml/data/complaints_tokens_model.csv +348 -0
teradataml/data/covid_confirm_sd.csv +83 -0
teradataml/data/customer_segmentation_test.csv +2628 -0
teradataml/data/customer_segmentation_train.csv +8069 -0
teradataml/data/dataframe_example.json +10 -0
teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
teradataml/data/dwt2d_dataTable.csv +65 -0
teradataml/data/dwt_dataTable.csv +8 -0
teradataml/data/dwt_filterTable.csv +3 -0
teradataml/data/finance_data4.csv +13 -0
teradataml/data/glm_example.json +28 -1
teradataml/data/grocery_transaction.csv +19 -0
teradataml/data/housing_train_segment.csv +201 -0
teradataml/data/idwt2d_dataTable.csv +5 -0
teradataml/data/idwt_dataTable.csv +8 -0
teradataml/data/idwt_filterTable.csv +3 -0
teradataml/data/insect2Cols.csv +61 -0
teradataml/data/interval_data.csv +5 -0
teradataml/data/jsons/paired_functions.json +14 -0
teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
teradataml/data/kmeans_example.json +5 -0
teradataml/data/kmeans_table.csv +10 -0
teradataml/data/load_example_data.py +8 -2
teradataml/data/naivebayestextclassifier_example.json +1 -1
teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
teradataml/data/onehot_encoder_train.csv +4 -0
teradataml/data/openml_example.json +29 -0
teradataml/data/peppers.png +0 -0
teradataml/data/real_values.csv +14 -0
teradataml/data/sax_example.json +8 -0
teradataml/data/scale_attributes.csv +3 -0
teradataml/data/scale_example.json +52 -1
teradataml/data/scale_input_part_sparse.csv +31 -0
teradataml/data/scale_input_partitioned.csv +16 -0
teradataml/data/scale_input_sparse.csv +11 -0
teradataml/data/scale_parameters.csv +3 -0
teradataml/data/scripts/deploy_script.py +21 -2
teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
teradataml/data/star_pivot.csv +8 -0
teradataml/data/templates/open_source_ml.json +2 -1
teradataml/data/teradataml_example.json +97 -1
teradataml/data/timestamp_data.csv +4 -0
teradataml/data/titanic_dataset_unpivoted.csv +19 -0
teradataml/data/uaf_example.json +55 -1
teradataml/data/unpivot_example.json +15 -0
teradataml/data/url_data.csv +9 -0
teradataml/data/windowdfft.csv +16 -0
teradataml/data/ztest_example.json +16 -0
teradataml/dataframe/copy_to.py +9 -4
teradataml/dataframe/data_transfer.py +125 -64
teradataml/dataframe/dataframe.py +575 -57
teradataml/dataframe/dataframe_utils.py +47 -9
teradataml/dataframe/fastload.py +273 -90
teradataml/dataframe/functions.py +339 -0
teradataml/dataframe/row.py +160 -0
teradataml/dataframe/setop.py +2 -2
teradataml/dataframe/sql.py +740 -18
teradataml/dataframe/window.py +1 -1
teradataml/dbutils/dbutils.py +324 -18
teradataml/geospatial/geodataframe.py +1 -1
teradataml/geospatial/geodataframecolumn.py +1 -1
teradataml/hyperparameter_tuner/optimizer.py +13 -13
teradataml/lib/aed_0_1.dll +0 -0
teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
teradataml/options/__init__.py +16 -5
teradataml/options/configure.py +39 -6
teradataml/options/display.py +2 -2
teradataml/plot/axis.py +4 -4
teradataml/scriptmgmt/UserEnv.py +26 -19
teradataml/scriptmgmt/lls_utils.py +120 -16
teradataml/table_operators/Script.py +4 -5
teradataml/table_operators/TableOperator.py +160 -26
teradataml/table_operators/table_operator_util.py +88 -41
teradataml/table_operators/templates/dataframe_udf.template +63 -0
teradataml/telemetry_utils/__init__.py +0 -0
teradataml/telemetry_utils/queryband.py +52 -0
teradataml/utils/validators.py +41 -3
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
{teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0

teradataml/data/scripts/sklearn/sklearn_model_selection_split.py CHANGED Viewed

@@ -6,35 +6,22 @@ import base64
 DELIMITER = '\t'
-def get_value(value):
-    ret_val = value
-    try:
-        ret_val = round(float("".join(value.split())), 2)
-    except Exception as ex:
-        # If the value can't be converted to float, then it is string.
-        pass
-    return ret_val
-def get_values_list(values, ignore_none=True):
+def get_values_list(values, types):
     ret_vals = []
-    for val in values:
-        if val == "" and ignore_none:
-            # Empty cell value in the database table.
-            continue
-        ret_vals.append(get_value(val))
+    for i, val in enumerate(values):
+        ret_vals.append(convert_to_type(val, types[i]))
     return ret_vals
 def convert_to_type(val, typee):
     if typee == 'int':
-        return int(val)
+        return int(val) if val != "" else np.nan
     if typee == 'float':
-        return float(val)
+        if isinstance(val, str):
+            val = val.replace(' ', '')
+        return float(val) if val != "" else np.nan
     if typee == 'bool':
-        return eval(val)
-    return str(val)
+        return eval(val) if val != "" else None
+    return str(val) if val != "" else None
 def splitter(strr, delim=",", convert_to="str"):
     """
@@ -54,13 +41,14 @@ if len(sys.argv) != 9:
     # 4. No of class labels.
     # 5. No of group columns.
     # 6. Comma separated indices of partition columns.
-    # 7. Comma separated types of the partition columns.
+    # 7. Comma separated types of all the data columns.
     # 8. Model file prefix to generated model file using partition columns.
     # 9. Flag to check the system type. True, means Lake, Enterprise otherwise.
     sys.exit("9 arguments command line arguments should be passed: file to be run,"
              " function name, no of feature columns, no of class labels, no of group columns,"
-             " comma separated indices and types of partition columns, model file prefix to"
-             " generated model file using partition columns and flag to check lake or enterprise.")
+             " comma separated indices of partition columns, comma separated types of all columns,"
+             " model file prefix to generated model file using partition columns and flag to check"
+             " lake or enterprise.")
 is_lake_system = eval(sys.argv[8])
@@ -70,10 +58,11 @@ function_name = sys.argv[1]
 n_f_cols = int(sys.argv[2])
 n_c_labels = int(sys.argv[3])
 n_g_cols = int(sys.argv[4])
-data_partition_column_types = splitter(sys.argv[6])
+data_column_types = splitter(sys.argv[6], delim="--")
 data_partition_column_indices = splitter(sys.argv[5], convert_to="int") # indices are integers.
 model_file_prefix = sys.argv[7]
+data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
 model = None
 data_partition_column_values = []
@@ -93,6 +82,7 @@ while 1:
             break
         else:
             values = line.split(DELIMITER)
+            values = get_values_list(values, data_column_types)
             if not data_partition_column_values:
                 # Partition column values is same for all rows. Hence, only read once.
                 for i, val in enumerate(data_partition_column_indices):
@@ -117,13 +107,13 @@ while 1:
             start = 0
             if n_f_cols > 0:
-                features.append(get_values_list(values[:n_f_cols]))
+                features.append(values[:n_f_cols])
                 start = start + n_f_cols
             if n_c_labels > 0:
-                labels.append(get_values_list(values[start:(start+n_c_labels)]))
+                labels.append(values[start:(start+n_c_labels)])
                 start = start + n_c_labels
             if n_g_cols > 0:
-                groups.append(get_values_list(values[start:(start+n_g_cols)]))
+                groups.append(values[start:(start+n_g_cols)])
     except EOFError:  # Exit if reached EOF or CTRL-D
         break
@@ -144,14 +134,14 @@ if function_name == "split":
         y_train, y_test = labels[train_idx], labels[test_idx]
         for X, y in zip(X_train, y_train):
             print(*(data_partition_column_values + [split_id, "train"] +
-                    ['' if (val is None or math.isnan(val) or math.isinf(val)) else val
+                    ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
                      for val in X] + [y]
-                    ),sep=DELIMITER)
+                    ), sep=DELIMITER)
         for X, y in zip(X_test, y_test):
             print(*(data_partition_column_values + [split_id, "test"] +
-                    ['' if (val is None or math.isnan(val) or math.isinf(val)) else val
+                    ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
                      for val in X] + [y]
-                    ),sep=DELIMITER)
+                    ), sep=DELIMITER)
         split_id += 1
 else:
     val = getattr(model, function_name)(features, labels, groups)

teradataml/data/scripts/sklearn/sklearn_neighbors.py CHANGED Viewed

@@ -12,34 +12,22 @@ from scipy.sparse.csr import csr_matrix
 DELIMITER = '\t'
-def get_value(value):
-    ret_val = value
-    try:
-        ret_val = float(value.replace(' ', ''))
-    except Exception as ex:
-        # If the value can't be converted to float, then it is string.
-        pass
-    return ret_val
-def get_values_list(values, ignore_none=True):
+def get_values_list(values, types):
     ret_vals = []
-    for val in values:
-        if val == "" and ignore_none:
-            # Empty cell value in the database table.
-            continue
-        ret_vals.append(get_value(val))
+    for i, val in enumerate(values):
+        ret_vals.append(convert_to_type(val, types[i]))
     return ret_vals
 def convert_to_type(val, typee):
     if typee == 'int':
-        return int(val)
+        return int(val) if val != "" else np.nan
     if typee == 'float':
-        return float(val)
+        if isinstance(val, str):
+            val = val.replace(' ', '')
+        return float(val) if val != "" else np.nan
     if typee == 'bool':
-        return eval(val)
-    return str(val)
+        return eval(val) if val != "" else None
+    return str(val) if val != "" else None
 def splitter(strr, delim=",", convert_to="str"):
     """
@@ -57,15 +45,15 @@ if len(sys.argv) < 7:
     # 2. function name.
     # 3. No of feature columns.
     # 4. Comma separated indices of partition columns.
-    # 5. Comma separated types of the partition columns.
+    # 5. Comma separated types of all the data columns.
     # 6. Model file prefix to generate model file using partition columns.
     # 7. Flag to check the system type. True, means Lake, Enterprise otherwise.
     # 8. OPTIONAL - Arguments in string format like "return_distance True-bool",
     #    "n_neighbors 3-int", "radius 3.4-float" etc.
     sys.exit("At least 7 arguments should be passed to this file - file to be run, function name, "\
-             "no of feature columns, comma separated indices and types of partition columns, "\
-             "model file prefix to generate model file using partition columns, flag to check "\
-             "lake or enterprise and optional arguments in string format.")
+             "no of feature columns, comma separated indices of partition columns, comma "\
+             "separated types of all columns, model file prefix to generate model file using "\
+             "partition columns, flag to check lake or enterprise and optional arguments in string format.")
 convert_to_int = lambda x: int(x) if x != "None" else None
@@ -74,7 +62,7 @@ if not is_lake_system:
     db = sys.argv[0].split("/")[1]
 func_name = sys.argv[1]
 n_f_cols = convert_to_int(sys.argv[2])
-data_partition_column_types = splitter(sys.argv[4])
+data_column_types = splitter(sys.argv[4], delim="--")
 data_partition_column_indices = splitter(sys.argv[3], convert_to="int") # indices are integers.
 model_file_prefix = sys.argv[5]
 # Extract arguments from string.
@@ -83,6 +71,8 @@ for i in range(7, len(sys.argv), 2):
     value = sys.argv[i + 1].split("-", 1)
     arguments[sys.argv[i]] = convert_to_type(value[0], value[1])
+data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
 model = None
 data_partition_column_values = []
@@ -101,6 +91,7 @@ while 1:
             break
         else:
             values = line.split(DELIMITER)
+            values = get_values_list(values, data_column_types)
             if not data_partition_column_values:
                 # Partition column values is same for all rows. Hence, only read once.
                 for i, val in enumerate(data_partition_column_indices):
@@ -123,9 +114,9 @@ while 1:
                 if not model:
                     sys.exit("Model file is not installed in Vantage.")
-            f_ = get_values_list(values[:n_f_cols])
+            f_ = values[:n_f_cols]
             if f_:
-                output = getattr(model, func_name)(np.array([f_]), **arguments)
+                output = getattr(model, func_name)([f_], **arguments)
             else:
                 output = getattr(model, func_name)(**arguments)
             result_list = f_

teradataml/data/scripts/sklearn/sklearn_score.py CHANGED Viewed

@@ -6,34 +6,22 @@ import numpy as np
 DELIMITER = '\t'
-def get_value(value):
-    ret_val = value
-    try:
-        ret_val = float("".join(value.split()))
-    except Exception as ex:
-        # If the value can't be converted to float, then it is string.
-        pass
-    return ret_val
-def get_values_list(values, ignore_none=True):
+def get_values_list(values, types):
     ret_vals = []
-    for val in values:
-        if val == "" and ignore_none:
-            # Empty cell value in the database table.
-            continue
-        ret_vals.append(get_value(val))
+    for i, val in enumerate(values):
+        ret_vals.append(convert_to_type(val, types[i]))
     return ret_vals
 def convert_to_type(val, typee):
     if typee == 'int':
-        return int(val)
+        return int(val) if val != "" else np.nan
     if typee == 'float':
-        return float(val)
+        if isinstance(val, str):
+            val = val.replace(' ', '')
+        return float(val) if val != "" else np.nan
     if typee == 'bool':
-        return bool(val)
-    return str(val)
+        return eval(val) if val != "" else None
+    return str(val) if val != "" else None
 def splitter(strr, delim=",", convert_to="str"):
     """
@@ -51,13 +39,13 @@ if len(sys.argv) != 8:
     # 3. No of feature columns.
     # 4. No of class labels.
     # 5. Comma separated indices of partition columns.
-    # 6. Comma separated types of the partition columns.
+    # 6. Comma separated types of all the data columns.
     # 7. Model file prefix to generated model file using partition columns.
     # 8. Flag to check the system type. True, means Lake, Enterprise otherwise.
     sys.exit("8 arguments should be passed to this file - file to be run, function name, "\
-             "no of feature columns, no of class labels, comma separated indices and types of "\
-             "partition columns, model file prefix to generate model file using partition "\
-             "columns and flag to check lake or enterprise.")
+             "no of feature columns, no of class labels, comma separated indices "
+             "of partition columns, comma separated types of all columns, model file prefix to "\
+             "generate model file using partition columns and flag to check lake or enterprise.")
 is_lake_system = eval(sys.argv[7])
 if not is_lake_system:
@@ -65,10 +53,12 @@ if not is_lake_system:
 func_name = sys.argv[1]
 n_f_cols = int(sys.argv[2])
 n_c_labels = int(sys.argv[3])
-data_partition_column_types = splitter(sys.argv[5])
+data_column_types = splitter(sys.argv[5], delim="--")
 data_partition_column_indices = splitter(sys.argv[4], convert_to="int") # indices are integers.
 model_file_prefix = sys.argv[6]
+data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
 model = None
 # Data Format (n_features, k_labels, one data_partition_column):
@@ -87,9 +77,10 @@ while 1:
             break
         else:
             values = line.split(DELIMITER)
-            features.append(get_values_list(values[:n_f_cols]))
+            values = get_values_list(values, data_column_types)
+            features.append(values[:n_f_cols])
             if n_c_labels > 0:
-                labels.append(get_values_list(values[n_f_cols:(n_f_cols+n_c_labels)]))
+                labels.append(values[n_f_cols:(n_f_cols+n_c_labels)])
             if not data_partition_column_values:
                 # Partition column values is same for all rows. Hence, only read once.
@@ -119,10 +110,19 @@ while 1:
 if len(features) == 0:
     sys.exit(0)
+model_name = model.__class__.__name__
+np_func_list = ["MultiOutputClassifier", "GaussianMixture"]
+if model_name in np_func_list:
+    features = np.array(features)
 if labels:
-    val = getattr(model, func_name)(np.array(features), np.array(labels))
+    if model_name in np_func_list:
+        labels = np.array(labels)
+    val = getattr(model, func_name)(features, labels)
 else:
-    val = getattr(model, func_name)(np.array(features))
+    val = getattr(model, func_name)(features)
-result_val = ['' if (val is None or math.isnan(val) or math.isinf(val)) else val]
-print(*(data_partition_column_values + result_val), sep=DELIMITER)
+result_val = ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val]
+print(*(data_partition_column_values + result_val), sep=DELIMITER)

teradataml/data/scripts/sklearn/sklearn_transform.py CHANGED Viewed

@@ -4,41 +4,24 @@ import os
 import sys
 import numpy as np
-# The below import is needed to convert sparse matrix to dense array as sparse matrices are NOT
-# supported in Vantage.
-# This is in scipy 1.10.0. Might vary based on scipy version.
-from scipy.sparse import csr_matrix
 DELIMITER = '\t'
-def get_value(value):
-    ret_val = value
-    try:
-        ret_val = float(value.replace(' ', ''))
-    except Exception as ex:
-        # If the value can't be converted to float, then it is string.
-        pass
-    return ret_val
-def get_values_list(values, ignore_none=True):
+def get_values_list(values, types):
     ret_vals = []
-    for val in values:
-        if val == "" and ignore_none:
-            # Empty cell value in the database table.
-            continue
-        ret_vals.append(get_value(val))
+    for i, val in enumerate(values):
+        ret_vals.append(convert_to_type(val, types[i]))
     return ret_vals
 def convert_to_type(val, typee):
     if typee == 'int':
-        return int(val)
+        return int(val) if val != "" else np.nan
     if typee == 'float':
-        return float(val)
+        if isinstance(val, str):
+            val = val.replace(' ', '')
+        return float(val) if val != "" else np.nan
     if typee == 'bool':
-        return eval(val)
-    return str(val)
+        return eval(val) if val != "" else None
+    return str(val) if val != "" else None
 def splitter(strr, delim=",", convert_to="str"):
     """
@@ -54,7 +37,7 @@ def get_output_data(trans_values, func_name, model_obj, n_c_labels):
     # supported in Vantage.
     module_name = model_obj.__module__.split("._")[0]
-    if isinstance(trans_values, csr_matrix):
+    if type(trans_values).__name__ in ["csr_matrix", "csc_matrix"]:
         trans_values = trans_values.toarray()
     if module_name == "sklearn.cross_decomposition" and n_c_labels > 0 and func_name == "transform":
@@ -86,13 +69,13 @@ if len(sys.argv) != 8:
     # 3. No of feature columns.
     # 4. No of class labels.
     # 5. Comma separated indices of partition columns.
-    # 6. Comma separated types of the partition columns.
+    # 6. Comma separated types of all the data columns.
     # 7. Model file prefix to generated model file using partition columns.
     # 8. Flag to check the system type. True, means Lake, Enterprise otherwise.
     sys.exit("8 arguments should be passed to this file - file to be run, function name, "\
-             "no of feature columns, no of class labels, comma separated indices and types of "\
-             "partition columns, model file prefix to generate model file using partition "\
-             "columns and flag to check lake or enterprise.")
+             "no of feature columns, no of class labels, comma separated indices of partition "\
+             "columns, comma separated types of all columns, model file prefix to generate model "\
+             "file using partition columns and flag to check lake or enterprise.")
 is_lake_system = eval(sys.argv[7])
 if not is_lake_system:
@@ -100,18 +83,23 @@ if not is_lake_system:
 func_name = sys.argv[1]
 n_f_cols = int(sys.argv[2])
 n_c_labels = int(sys.argv[3])
-data_partition_column_types = splitter(sys.argv[5])
+data_column_types = splitter(sys.argv[5], delim="--")
 data_partition_column_indices = splitter(sys.argv[4], convert_to="int") # indices are integers.
 model_file_prefix = sys.argv[6]
+data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
 model = None
 data_partition_column_values = []
+missing_indicator_input = []
 # Data Format:
 # feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
 # data_partition_columnn.
 # label is optional (it is present when label_exists is not "None")
+model_name = ""
 while 1:
     try:
         line = input()
@@ -119,6 +107,7 @@ while 1:
             break
         else:
             values = line.split(DELIMITER)
+            values = get_values_list(values, data_column_types)
             if not data_partition_column_values:
                 # Partition column values is same for all rows. Hence, only read once.
                 for i, val in enumerate(data_partition_column_indices):
@@ -141,10 +130,34 @@ while 1:
                 if not model:
                     sys.exit("Model file is not installed in Vantage.")
-            f_ = get_values_list(values[:n_f_cols])
+            f_ = values[:n_f_cols]
+            model_name = model.__class__.__name__
+            np_func_list = ["ClassifierChain", "EllipticEnvelope", "MinCovDet",
+                            "FeatureAgglomeration", "LabelBinarizer", "MultiLabelBinarizer"]
+            # MissingIndicator requires processing the entire dataset simultaneously,
+            # rather than on a row-by-row basis.
+            # Error raised during row-by-row processing -
+            # "ValueError: MissingIndicator does not support data with dtype <U13.
+            # Please provide either a numeric array (with a floating point or
+            # integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype."
+            if model_name == "MissingIndicator" and func_name == "transform":
+                missing_indicator_input.append(f_)
+                continue
+            f__ = np.array([f_]) if model_name in np_func_list or \
+                                    (model_name == "SimpleImputer" and func_name == "inverse_transform")\
+                else [f_]
             if n_c_labels > 0:
                 # Labels are present in last column.
-                l_ = get_values_list(values[n_f_cols:n_f_cols+n_c_labels])
+                l_ = values[n_f_cols:n_f_cols+n_c_labels]
+                l__ = np.array([l_]) if model_name in np_func_list or \
+                                        (model_name == "SimpleImputer" and func_name == "inverse_transform")\
+                    else [l_]
                 # predict() now takes 'y' also for it to return the labels from script. Skipping 'y'
                 # in function call. Generally, 'y' is passed to return y along with actual output.
                 try:
@@ -152,17 +165,17 @@ while 1:
                     # used 'in' in the if condition, as model.__module__ is giving
                     # 'sklearn.cross_decomposition._pls'.
                     if "cross_decomposition" in model.__module__:
-                        trans_values = getattr(model, func_name)(X=np.array([f_]), Y=np.array([l_]))
+                        trans_values = getattr(model, func_name)(X=f__, Y=l__)
                     else:
-                        trans_values = getattr(model, func_name)(X=np.array([f_]), y=np.array([l_]))
+                        trans_values = getattr(model, func_name)(X=f__, y=l__)
                 except TypeError as ex:
                     # Function which does not accept 'y' like predict_proba() raises error like
                     # "TypeError: predict_proba() takes 2 positional arguments but 3 were given".
-                    trans_values = getattr(model, func_name)(np.array([f_]))
+                    trans_values = getattr(model, func_name)(f__)
             else:
                 # If class labels do not exist in data, don't read labels, read just features.
-                trans_values = getattr(model, func_name)(np.array([f_]))
+                trans_values = getattr(model, func_name)(f__)
             result_list = f_
             if n_c_labels > 0 and func_name in ["predict", "decision_function"]:
@@ -170,10 +183,40 @@ while 1:
             result_list += get_output_data(trans_values=trans_values, func_name=func_name,
                                            model_obj=model, n_c_labels=n_c_labels)
-            print(*(data_partition_column_values +
-                    ['' if (val is None or math.isnan(val) or math.isinf(val))
-                     else val for val in result_list]),
-                     sep=DELIMITER)
+            for i, val in enumerate(result_list):
+                if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))):
+                    result_list[i] = ""
+                # MissingIndicator returns boolean values. Convert them to 0/1.
+                elif val == False:
+                    result_list[i] = 0
+                elif val == True:
+                    result_list[i] = 1
+            print(*(data_partition_column_values + result_list), sep=DELIMITER)
     except EOFError:  # Exit if reached EOF or CTRL-D
         break
+# MissingIndicator needs processing of all the dataset at the same time, instead of row by row.
+# Hence, handling it outside of the while loop
+if model_name == "MissingIndicator" and func_name == "transform":
+    m_out = model.transform(missing_indicator_input)
+    for j, vals in enumerate(missing_indicator_input):
+        m_out_list = get_output_data(trans_values=m_out[j], func_name=func_name,
+                                     model_obj=model, n_c_labels=n_c_labels)
+        result_list = missing_indicator_input[j] + m_out_list
+        for i, val in enumerate(result_list):
+            if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))):
+                result_list[i] = ""
+            # MissingIndicator returns boolean values. Convert them to 0/1.
+            elif val == False:
+                result_list[i] = 0
+            elif val == True:
+                result_list[i] = 1
+        print(*(data_partition_column_values + result_list), sep=DELIMITER)

teradataml/data/star_pivot.csv ADDED Viewed

@@ -0,0 +1,8 @@
+country,state,yr,qtr,sales,cogs,rating
+USA,CA,2001,Q1,30,15,A
+USA,NY,2001,Q1,45,25,D
+USA,CA,2001,Q2,50,20,A
+USA,CA,2001,Q2,5,5,B
+Canada,ON,2001,Q2,10,0,B
+Canada,BC,2001,Q3,15,0,A
+Canada,BC,2001,Q3,10,0,A

teradataml/data/templates/open_source_ml.json CHANGED Viewed

@@ -2,7 +2,8 @@
   "env_specs": [
     {
       "env_name": "openml_env",
-      "libs": "scikit-learn",
+      "libs": ["scikit-learn==1.5.1", "joblib==1.4.2", "numpy==2.0.0",
+               "scipy==1.14.0", "threadpoolctl==3.5.0"],
       "desc": "DONT DELETE: OpenML environment"
     }
   ]

teradataml/data/teradataml_example.json CHANGED Viewed

@@ -1271,6 +1271,102 @@
     "height":"INTEGER",
     "weight":"INTEGER",
     "bmi":"INTEGER"
+  },
+  "breast_cancer":{
+    "id":"BIGINT",
+    "diagnosis":"VARCHAR(20)",
+    "radius_mean":"FLOAT",
+    "texture_mean":"FLOAT",
+    "perimeter_mean":"FLOAT",
+    "area_mean":"FLOAT",
+    "smoothness_mean":"FLOAT",
+    "compactness_mean":"FLOAT",
+    "concavity_mean":"FLOAT",
+    "concave_points_mean":"FLOAT",
+    "symmetry_mean":"FLOAT",
+    "fractal_dimension_mean":"FLOAT",
+    "radius_se":"FLOAT",
+    "texture_se":"FLOAT",
+    "perimeter_se":"FLOAT",
+    "area_se":"FLOAT",
+    "smoothness_se":"FLOAT",
+    "compactness_se":"FLOAT",
+    "concavity_se":"FLOAT",
+    "concave_points_se":"FLOAT",
+    "symmetry_se":"FLOAT",
+    "fractal_dimension_se":"FLOAT",
+    "radius_worst":"FLOAT",
+    "texture_worst":"FLOAT",
+    "perimeter_worst":"FLOAT",
+    "area_worst":"FLOAT",
+    "smoothness_worst":"FLOAT",
+    "compactness_worst":"FLOAT",
+    "concavity_worst":"FLOAT",
+    "concave_points_worst":"FLOAT",
+    "symmetry_worst":"FLOAT",
+    "fractal_dimension_worst":"FLOAT"
+  },
+  "bike_sharing" :{
+    "instant":"BIGINT",
+    "dteday":"DATE FORMAT 'dd-mm-yyyy'",
+    "season":"BIGINT",
+    "yr":"BIGINT",
+    "mnth":"BIGINT",
+    "holiday":"BIGINT",
+    "weekday":"BIGINT",
+    "workingday":"BIGINT",
+    "weathersit":"BIGINT",
+    "temp":"FLOAT",
+    "atemp":"FLOAT",
+    "hum":"FLOAT",
+    "windspeed":"FLOAT",
+    "casual":"BIGINT",
+    "registered":"BIGINT",
+    "cnt":"BIGINT"
+  },
+  "bank_marketing":{
+    "age":"BIGINT",
+    "job":"VARCHAR(20)",
+    "marital":"VARCHAR(20)",
+    "education":"VARCHAR(20)",
+    "default_value":"VARCHAR(20)",
+    "balance":"BIGINT",
+    "housing":"VARCHAR(20)",
+    "loan":"VARCHAR(20)",
+    "contact":"VARCHAR(20)",
+    "day_of_month":"BIGINT",
+    "month_of_year":"VARCHAR(20)",
+    "duration":"BIGINT",
+    "campaign":"BIGINT",
+    "pdays":"BIGINT",
+    "previous":"BIGINT",
+    "poutcome":"VARCHAR(20)",
+    "deposit":"VARCHAR(20)"
+  },
+  "advertising":{
+    "TV":"FLOAT",
+    "radio":"FLOAT",
+    "newspaper":"FLOAT",
+    "sales":"FLOAT"
+  },
+  "timestamp_data":{
+    "id": "INTEGER",
+    "timestamp_col": "VARCHAR(50)",
+    "timestamp_col1": "BIGINT",
+    "format_col": "VARCHAR(50)",
+    "timezone_col": "VARCHAR(50)"
+  },
+  "interval_data":{
+    "id": "INTEGER",
+    "int_col": "BIGINT",
+    "value_col": "VARCHAR(30)",
+    "value_col1": "VARCHAR(30)",
+    "str_col1": "VARCHAR(30)",
+    "str_col2": "VARCHAR(30)"
+  },
+  "url_data": {
+    "id": "INTEGER",
+    "urls": "VARCHAR(60)",
+    "part": "VARCHAR(20)"
   }
 }

teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl

Potentially problematic release.

teradataml 20.0.0.0py3-none-any.whl → 20.0.0.2py3-none-any.whl