teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +183 -0
- teradataml/__init__.py +6 -3
- teradataml/_version.py +2 -2
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +275 -40
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +1 -0
- teradataml/analytics/json_parser/utils.py +17 -21
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +10 -2
- teradataml/analytics/table_operator/__init__.py +3 -2
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +62 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1553 -319
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +276 -319
- teradataml/automl/data_transformation.py +163 -81
- teradataml/automl/feature_engineering.py +402 -239
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +48 -51
- teradataml/automl/model_training.py +291 -189
- teradataml/catalog/byom.py +8 -8
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +48 -6
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +156 -120
- teradataml/common/messagecodes.py +6 -1
- teradataml/common/messages.py +3 -1
- teradataml/common/sqlbundle.py +1 -1
- teradataml/common/utils.py +103 -11
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +121 -31
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/dataframe_example.json +10 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/Shap.py +197 -0
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/glm_example.json +28 -1
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +208 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +400 -0
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +21 -2
- teradataml/data/scripts/sklearn/sklearn_fit.py +40 -37
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +22 -30
- teradataml/data/scripts/sklearn/sklearn_function.template +42 -24
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +19 -28
- teradataml/data/scripts/sklearn/sklearn_score.py +32 -32
- teradataml/data/scripts/sklearn/sklearn_transform.py +85 -42
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/templates/open_source_ml.json +2 -1
- teradataml/data/teradataml_example.json +97 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +9 -4
- teradataml/dataframe/data_transfer.py +125 -64
- teradataml/dataframe/dataframe.py +575 -57
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +273 -90
- teradataml/dataframe/functions.py +339 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +740 -18
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +324 -18
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +254 -122
- teradataml/options/__init__.py +16 -5
- teradataml/options/configure.py +39 -6
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +26 -19
- teradataml/scriptmgmt/lls_utils.py +120 -16
- teradataml/table_operators/Script.py +4 -5
- teradataml/table_operators/TableOperator.py +160 -26
- teradataml/table_operators/table_operator_util.py +88 -41
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/validators.py +41 -3
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/METADATA +191 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/RECORD +263 -185
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.2.dist-info}/zip-safe +0 -0
|
@@ -6,35 +6,22 @@ import base64
|
|
|
6
6
|
|
|
7
7
|
DELIMITER = '\t'
|
|
8
8
|
|
|
9
|
-
|
|
10
|
-
def get_value(value):
|
|
11
|
-
ret_val = value
|
|
12
|
-
try:
|
|
13
|
-
ret_val = round(float("".join(value.split())), 2)
|
|
14
|
-
except Exception as ex:
|
|
15
|
-
# If the value can't be converted to float, then it is string.
|
|
16
|
-
pass
|
|
17
|
-
return ret_val
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def get_values_list(values, ignore_none=True):
|
|
9
|
+
def get_values_list(values, types):
|
|
21
10
|
ret_vals = []
|
|
22
|
-
for val in values:
|
|
23
|
-
|
|
24
|
-
# Empty cell value in the database table.
|
|
25
|
-
continue
|
|
26
|
-
ret_vals.append(get_value(val))
|
|
27
|
-
|
|
11
|
+
for i, val in enumerate(values):
|
|
12
|
+
ret_vals.append(convert_to_type(val, types[i]))
|
|
28
13
|
return ret_vals
|
|
29
14
|
|
|
30
15
|
def convert_to_type(val, typee):
|
|
31
16
|
if typee == 'int':
|
|
32
|
-
return int(val)
|
|
17
|
+
return int(val) if val != "" else np.nan
|
|
33
18
|
if typee == 'float':
|
|
34
|
-
|
|
19
|
+
if isinstance(val, str):
|
|
20
|
+
val = val.replace(' ', '')
|
|
21
|
+
return float(val) if val != "" else np.nan
|
|
35
22
|
if typee == 'bool':
|
|
36
|
-
return eval(val)
|
|
37
|
-
return str(val)
|
|
23
|
+
return eval(val) if val != "" else None
|
|
24
|
+
return str(val) if val != "" else None
|
|
38
25
|
|
|
39
26
|
def splitter(strr, delim=",", convert_to="str"):
|
|
40
27
|
"""
|
|
@@ -54,13 +41,14 @@ if len(sys.argv) != 9:
|
|
|
54
41
|
# 4. No of class labels.
|
|
55
42
|
# 5. No of group columns.
|
|
56
43
|
# 6. Comma separated indices of partition columns.
|
|
57
|
-
# 7. Comma separated types of the
|
|
44
|
+
# 7. Comma separated types of all the data columns.
|
|
58
45
|
# 8. Model file prefix to generated model file using partition columns.
|
|
59
46
|
# 9. Flag to check the system type. True, means Lake, Enterprise otherwise.
|
|
60
47
|
sys.exit("9 arguments command line arguments should be passed: file to be run,"
|
|
61
48
|
" function name, no of feature columns, no of class labels, no of group columns,"
|
|
62
|
-
" comma separated indices
|
|
63
|
-
" generated model file using partition columns and flag to check
|
|
49
|
+
" comma separated indices of partition columns, comma separated types of all columns,"
|
|
50
|
+
" model file prefix to generated model file using partition columns and flag to check"
|
|
51
|
+
" lake or enterprise.")
|
|
64
52
|
|
|
65
53
|
|
|
66
54
|
is_lake_system = eval(sys.argv[8])
|
|
@@ -70,10 +58,11 @@ function_name = sys.argv[1]
|
|
|
70
58
|
n_f_cols = int(sys.argv[2])
|
|
71
59
|
n_c_labels = int(sys.argv[3])
|
|
72
60
|
n_g_cols = int(sys.argv[4])
|
|
73
|
-
|
|
61
|
+
data_column_types = splitter(sys.argv[6], delim="--")
|
|
74
62
|
data_partition_column_indices = splitter(sys.argv[5], convert_to="int") # indices are integers.
|
|
75
63
|
model_file_prefix = sys.argv[7]
|
|
76
64
|
|
|
65
|
+
data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
|
|
77
66
|
|
|
78
67
|
model = None
|
|
79
68
|
data_partition_column_values = []
|
|
@@ -93,6 +82,7 @@ while 1:
|
|
|
93
82
|
break
|
|
94
83
|
else:
|
|
95
84
|
values = line.split(DELIMITER)
|
|
85
|
+
values = get_values_list(values, data_column_types)
|
|
96
86
|
if not data_partition_column_values:
|
|
97
87
|
# Partition column values is same for all rows. Hence, only read once.
|
|
98
88
|
for i, val in enumerate(data_partition_column_indices):
|
|
@@ -117,13 +107,13 @@ while 1:
|
|
|
117
107
|
|
|
118
108
|
start = 0
|
|
119
109
|
if n_f_cols > 0:
|
|
120
|
-
features.append(
|
|
110
|
+
features.append(values[:n_f_cols])
|
|
121
111
|
start = start + n_f_cols
|
|
122
112
|
if n_c_labels > 0:
|
|
123
|
-
labels.append(
|
|
113
|
+
labels.append(values[start:(start+n_c_labels)])
|
|
124
114
|
start = start + n_c_labels
|
|
125
115
|
if n_g_cols > 0:
|
|
126
|
-
groups.append(
|
|
116
|
+
groups.append(values[start:(start+n_g_cols)])
|
|
127
117
|
|
|
128
118
|
except EOFError: # Exit if reached EOF or CTRL-D
|
|
129
119
|
break
|
|
@@ -144,14 +134,14 @@ if function_name == "split":
|
|
|
144
134
|
y_train, y_test = labels[train_idx], labels[test_idx]
|
|
145
135
|
for X, y in zip(X_train, y_train):
|
|
146
136
|
print(*(data_partition_column_values + [split_id, "train"] +
|
|
147
|
-
['' if (val is None or math.isnan(val) or math.isinf(val)) else val
|
|
137
|
+
['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
|
|
148
138
|
for val in X] + [y]
|
|
149
|
-
),sep=DELIMITER)
|
|
139
|
+
), sep=DELIMITER)
|
|
150
140
|
for X, y in zip(X_test, y_test):
|
|
151
141
|
print(*(data_partition_column_values + [split_id, "test"] +
|
|
152
|
-
['' if (val is None or math.isnan(val) or math.isinf(val)) else val
|
|
142
|
+
['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val
|
|
153
143
|
for val in X] + [y]
|
|
154
|
-
),sep=DELIMITER)
|
|
144
|
+
), sep=DELIMITER)
|
|
155
145
|
split_id += 1
|
|
156
146
|
else:
|
|
157
147
|
val = getattr(model, function_name)(features, labels, groups)
|
|
@@ -12,34 +12,22 @@ from scipy.sparse.csr import csr_matrix
|
|
|
12
12
|
DELIMITER = '\t'
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
def
|
|
16
|
-
ret_val = value
|
|
17
|
-
try:
|
|
18
|
-
ret_val = float(value.replace(' ', ''))
|
|
19
|
-
except Exception as ex:
|
|
20
|
-
# If the value can't be converted to float, then it is string.
|
|
21
|
-
pass
|
|
22
|
-
return ret_val
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def get_values_list(values, ignore_none=True):
|
|
15
|
+
def get_values_list(values, types):
|
|
26
16
|
ret_vals = []
|
|
27
|
-
for val in values:
|
|
28
|
-
|
|
29
|
-
# Empty cell value in the database table.
|
|
30
|
-
continue
|
|
31
|
-
ret_vals.append(get_value(val))
|
|
32
|
-
|
|
17
|
+
for i, val in enumerate(values):
|
|
18
|
+
ret_vals.append(convert_to_type(val, types[i]))
|
|
33
19
|
return ret_vals
|
|
34
20
|
|
|
35
21
|
def convert_to_type(val, typee):
|
|
36
22
|
if typee == 'int':
|
|
37
|
-
return int(val)
|
|
23
|
+
return int(val) if val != "" else np.nan
|
|
38
24
|
if typee == 'float':
|
|
39
|
-
|
|
25
|
+
if isinstance(val, str):
|
|
26
|
+
val = val.replace(' ', '')
|
|
27
|
+
return float(val) if val != "" else np.nan
|
|
40
28
|
if typee == 'bool':
|
|
41
|
-
return eval(val)
|
|
42
|
-
return str(val)
|
|
29
|
+
return eval(val) if val != "" else None
|
|
30
|
+
return str(val) if val != "" else None
|
|
43
31
|
|
|
44
32
|
def splitter(strr, delim=",", convert_to="str"):
|
|
45
33
|
"""
|
|
@@ -57,15 +45,15 @@ if len(sys.argv) < 7:
|
|
|
57
45
|
# 2. function name.
|
|
58
46
|
# 3. No of feature columns.
|
|
59
47
|
# 4. Comma separated indices of partition columns.
|
|
60
|
-
# 5. Comma separated types of the
|
|
48
|
+
# 5. Comma separated types of all the data columns.
|
|
61
49
|
# 6. Model file prefix to generate model file using partition columns.
|
|
62
50
|
# 7. Flag to check the system type. True, means Lake, Enterprise otherwise.
|
|
63
51
|
# 8. OPTIONAL - Arguments in string format like "return_distance True-bool",
|
|
64
52
|
# "n_neighbors 3-int", "radius 3.4-float" etc.
|
|
65
53
|
sys.exit("At least 7 arguments should be passed to this file - file to be run, function name, "\
|
|
66
|
-
"no of feature columns, comma separated indices
|
|
67
|
-
"model file prefix to generate model file using
|
|
68
|
-
"lake or enterprise and optional arguments in string format.")
|
|
54
|
+
"no of feature columns, comma separated indices of partition columns, comma "\
|
|
55
|
+
"separated types of all columns, model file prefix to generate model file using "\
|
|
56
|
+
"partition columns, flag to check lake or enterprise and optional arguments in string format.")
|
|
69
57
|
|
|
70
58
|
convert_to_int = lambda x: int(x) if x != "None" else None
|
|
71
59
|
|
|
@@ -74,7 +62,7 @@ if not is_lake_system:
|
|
|
74
62
|
db = sys.argv[0].split("/")[1]
|
|
75
63
|
func_name = sys.argv[1]
|
|
76
64
|
n_f_cols = convert_to_int(sys.argv[2])
|
|
77
|
-
|
|
65
|
+
data_column_types = splitter(sys.argv[4], delim="--")
|
|
78
66
|
data_partition_column_indices = splitter(sys.argv[3], convert_to="int") # indices are integers.
|
|
79
67
|
model_file_prefix = sys.argv[5]
|
|
80
68
|
# Extract arguments from string.
|
|
@@ -83,6 +71,8 @@ for i in range(7, len(sys.argv), 2):
|
|
|
83
71
|
value = sys.argv[i + 1].split("-", 1)
|
|
84
72
|
arguments[sys.argv[i]] = convert_to_type(value[0], value[1])
|
|
85
73
|
|
|
74
|
+
data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
|
|
75
|
+
|
|
86
76
|
model = None
|
|
87
77
|
data_partition_column_values = []
|
|
88
78
|
|
|
@@ -101,6 +91,7 @@ while 1:
|
|
|
101
91
|
break
|
|
102
92
|
else:
|
|
103
93
|
values = line.split(DELIMITER)
|
|
94
|
+
values = get_values_list(values, data_column_types)
|
|
104
95
|
if not data_partition_column_values:
|
|
105
96
|
# Partition column values is same for all rows. Hence, only read once.
|
|
106
97
|
for i, val in enumerate(data_partition_column_indices):
|
|
@@ -123,9 +114,9 @@ while 1:
|
|
|
123
114
|
if not model:
|
|
124
115
|
sys.exit("Model file is not installed in Vantage.")
|
|
125
116
|
|
|
126
|
-
f_ =
|
|
117
|
+
f_ = values[:n_f_cols]
|
|
127
118
|
if f_:
|
|
128
|
-
output = getattr(model, func_name)(
|
|
119
|
+
output = getattr(model, func_name)([f_], **arguments)
|
|
129
120
|
else:
|
|
130
121
|
output = getattr(model, func_name)(**arguments)
|
|
131
122
|
result_list = f_
|
|
@@ -6,34 +6,22 @@ import numpy as np
|
|
|
6
6
|
DELIMITER = '\t'
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
def
|
|
10
|
-
ret_val = value
|
|
11
|
-
try:
|
|
12
|
-
ret_val = float("".join(value.split()))
|
|
13
|
-
except Exception as ex:
|
|
14
|
-
# If the value can't be converted to float, then it is string.
|
|
15
|
-
pass
|
|
16
|
-
return ret_val
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def get_values_list(values, ignore_none=True):
|
|
9
|
+
def get_values_list(values, types):
|
|
20
10
|
ret_vals = []
|
|
21
|
-
for val in values:
|
|
22
|
-
|
|
23
|
-
# Empty cell value in the database table.
|
|
24
|
-
continue
|
|
25
|
-
ret_vals.append(get_value(val))
|
|
26
|
-
|
|
11
|
+
for i, val in enumerate(values):
|
|
12
|
+
ret_vals.append(convert_to_type(val, types[i]))
|
|
27
13
|
return ret_vals
|
|
28
14
|
|
|
29
15
|
def convert_to_type(val, typee):
|
|
30
16
|
if typee == 'int':
|
|
31
|
-
return int(val)
|
|
17
|
+
return int(val) if val != "" else np.nan
|
|
32
18
|
if typee == 'float':
|
|
33
|
-
|
|
19
|
+
if isinstance(val, str):
|
|
20
|
+
val = val.replace(' ', '')
|
|
21
|
+
return float(val) if val != "" else np.nan
|
|
34
22
|
if typee == 'bool':
|
|
35
|
-
return
|
|
36
|
-
return str(val)
|
|
23
|
+
return eval(val) if val != "" else None
|
|
24
|
+
return str(val) if val != "" else None
|
|
37
25
|
|
|
38
26
|
def splitter(strr, delim=",", convert_to="str"):
|
|
39
27
|
"""
|
|
@@ -51,13 +39,13 @@ if len(sys.argv) != 8:
|
|
|
51
39
|
# 3. No of feature columns.
|
|
52
40
|
# 4. No of class labels.
|
|
53
41
|
# 5. Comma separated indices of partition columns.
|
|
54
|
-
# 6. Comma separated types of the
|
|
42
|
+
# 6. Comma separated types of all the data columns.
|
|
55
43
|
# 7. Model file prefix to generated model file using partition columns.
|
|
56
44
|
# 8. Flag to check the system type. True, means Lake, Enterprise otherwise.
|
|
57
45
|
sys.exit("8 arguments should be passed to this file - file to be run, function name, "\
|
|
58
|
-
"no of feature columns, no of class labels, comma separated indices
|
|
59
|
-
"partition columns,
|
|
60
|
-
"columns and flag to check lake or enterprise.")
|
|
46
|
+
"no of feature columns, no of class labels, comma separated indices "
|
|
47
|
+
"of partition columns, comma separated types of all columns, model file prefix to "\
|
|
48
|
+
"generate model file using partition columns and flag to check lake or enterprise.")
|
|
61
49
|
|
|
62
50
|
is_lake_system = eval(sys.argv[7])
|
|
63
51
|
if not is_lake_system:
|
|
@@ -65,10 +53,12 @@ if not is_lake_system:
|
|
|
65
53
|
func_name = sys.argv[1]
|
|
66
54
|
n_f_cols = int(sys.argv[2])
|
|
67
55
|
n_c_labels = int(sys.argv[3])
|
|
68
|
-
|
|
56
|
+
data_column_types = splitter(sys.argv[5], delim="--")
|
|
69
57
|
data_partition_column_indices = splitter(sys.argv[4], convert_to="int") # indices are integers.
|
|
70
58
|
model_file_prefix = sys.argv[6]
|
|
71
59
|
|
|
60
|
+
data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
|
|
61
|
+
|
|
72
62
|
model = None
|
|
73
63
|
|
|
74
64
|
# Data Format (n_features, k_labels, one data_partition_column):
|
|
@@ -87,9 +77,10 @@ while 1:
|
|
|
87
77
|
break
|
|
88
78
|
else:
|
|
89
79
|
values = line.split(DELIMITER)
|
|
90
|
-
|
|
80
|
+
values = get_values_list(values, data_column_types)
|
|
81
|
+
features.append(values[:n_f_cols])
|
|
91
82
|
if n_c_labels > 0:
|
|
92
|
-
labels.append(
|
|
83
|
+
labels.append(values[n_f_cols:(n_f_cols+n_c_labels)])
|
|
93
84
|
|
|
94
85
|
if not data_partition_column_values:
|
|
95
86
|
# Partition column values is same for all rows. Hence, only read once.
|
|
@@ -119,10 +110,19 @@ while 1:
|
|
|
119
110
|
if len(features) == 0:
|
|
120
111
|
sys.exit(0)
|
|
121
112
|
|
|
113
|
+
|
|
114
|
+
model_name = model.__class__.__name__
|
|
115
|
+
np_func_list = ["MultiOutputClassifier", "GaussianMixture"]
|
|
116
|
+
|
|
117
|
+
if model_name in np_func_list:
|
|
118
|
+
features = np.array(features)
|
|
119
|
+
|
|
122
120
|
if labels:
|
|
123
|
-
|
|
121
|
+
if model_name in np_func_list:
|
|
122
|
+
labels = np.array(labels)
|
|
123
|
+
val = getattr(model, func_name)(features, labels)
|
|
124
124
|
else:
|
|
125
|
-
val = getattr(model, func_name)(
|
|
125
|
+
val = getattr(model, func_name)(features)
|
|
126
126
|
|
|
127
|
-
result_val = ['' if (val is None or math.isnan(val) or math.isinf(val)) else val]
|
|
128
|
-
print(*(data_partition_column_values + result_val), sep=DELIMITER)
|
|
127
|
+
result_val = ['' if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))) else val]
|
|
128
|
+
print(*(data_partition_column_values + result_val), sep=DELIMITER)
|
|
@@ -4,41 +4,24 @@ import os
|
|
|
4
4
|
import sys
|
|
5
5
|
import numpy as np
|
|
6
6
|
|
|
7
|
-
# The below import is needed to convert sparse matrix to dense array as sparse matrices are NOT
|
|
8
|
-
# supported in Vantage.
|
|
9
|
-
# This is in scipy 1.10.0. Might vary based on scipy version.
|
|
10
|
-
from scipy.sparse import csr_matrix
|
|
11
|
-
|
|
12
7
|
DELIMITER = '\t'
|
|
13
8
|
|
|
14
|
-
def
|
|
15
|
-
ret_val = value
|
|
16
|
-
try:
|
|
17
|
-
ret_val = float(value.replace(' ', ''))
|
|
18
|
-
except Exception as ex:
|
|
19
|
-
# If the value can't be converted to float, then it is string.
|
|
20
|
-
pass
|
|
21
|
-
return ret_val
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
def get_values_list(values, ignore_none=True):
|
|
9
|
+
def get_values_list(values, types):
|
|
25
10
|
ret_vals = []
|
|
26
|
-
for val in values:
|
|
27
|
-
|
|
28
|
-
# Empty cell value in the database table.
|
|
29
|
-
continue
|
|
30
|
-
ret_vals.append(get_value(val))
|
|
31
|
-
|
|
11
|
+
for i, val in enumerate(values):
|
|
12
|
+
ret_vals.append(convert_to_type(val, types[i]))
|
|
32
13
|
return ret_vals
|
|
33
14
|
|
|
34
15
|
def convert_to_type(val, typee):
|
|
35
16
|
if typee == 'int':
|
|
36
|
-
return int(val)
|
|
17
|
+
return int(val) if val != "" else np.nan
|
|
37
18
|
if typee == 'float':
|
|
38
|
-
|
|
19
|
+
if isinstance(val, str):
|
|
20
|
+
val = val.replace(' ', '')
|
|
21
|
+
return float(val) if val != "" else np.nan
|
|
39
22
|
if typee == 'bool':
|
|
40
|
-
return eval(val)
|
|
41
|
-
return str(val)
|
|
23
|
+
return eval(val) if val != "" else None
|
|
24
|
+
return str(val) if val != "" else None
|
|
42
25
|
|
|
43
26
|
def splitter(strr, delim=",", convert_to="str"):
|
|
44
27
|
"""
|
|
@@ -54,7 +37,7 @@ def get_output_data(trans_values, func_name, model_obj, n_c_labels):
|
|
|
54
37
|
# supported in Vantage.
|
|
55
38
|
module_name = model_obj.__module__.split("._")[0]
|
|
56
39
|
|
|
57
|
-
if
|
|
40
|
+
if type(trans_values).__name__ in ["csr_matrix", "csc_matrix"]:
|
|
58
41
|
trans_values = trans_values.toarray()
|
|
59
42
|
|
|
60
43
|
if module_name == "sklearn.cross_decomposition" and n_c_labels > 0 and func_name == "transform":
|
|
@@ -86,13 +69,13 @@ if len(sys.argv) != 8:
|
|
|
86
69
|
# 3. No of feature columns.
|
|
87
70
|
# 4. No of class labels.
|
|
88
71
|
# 5. Comma separated indices of partition columns.
|
|
89
|
-
# 6. Comma separated types of the
|
|
72
|
+
# 6. Comma separated types of all the data columns.
|
|
90
73
|
# 7. Model file prefix to generated model file using partition columns.
|
|
91
74
|
# 8. Flag to check the system type. True, means Lake, Enterprise otherwise.
|
|
92
75
|
sys.exit("8 arguments should be passed to this file - file to be run, function name, "\
|
|
93
|
-
"no of feature columns, no of class labels, comma separated indices
|
|
94
|
-
"
|
|
95
|
-
"columns and flag to check lake or enterprise.")
|
|
76
|
+
"no of feature columns, no of class labels, comma separated indices of partition "\
|
|
77
|
+
"columns, comma separated types of all columns, model file prefix to generate model "\
|
|
78
|
+
"file using partition columns and flag to check lake or enterprise.")
|
|
96
79
|
|
|
97
80
|
is_lake_system = eval(sys.argv[7])
|
|
98
81
|
if not is_lake_system:
|
|
@@ -100,18 +83,23 @@ if not is_lake_system:
|
|
|
100
83
|
func_name = sys.argv[1]
|
|
101
84
|
n_f_cols = int(sys.argv[2])
|
|
102
85
|
n_c_labels = int(sys.argv[3])
|
|
103
|
-
|
|
86
|
+
data_column_types = splitter(sys.argv[5], delim="--")
|
|
104
87
|
data_partition_column_indices = splitter(sys.argv[4], convert_to="int") # indices are integers.
|
|
105
88
|
model_file_prefix = sys.argv[6]
|
|
106
89
|
|
|
90
|
+
data_partition_column_types = [data_column_types[idx] for idx in data_partition_column_indices]
|
|
91
|
+
|
|
107
92
|
model = None
|
|
108
93
|
data_partition_column_values = []
|
|
109
94
|
|
|
95
|
+
missing_indicator_input = []
|
|
96
|
+
|
|
110
97
|
# Data Format:
|
|
111
98
|
# feature1, feature2, ..., featuren, label1, label2, ... labelk, data_partition_column1, ...,
|
|
112
99
|
# data_partition_columnn.
|
|
113
100
|
# label is optional (it is present when label_exists is not "None")
|
|
114
101
|
|
|
102
|
+
model_name = ""
|
|
115
103
|
while 1:
|
|
116
104
|
try:
|
|
117
105
|
line = input()
|
|
@@ -119,6 +107,7 @@ while 1:
|
|
|
119
107
|
break
|
|
120
108
|
else:
|
|
121
109
|
values = line.split(DELIMITER)
|
|
110
|
+
values = get_values_list(values, data_column_types)
|
|
122
111
|
if not data_partition_column_values:
|
|
123
112
|
# Partition column values is same for all rows. Hence, only read once.
|
|
124
113
|
for i, val in enumerate(data_partition_column_indices):
|
|
@@ -141,10 +130,34 @@ while 1:
|
|
|
141
130
|
if not model:
|
|
142
131
|
sys.exit("Model file is not installed in Vantage.")
|
|
143
132
|
|
|
144
|
-
f_ =
|
|
133
|
+
f_ = values[:n_f_cols]
|
|
134
|
+
|
|
135
|
+
model_name = model.__class__.__name__
|
|
136
|
+
np_func_list = ["ClassifierChain", "EllipticEnvelope", "MinCovDet",
|
|
137
|
+
"FeatureAgglomeration", "LabelBinarizer", "MultiLabelBinarizer"]
|
|
138
|
+
|
|
139
|
+
# MissingIndicator requires processing the entire dataset simultaneously,
|
|
140
|
+
# rather than on a row-by-row basis.
|
|
141
|
+
|
|
142
|
+
# Error raised during row-by-row processing:
|
|
143
|
+
# "ValueError: MissingIndicator does not support data with dtype <U13.
|
|
144
|
+
# Please provide either a numeric array (with a floating point or
|
|
145
|
+
# integer dtype) or categorical data represented either ..."
|
|
146
|
+
if model_name == "MissingIndicator" and func_name == "transform":
|
|
147
|
+
missing_indicator_input.append(f_)
|
|
148
|
+
continue
|
|
149
|
+
|
|
150
|
+
f__ = np.array([f_]) if model_name in np_func_list or \
|
|
151
|
+
(model_name == "SimpleImputer" and func_name == "inverse_transform")\
|
|
152
|
+
else [f_]
|
|
153
|
+
|
|
145
154
|
if n_c_labels > 0:
|
|
146
155
|
# Labels are present in last column.
|
|
147
|
-
l_ =
|
|
156
|
+
l_ = values[n_f_cols:n_f_cols+n_c_labels]
|
|
157
|
+
|
|
158
|
+
l__ = np.array([l_]) if model_name in np_func_list or \
|
|
159
|
+
(model_name == "SimpleImputer" and func_name == "inverse_transform")\
|
|
160
|
+
else [l_]
|
|
148
161
|
# predict() now takes 'y' also for it to return the labels from script. Skipping 'y'
|
|
149
162
|
# in function call. Generally, 'y' is passed to return y along with actual output.
|
|
150
163
|
try:
|
|
@@ -152,17 +165,17 @@ while 1:
|
|
|
152
165
|
# Used 'in' in the if condition, as model.__module__ gives
|
|
153
166
|
# 'sklearn.cross_decomposition._pls'.
|
|
154
167
|
if "cross_decomposition" in model.__module__:
|
|
155
|
-
trans_values = getattr(model, func_name)(X=
|
|
168
|
+
trans_values = getattr(model, func_name)(X=f__, Y=l__)
|
|
156
169
|
else:
|
|
157
|
-
trans_values = getattr(model, func_name)(X=
|
|
170
|
+
trans_values = getattr(model, func_name)(X=f__, y=l__)
|
|
158
171
|
|
|
159
172
|
except TypeError as ex:
|
|
160
173
|
# Function which does not accept 'y' like predict_proba() raises error like
|
|
161
174
|
# "TypeError: predict_proba() takes 2 positional arguments but 3 were given".
|
|
162
|
-
trans_values = getattr(model, func_name)(
|
|
175
|
+
trans_values = getattr(model, func_name)(f__)
|
|
163
176
|
else:
|
|
164
177
|
# If class labels do not exist in data, don't read labels, read just features.
|
|
165
|
-
trans_values = getattr(model, func_name)(
|
|
178
|
+
trans_values = getattr(model, func_name)(f__)
|
|
166
179
|
|
|
167
180
|
result_list = f_
|
|
168
181
|
if n_c_labels > 0 and func_name in ["predict", "decision_function"]:
|
|
@@ -170,10 +183,40 @@ while 1:
|
|
|
170
183
|
result_list += get_output_data(trans_values=trans_values, func_name=func_name,
|
|
171
184
|
model_obj=model, n_c_labels=n_c_labels)
|
|
172
185
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
186
|
+
for i, val in enumerate(result_list):
|
|
187
|
+
if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))):
|
|
188
|
+
result_list[i] = ""
|
|
189
|
+
# MissingIndicator returns boolean values. Convert them to 0/1.
|
|
190
|
+
elif val == False:
|
|
191
|
+
result_list[i] = 0
|
|
192
|
+
elif val == True:
|
|
193
|
+
result_list[i] = 1
|
|
194
|
+
|
|
195
|
+
print(*(data_partition_column_values + result_list), sep=DELIMITER)
|
|
177
196
|
|
|
178
197
|
except EOFError: # Exit if reached EOF or CTRL-D
|
|
179
198
|
break
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# MissingIndicator needs processing of all the dataset at the same time, instead of row by row.
|
|
202
|
+
# Hence, handling it outside of the while loop
|
|
203
|
+
if model_name == "MissingIndicator" and func_name == "transform":
|
|
204
|
+
m_out = model.transform(missing_indicator_input)
|
|
205
|
+
|
|
206
|
+
for j, vals in enumerate(missing_indicator_input):
|
|
207
|
+
|
|
208
|
+
m_out_list = get_output_data(trans_values=m_out[j], func_name=func_name,
|
|
209
|
+
model_obj=model, n_c_labels=n_c_labels)
|
|
210
|
+
|
|
211
|
+
result_list = missing_indicator_input[j] + m_out_list
|
|
212
|
+
|
|
213
|
+
for i, val in enumerate(result_list):
|
|
214
|
+
if (val is None or (not isinstance(val, str) and (math.isnan(val) or math.isinf(val)))):
|
|
215
|
+
result_list[i] = ""
|
|
216
|
+
# MissingIndicator returns boolean values. Convert them to 0/1.
|
|
217
|
+
elif val == False:
|
|
218
|
+
result_list[i] = 0
|
|
219
|
+
elif val == True:
|
|
220
|
+
result_list[i] = 1
|
|
221
|
+
|
|
222
|
+
print(*(data_partition_column_values + result_list), sep=DELIMITER)
|
|
@@ -1271,6 +1271,102 @@
|
|
|
1271
1271
|
"height":"INTEGER",
|
|
1272
1272
|
"weight":"INTEGER",
|
|
1273
1273
|
"bmi":"INTEGER"
|
|
1274
|
+
},
|
|
1275
|
+
"breast_cancer":{
|
|
1276
|
+
"id":"BIGINT",
|
|
1277
|
+
"diagnosis":"VARCHAR(20)",
|
|
1278
|
+
"radius_mean":"FLOAT",
|
|
1279
|
+
"texture_mean":"FLOAT",
|
|
1280
|
+
"perimeter_mean":"FLOAT",
|
|
1281
|
+
"area_mean":"FLOAT",
|
|
1282
|
+
"smoothness_mean":"FLOAT",
|
|
1283
|
+
"compactness_mean":"FLOAT",
|
|
1284
|
+
"concavity_mean":"FLOAT",
|
|
1285
|
+
"concave_points_mean":"FLOAT",
|
|
1286
|
+
"symmetry_mean":"FLOAT",
|
|
1287
|
+
"fractal_dimension_mean":"FLOAT",
|
|
1288
|
+
"radius_se":"FLOAT",
|
|
1289
|
+
"texture_se":"FLOAT",
|
|
1290
|
+
"perimeter_se":"FLOAT",
|
|
1291
|
+
"area_se":"FLOAT",
|
|
1292
|
+
"smoothness_se":"FLOAT",
|
|
1293
|
+
"compactness_se":"FLOAT",
|
|
1294
|
+
"concavity_se":"FLOAT",
|
|
1295
|
+
"concave_points_se":"FLOAT",
|
|
1296
|
+
"symmetry_se":"FLOAT",
|
|
1297
|
+
"fractal_dimension_se":"FLOAT",
|
|
1298
|
+
"radius_worst":"FLOAT",
|
|
1299
|
+
"texture_worst":"FLOAT",
|
|
1300
|
+
"perimeter_worst":"FLOAT",
|
|
1301
|
+
"area_worst":"FLOAT",
|
|
1302
|
+
"smoothness_worst":"FLOAT",
|
|
1303
|
+
"compactness_worst":"FLOAT",
|
|
1304
|
+
"concavity_worst":"FLOAT",
|
|
1305
|
+
"concave_points_worst":"FLOAT",
|
|
1306
|
+
"symmetry_worst":"FLOAT",
|
|
1307
|
+
"fractal_dimension_worst":"FLOAT"
|
|
1308
|
+
},
|
|
1309
|
+
"bike_sharing" :{
|
|
1310
|
+
"instant":"BIGINT",
|
|
1311
|
+
"dteday":"DATE FORMAT 'dd-mm-yyyy'",
|
|
1312
|
+
"season":"BIGINT",
|
|
1313
|
+
"yr":"BIGINT",
|
|
1314
|
+
"mnth":"BIGINT",
|
|
1315
|
+
"holiday":"BIGINT",
|
|
1316
|
+
"weekday":"BIGINT",
|
|
1317
|
+
"workingday":"BIGINT",
|
|
1318
|
+
"weathersit":"BIGINT",
|
|
1319
|
+
"temp":"FLOAT",
|
|
1320
|
+
"atemp":"FLOAT",
|
|
1321
|
+
"hum":"FLOAT",
|
|
1322
|
+
"windspeed":"FLOAT",
|
|
1323
|
+
"casual":"BIGINT",
|
|
1324
|
+
"registered":"BIGINT",
|
|
1325
|
+
"cnt":"BIGINT"
|
|
1326
|
+
},
|
|
1327
|
+
"bank_marketing":{
|
|
1328
|
+
"age":"BIGINT",
|
|
1329
|
+
"job":"VARCHAR(20)",
|
|
1330
|
+
"marital":"VARCHAR(20)",
|
|
1331
|
+
"education":"VARCHAR(20)",
|
|
1332
|
+
"default_value":"VARCHAR(20)",
|
|
1333
|
+
"balance":"BIGINT",
|
|
1334
|
+
"housing":"VARCHAR(20)",
|
|
1335
|
+
"loan":"VARCHAR(20)",
|
|
1336
|
+
"contact":"VARCHAR(20)",
|
|
1337
|
+
"day_of_month":"BIGINT",
|
|
1338
|
+
"month_of_year":"VARCHAR(20)",
|
|
1339
|
+
"duration":"BIGINT",
|
|
1340
|
+
"campaign":"BIGINT",
|
|
1341
|
+
"pdays":"BIGINT",
|
|
1342
|
+
"previous":"BIGINT",
|
|
1343
|
+
"poutcome":"VARCHAR(20)",
|
|
1344
|
+
"deposit":"VARCHAR(20)"
|
|
1345
|
+
},
|
|
1346
|
+
"advertising":{
|
|
1347
|
+
"TV":"FLOAT",
|
|
1348
|
+
"radio":"FLOAT",
|
|
1349
|
+
"newspaper":"FLOAT",
|
|
1350
|
+
"sales":"FLOAT"
|
|
1351
|
+
},
|
|
1352
|
+
"timestamp_data":{
|
|
1353
|
+
"id": "INTEGER",
|
|
1354
|
+
"timestamp_col": "VARCHAR(50)",
|
|
1355
|
+
"timestamp_col1": "BIGINT",
|
|
1356
|
+
"format_col": "VARCHAR(50)",
|
|
1357
|
+
"timezone_col": "VARCHAR(50)"
|
|
1358
|
+
},
|
|
1359
|
+
"interval_data":{
|
|
1360
|
+
"id": "INTEGER",
|
|
1361
|
+
"int_col": "BIGINT",
|
|
1362
|
+
"value_col": "VARCHAR(30)",
|
|
1363
|
+
"value_col1": "VARCHAR(30)",
|
|
1364
|
+
"str_col1": "VARCHAR(30)",
|
|
1365
|
+
"str_col2": "VARCHAR(30)"
|
|
1366
|
+
},
|
|
1367
|
+
"url_data": {
|
|
1368
|
+
"id": "INTEGER",
|
|
1369
|
+
"urls": "VARCHAR(60)",
|
|
1370
|
+
"part": "VARCHAR(20)"
|
|
1274
1371
|
}
|
|
1275
|
-
|
|
1276
1372
|
}
|