teradataml 20.0.0.0__py3-none-any.whl → 20.0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +71 -0
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +51 -24
- teradataml/analytics/json_parser/utils.py +11 -17
- teradataml/automl/__init__.py +103 -48
- teradataml/automl/data_preparation.py +55 -37
- teradataml/automl/data_transformation.py +131 -69
- teradataml/automl/feature_engineering.py +117 -185
- teradataml/automl/feature_exploration.py +9 -2
- teradataml/automl/model_evaluation.py +13 -25
- teradataml/automl/model_training.py +214 -75
- teradataml/catalog/model_cataloging_utils.py +1 -1
- teradataml/clients/auth_client.py +133 -0
- teradataml/common/aed_utils.py +3 -2
- teradataml/common/constants.py +11 -6
- teradataml/common/garbagecollector.py +5 -0
- teradataml/common/messagecodes.py +3 -1
- teradataml/common/messages.py +2 -1
- teradataml/common/utils.py +6 -0
- teradataml/context/context.py +49 -29
- teradataml/data/advertising.csv +201 -0
- teradataml/data/bank_marketing.csv +11163 -0
- teradataml/data/bike_sharing.csv +732 -0
- teradataml/data/boston2cols.csv +721 -0
- teradataml/data/breast_cancer.csv +570 -0
- teradataml/data/customer_segmentation_test.csv +2628 -0
- teradataml/data/customer_segmentation_train.csv +8069 -0
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py +3 -1
- teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py +5 -1
- teradataml/data/docs/sqle/docs_17_20/ANOVA.py +61 -1
- teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/FTest.py +105 -26
- teradataml/data/docs/sqle/docs_17_20/GLM.py +162 -1
- teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py +5 -3
- teradataml/data/docs/sqle/docs_17_20/KMeans.py +48 -1
- teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py +5 -0
- teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py +6 -0
- teradataml/data/docs/sqle/docs_17_20/ROC.py +3 -2
- teradataml/data/docs/sqle/docs_17_20/SVMPredict.py +13 -2
- teradataml/data/docs/sqle/docs_17_20/ScaleFit.py +119 -1
- teradataml/data/docs/sqle/docs_17_20/ScaleTransform.py +93 -1
- teradataml/data/docs/sqle/docs_17_20/TDGLMPredict.py +163 -1
- teradataml/data/docs/sqle/docs_17_20/XGBoost.py +12 -4
- teradataml/data/docs/sqle/docs_17_20/XGBoostPredict.py +7 -1
- teradataml/data/docs/sqle/docs_17_20/ZTest.py +72 -7
- teradataml/data/glm_example.json +28 -1
- teradataml/data/housing_train_segment.csv +201 -0
- teradataml/data/insect2Cols.csv +61 -0
- teradataml/data/jsons/sqle/17.20/TD_ANOVA.json +99 -27
- teradataml/data/jsons/sqle/17.20/TD_FTest.json +166 -83
- teradataml/data/jsons/sqle/17.20/TD_GLM.json +90 -14
- teradataml/data/jsons/sqle/17.20/TD_GLMPREDICT.json +48 -5
- teradataml/data/jsons/sqle/17.20/TD_GetFutileColumns.json +5 -3
- teradataml/data/jsons/sqle/17.20/TD_KMeans.json +31 -11
- teradataml/data/jsons/sqle/17.20/TD_NonLinearCombineFit.json +3 -2
- teradataml/data/jsons/sqle/17.20/TD_ROC.json +2 -1
- teradataml/data/jsons/sqle/17.20/TD_SVM.json +16 -16
- teradataml/data/jsons/sqle/17.20/TD_SVMPredict.json +19 -1
- teradataml/data/jsons/sqle/17.20/TD_ScaleFit.json +168 -15
- teradataml/data/jsons/sqle/17.20/TD_ScaleTransform.json +50 -1
- teradataml/data/jsons/sqle/17.20/TD_XGBoost.json +25 -7
- teradataml/data/jsons/sqle/17.20/TD_XGBoostPredict.json +17 -4
- teradataml/data/jsons/sqle/17.20/TD_ZTest.json +157 -80
- teradataml/data/kmeans_example.json +5 -0
- teradataml/data/kmeans_table.csv +10 -0
- teradataml/data/onehot_encoder_train.csv +4 -0
- teradataml/data/openml_example.json +29 -0
- teradataml/data/scale_attributes.csv +3 -0
- teradataml/data/scale_example.json +52 -1
- teradataml/data/scale_input_part_sparse.csv +31 -0
- teradataml/data/scale_input_partitioned.csv +16 -0
- teradataml/data/scale_input_sparse.csv +11 -0
- teradataml/data/scale_parameters.csv +3 -0
- teradataml/data/scripts/deploy_script.py +20 -1
- teradataml/data/scripts/sklearn/sklearn_fit.py +23 -27
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +20 -28
- teradataml/data/scripts/sklearn/sklearn_function.template +13 -18
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +23 -33
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +18 -27
- teradataml/data/scripts/sklearn/sklearn_score.py +20 -29
- teradataml/data/scripts/sklearn/sklearn_transform.py +30 -38
- teradataml/data/teradataml_example.json +77 -0
- teradataml/data/ztest_example.json +16 -0
- teradataml/dataframe/copy_to.py +8 -3
- teradataml/dataframe/data_transfer.py +120 -61
- teradataml/dataframe/dataframe.py +102 -17
- teradataml/dataframe/dataframe_utils.py +47 -9
- teradataml/dataframe/fastload.py +272 -89
- teradataml/dataframe/sql.py +84 -0
- teradataml/dbutils/dbutils.py +2 -2
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/sklearn/_sklearn_wrapper.py +102 -55
- teradataml/options/__init__.py +13 -4
- teradataml/options/configure.py +27 -6
- teradataml/scriptmgmt/UserEnv.py +19 -16
- teradataml/scriptmgmt/lls_utils.py +117 -14
- teradataml/table_operators/Script.py +2 -3
- teradataml/table_operators/TableOperator.py +58 -10
- teradataml/utils/validators.py +40 -2
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/METADATA +78 -6
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/RECORD +108 -90
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.0.dist-info → teradataml-20.0.0.1.dist-info}/zip-safe +0 -0
--- a/teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py
+++ b/teradataml/data/docs/sqle/docs_17_10/OneHotEncodingFit.py
@@ -7,7 +7,9 @@ def OneHotEncodingFit(data=None, is_input_dense=None, target_column=None, catego
     Such as, target attributes and their categorical values to be encoded and other parameters.
     Output of OneHotEncodingFit() function is used by OneHotEncodingTransform() function for encoding
     the input data. It supports inputs in both sparse and dense format.
-
+    Note:
+        * For input to be considered as sparse input, column names must be provided for
+          'data_partition_column' argument.
 
     PARAMETERS:
     data:
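In practice, the new sparse-input note means the generic "data_partition_column" argument must name the partitioning column(s) explicitly. A minimal hypothetical sketch with placeholder table and column names (other required fit arguments omitted for brevity):

    from teradataml import DataFrame, OneHotEncodingFit

    sparse_df = DataFrame.from_table("my_sparse_table")        # placeholder table
    fit = OneHotEncodingFit(data=sparse_df,
                            is_input_dense=False,              # sparse layout
                            target_column="attr_col",          # placeholder column
                            data_partition_column="attr_col")  # named explicitly, per the note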
--- a/teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py
+++ b/teradataml/data/docs/sqle/docs_17_10/OneHotEncodingTransform.py
@@ -3,6 +3,12 @@ def OneHotEncodingTransform(data=None, object=None, is_input_dense=None, **gener
     DESCRIPTION:
     Function encodes specified attributes and categorical values as one-hot numeric vectors,
     using OneHotEncodingFit() function output.
+    Notes:
+        * In case of sparse input, neither 'data_partition_column' nor
+          'object_partition_column' can be used independently.
+        * In case of dense input, if 'data_partition_column' is having value
+          PartitionKind.ANY, then 'object_partition_column' should have value
+          PartitionKind.DIMENSION.
 
 
     PARAMETERS:
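The sparse rule says the two partition arguments travel together; the dense rule pins their values. A hypothetical sketch of the dense pairing, with placeholder names and an assumed import path for PartitionKind (the path is not shown in this diff):

    from teradataml import OneHotEncodingTransform
    from teradataml.common.constants import PartitionKind  # assumed import path

    out = OneHotEncodingTransform(data=dense_df,            # dense-format teradataml DataFrame
                                  object=fit.result,        # OneHotEncodingFit() output (attribute name assumed)
                                  is_input_dense=True,
                                  data_partition_column=PartitionKind.ANY,
                                  object_partition_column=PartitionKind.DIMENSION)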
--- a/teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py
+++ b/teradataml/data/docs/sqle/docs_17_10/OutlierFilterTransform.py
@@ -5,7 +5,11 @@ def OutlierFilterTransform(data=None, object=None, **generic_arguments):
     OutlierFilterTransform() uses the result DataFrame from OutlierFilterFit() function to get
     statistics like median, count of rows, lower percentile and upper percentile for every column
     specified in target columns argument and filters the outliers in the input data.
-
+    Notes:
+        * Partitioning of input data and model is allowed using 'data_partition_column' and
+          'object_partition_column' only if 'group_columns' are passed while creating model
+          using OutlierFilterFit() function.
+        * Neither 'data_partition_column' nor 'object_partition_column' can be used independently.
 
     PARAMETERS:
     data:
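A hypothetical sketch of the constraint above, with placeholder names: the fit must be created with "group_columns" before either partition argument is usable, and the two must then be supplied together:

    from teradataml import OutlierFilterFit, OutlierFilterTransform

    fit = OutlierFilterFit(data=input_df,
                           target_columns="col_a",           # placeholder
                           group_columns="grp")              # enables partitioning later
    out = OutlierFilterTransform(data=input_df,
                                 object=fit.result,          # fit output (attribute name assumed)
                                 data_partition_column="grp",    # used together,
                                 object_partition_column="grp")  # never independently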
--- a/teradataml/data/docs/sqle/docs_17_20/ANOVA.py
+++ b/teradataml/data/docs/sqle/docs_17_20/ANOVA.py
@@ -1,4 +1,6 @@
-def ANOVA(data=None, group_columns=None, alpha=0.05, **generic_arguments):
+def ANOVA(data=None, group_columns=None, alpha=0.05, group_name_column=None,
+          group_value_column=None, group_names=None, num_groups=None,
+          **generic_arguments):
     """
     DESCRIPTION:
     The ANOVA() function performs one-way ANOVA (Analysis of Variance) on
@@ -37,6 +39,42 @@ def ANOVA(data=None, group_columns=None, alpha=0.05, **generic_arguments):
             Default Value: 0.05
             Types: float
 
+        group_name_column:
+            Optional Argument.
+            Specifies the column name in "data" containing the names of the groups
+            included in the computation.
+            Note:
+                * This argument is used when data contains group names in a column
+                  and group values in another column.
+                * This argument must be used in conjunction with "group_value_column".
+            Types: str
+
+        group_value_column:
+            Optional Argument.
+            Specifies the column name in "data" containing the values for each group member.
+            Note:
+                * This argument is used when data contains group values in a column
+                  and group names in another column.
+                * This argument must be used in conjunction with "group_name_column".
+            Types: str
+
+        group_names:
+            Optional Argument.
+            Specifies the names of the groups included in the computation.
+            Note:
+                * This argument is used when data contains group values in a column
+                  and group names in another column.
+            Types: list of Strings (str)
+
+        num_groups:
+            Optional Argument.
+            Specifies the number of different groups in the "data" included
+            in the computation.
+            Note:
+                * This argument is used when data contains group values in a column
+                  and group names in another column.
+            Types: int
+
         **generic_arguments:
             Specifies the generic keyword arguments SQLE functions accept. Below
             are the generic keyword arguments:
@@ -97,9 +135,11 @@ def ANOVA(data=None, group_columns=None, alpha=0.05, **generic_arguments):
 
         # Load the example data.
         load_example_data("teradataml", ["insect_sprays"])
+        load_example_data("ztest", 'insect2Cols')
 
         # Create teradataml DataFrame objects.
         insect_sprays = DataFrame.from_table("insect_sprays")
+        insect_gp = DataFrame.from_table("insect2Cols")
 
         # Check the list of available analytic functions.
         display_analytic_functions()
@@ -123,4 +163,24 @@ def ANOVA(data=None, group_columns=None, alpha=0.05, **generic_arguments):
         # Print the result DataFrame.
         print(ANOVA_out_2.result)
 
+        # Example 3 : Perform one-way anova analysis on a data set with more
+        #             than two groups and group_name_column, group_value_column,
+        #             group_names.
+        ANOVA_out_3 = ANOVA(data = insect_gp,
+                            group_name_column='groupName',
+                            group_value_column='groupValue',
+                            group_names=['groupA', 'groupB', 'groupC'])
+
+        # Print the result DataFrame.
+        print(ANOVA_out_3.result)
+
+        # Example 4 : Perform one-way anova analysis on a data set with more
+        #             than two groups and num_groups.
+        ANOVA_out_4 = ANOVA(data = insect_gp,
+                            group_name_column='groupName',
+                            group_value_column='groupValue',
+                            num_groups=6)
+
+        # Print the result DataFrame.
+        print(ANOVA_out_4.result)
     """
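For orientation (not shown in the diff): the four new arguments assume a "long" layout like the bundled insect2Cols data, one row per observation. A made-up illustration of that shape:

    groupName    groupValue
    groupA             10.2
    groupA              9.8
    groupB             14.1

"group_names" selects which distinct values of "group_name_column" enter the computation, while "num_groups" instead states how many distinct groups the data holds.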
--- a/teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py
+++ b/teradataml/data/docs/sqle/docs_17_20/ColumnTransformer.py
@@ -22,6 +22,8 @@ def ColumnTransformer(input_data = None, bincode_fit_data = None, function_fit_d
     User must create the FIT dataframe before using the function and must be provided in the same order
     as in the training data sequence to transform the dataset. The FIT dataframe can have maximum of
     128 columns.
+    Note:
+        * ColumnTransformer() function works only with python 3.6 and above.
 
 
     PARAMETERS:
--- a/teradataml/data/docs/sqle/docs_17_20/FTest.py
+++ b/teradataml/data/docs/sqle/docs_17_20/FTest.py
@@ -1,12 +1,8 @@
-def FTest(data = None, alpha = None,
-
-
-
-
-          second_sample_column=None,
-          df2=2,
-          alternate_hypothesis='two-tailed',
-          **generic_arguments):
+def FTest(data = None, alpha = 0.05, first_sample_variance=None,
+          first_sample_column=None, df1=None, second_sample_variance=None,
+          second_sample_column=None, df2=2, alternate_hypothesis='two-tailed',
+          sample_name_column=None, sample_value_column=None, first_sample_name=None,
+          second_sample_name=None, **generic_arguments):
     """
     DESCRIPTION:
     The FTest() function performs an F-test, for which the test statistic follows an
@@ -27,47 +23,72 @@ def FTest(data = None, alpha = None,
 
         alpha:
             Optional Argument.
-            Specifies the probability of rejecting the null
-
-
+            Specifies the probability of rejecting the null
+            hypothesis when the null hypothesis is true.
+            Note:
+                * "alpha" must be a numeric value in the range [0, 1].
             Default Value: 0.05
             Types: float
 
         first_sample_column:
-
-            Specifies the
-
+            Optional Argument.
+            Specifies the first sample column in F-Test.
+            Note:
+                * This argument must be specified with "first_sample_variance" and "df1"
+                  or allowed combination is "first_sample_column" with
+                  "second_sample_variance" and "df2".
+                * This argument cannot be used in conjunction with "sample_name_column"
+                  and "sample_value_column".
             Types: str
 
         first_sample_variance:
-
-            Specifies the
+            Optional Argument.
+            Specifies the first sample variance.
+            Note:
+                * This argument must be specified with "first_sample_column" and "df1"
+                  or other allowed combination is "second_sample_column" with
+                  "first_sample_variance" and "df1".
             Types: float
 
         df1:
-
+            Optional Argument.
             Specifies the degrees of freedom of the first sample.
+            Note:
+                * This argument must be specified with "first_sample_column" and
+                  "first_sample_variance".
             Types: integer
 
         second_sample_column:
-
-            Specifies the
-
+            Optional Argument.
+            Specifies the second sample column in F-Test.
+            Note:
+                * This argument must be specified with "second_sample_variance" and "df2"
+                  or allowed combination is "second_sample_column" with "first_sample_variance"
+                  and "df1".
+                * This argument cannot be used in conjunction with "sample_name_column"
+                  and "sample_value_column".
             Types: str
 
         second_sample_variance:
-
-            Specifies the
+            Optional Argument.
+            Specifies the second sample variance.
+            Note:
+                * This argument must be specified with "second_sample_column" and "df2"
+                  or allowed combination is "first_sample_column" with
+                  "second_sample_variance" and df2.
             Types: float
 
         df2:
-
-            Specifies the
+            Optional Argument.
+            Specifies the degree of freedom of the second sample.
+            Note:
+                * This argument must be specified with "second_sample_column" and
+                  "second_sample_variance".
             Types: integer
 
         alternate_hypothesis:
             Optional Argument.
-            Specifies the
+            Specifies the alternate hypothesis.
             Permitted Values:
                 * lower-tailed - Alternate hypothesis (H 1): μ < μ0.
                 * upper-tailed - Alternate hypothesis (H 1): μ > μ0.
@@ -79,6 +100,27 @@ def FTest(data = None, alpha = None,
             Default Value: two-tailed
             Types: str
 
+        sample_name_column:
+            Optional Argument.
+            Specifies the column name in "data" containing the names of the samples
+            included in the F-Test.
+            Types: str
+
+        sample_value_column:
+            Optional Argument.
+            Specifies the column name in "data" containing the values for each sample member.
+            Types: str
+
+        first_sample_name:
+            Optional Argument.
+            Specifies the name of the first sample included in the F-Test.
+            Types: str
+
+        second_sample_name:
+            Optional Argument.
+            Specifies the name of the second sample included in the F-Test.
+            Types: str
+
         **generic_arguments:
             Specifies the generic keyword arguments SQLE functions accept.
             Below are the generic keyword arguments:
@@ -129,9 +171,11 @@ def FTest(data = None, alpha = None,
 
         # Load the example data.
         load_example_data("teradataml", "titanic")
+        load_example_data("ztest", 'insect2Cols')
 
         # Create teradataml DataFrame object.
         titanic_data = DataFrame.from_table("titanic")
+        insect_gp = DataFrame.from_table("insect2Cols")
 
         # Check the list of available analytic functions.
         display_analytic_functions()
@@ -158,4 +202,39 @@ def FTest(data = None, alpha = None,
 
         # Print the result DataFrame.
         print(obj.result)
+
+        # Example 3: Run FTest() with sample_name_column, sample_value_column,
+        #            first_sample_name and second_sample_name.
+        obj = FTest(data=insect_gp,
+                    sample_value_column='groupValue',
+                    sample_name_column='groupName',
+                    first_sample_name='groupE',
+                    second_sample_name='groupC')
+
+        # Print the result DataFrame.
+        print(obj.result)
+
+        # Example 4: Run FTest() with sample_name_column, sample_value_column,
+        #            first_sample_name and second_sample_name.
+        obj = FTest(data=insect_gp,
+                    sample_value_column='groupValue',
+                    sample_name_column='groupName',
+                    first_sample_name='groupE',
+                    second_sample_variance=100.0,
+                    df2=25)
+
+        # Print the result DataFrame.
+        print(obj.result)
+
+        # Example 5: Run FTest() with sample_name_column, sample_value_column,
+        #            second_sample_name and first_sample_variance.
+        obj = FTest(data=insect_gp,
+                    sample_value_column='groupValue',
+                    sample_name_column='groupName',
+                    second_sample_name='groupC',
+                    first_sample_variance=85.0,
+                    df1=19)
+
+        # Print the result DataFrame.
+        print(obj.result)
     """
--- a/teradataml/data/docs/sqle/docs_17_20/GLM.py
+++ b/teradataml/data/docs/sqle/docs_17_20/GLM.py
@@ -2,7 +2,9 @@ def GLM(formula=None, data=None, input_columns=None, response_column=None, famil
         iter_max=300, batch_size=10, lambda1=0.02, alpha=0.15,
         iter_num_no_change=50, tolerance=0.001, intercept=True, class_weights="0:1.0, 1:1.0",
         learning_rate=None, initial_eta=0.05, decay_rate=0.25, decay_steps=5, momentum=0.0,
-        nesterov=True, local_sgd_iterations=0,
+        nesterov=True, local_sgd_iterations=0, stepwise_direction=None, max_steps_num=5,
+        initial_stepwise_columns=None, attribute_data=None, parameter_data=None, iteration_mode="BATCH",
+        partition_column=None, **generic_arguments):
     """
     DESCRIPTION:
     The generalized linear model (GLM) function performs regression and classification
@@ -252,6 +254,55 @@ def GLM(formula=None, data=None, input_columns=None, response_column=None, famil
             Default Value: 0
             Types: int
 
+        stepwise_direction:
+            Optional Argument.
+            Specify the type of stepwise algorithm to be used.
+            Permitted Values: 'FORWARD', 'BACKWARD', 'BOTH', 'BIDIRECTIONAL'
+            Types: str
+
+        max_steps_num:
+            Optional Argument.
+            Specifies the maximum number of steps to be used for the Stepwise Algorithm.
+            Note:
+                * The "max_steps_num" must be in the range [1, 2147483647].
+            Default Value: 5
+            Types: int
+
+        attribute_data:
+            Optional Argument.
+            Specifies the teradataml DataFrame containing the attribute data.
+            Note:
+                * This is valid when "data_partition_column" argument is used.
+            Types: teradataml DataFrame
+
+        parameter_data:
+            Optional Argument.
+            Specifies the teradataml DataFrame containing the parameter data.
+            Note:
+                * This is valid when "data_partition_column" argument is used.
+            Types: teradataml DataFrame
+
+        iteration_mode:
+            Optional Argument.
+            Specifies the iteration mode.
+            Note:
+                * This is valid when "data_partition_column" argument is used.
+            Permitted Values: 'BATCH', 'EPOCH'
+            Default Value: 'BATCH'
+            Types: str
+
+        partition_column:
+            Optional Argument.
+            Specifies the column names of "data" on which to partition the input.
+            The name should be consistent with the "data_partition_column".
+            Note:
+                * If the "data_partition_column" is unicode with foreign language characters,
+                  it is necessary to specify "partition_column" argument.
+                * Column range is not supported for "partition_column" argument.
+                * This is valid when "data_partition_column" argument is used.
+            Types: str
+
+
         **generic_arguments:
             Specifies the generic keyword arguments SQLE functions accept. Below
             are the generic keyword arguments:
@@ -377,4 +428,114 @@ def GLM(formula=None, data=None, input_columns=None, response_column=None, famil
         # Print the result DataFrame.
         print(GLM_out_2.result)
         print(GLM_out_2.output_data)
+
+        # Example 3 : Generate generalized linear model(GLM) using stepwise regression algorithm.
+        #             This example uses the boston dataset and scales the data.
+        #             Scaled data is used as input data to generate the GLM model.
+        # loading the example data
+        load_example_data("decisionforest", ["boston"])
+        load_example_data('glm', ['housing_train_segment', 'housing_train_parameter', 'housing_train_attribute'])
+
+        # Create teradataml DataFrame objects.
+        boston_df = DataFrame('boston')
+        housing_seg = DataFrame('housing_train_segment')
+        housing_parameter = DataFrame('housing_train_parameter')
+        housing_attribute = DataFrame('housing_train_attribute')
+
+        # Scaling the data
+        # Scale "target_columns" with respect to 'STD' value of the column.
+        fit_obj = ScaleFit(data=boston_df,
+                           target_columns=['crim','zn','indus','chas','nox','rm','age','dis','rad','tax','ptratio','black','lstat',],
+                           scale_method="STD")
+
+        # Scale values specified in the input data using the fit data generated by the ScaleFit() function above.
+        obj = ScaleTransform(object=fit_obj.output,
+                             data=boston_df,
+                             accumulate=["id","medv"])
+
+        boston = obj.result
+
+        # Generate generalized linear model(GLM) using stepwise regression algorithm.
+        glm_1 = GLM(data=boston,
+                    input_columns=['indus','chas','nox','rm'],
+                    response_column='medv',
+                    family='GAUSSIAN',
+                    lambda1=0.02,
+                    alpha=0.33,
+                    batch_size=10,
+                    learning_rate='optimal',
+                    iter_max=36,
+                    iter_num_no_change=100,
+                    tolerance=0.0001,
+                    initial_eta=0.02,
+                    stepwise_direction='backward',
+                    max_steps_num=10)
+
+        # Print the result DataFrame.
+        print(glm_1.result)
+
+        # Example 4 : Generate generalized linear model(GLM) using
+        #             stepwise regression algorithm with initial_stepwise_columns.
+        glm_2 = GLM(data=boston,
+                    input_columns=['crim','zn','indus','chas','nox','rm','age','dis','rad','tax','ptratio','black','lstat'],
+                    response_column='medv',
+                    family='GAUSSIAN',
+                    lambda1=0.02,
+                    alpha=0.33,
+                    batch_size=10,
+                    learning_rate='optimal',
+                    iter_max=36,
+                    iter_num_no_change=100,
+                    tolerance=0.0001,
+                    initial_eta=0.02,
+                    stepwise_direction='bidirectional',
+                    max_steps_num=10,
+                    initial_stepwise_columns=['rad','tax']
+                    )
+
+        # Print the result DataFrame.
+        print(glm_2.result)
+
+        # Example 5 : Generate generalized linear model(GLM) using partition by key.
+        glm_3 = GLM(data=housing_seg,
+                    input_columns=['bedrooms', 'bathrms', 'stories', 'driveway', 'recroom', 'fullbase', 'gashw', 'airco'],
+                    response_column='price',
+                    family='GAUSSIAN',
+                    batch_size=10,
+                    iter_max=1000,
+                    data_partition_column='partition_id'
+                    )
+
+        # Print the result DataFrame.
+        print(glm_3.result)
+
+        # Example 6 : Generate generalized linear model(GLM) using partition by key with attribute data.
+        glm_4 = GLM(data=housing_seg,
+                    input_columns=['bedrooms', 'bathrms', 'stories', 'driveway', 'recroom', 'fullbase', 'gashw', 'airco'],
+                    response_column='price',
+                    family='GAUSSIAN',
+                    batch_size=10,
+                    iter_max=1000,
+                    data_partition_column='partition_id',
+                    attribute_data = housing_attribute,
+                    attribute_data_partition_column = 'partition_id'
+                    )
+
+        # Print the result DataFrame.
+        print(glm_4.result)
+
+        # Example 7 : Generate generalized linear model(GLM) using partition by key with parameter data
+        glm_5 = GLM(data=housing_seg,
+                    input_columns=['bedrooms', 'bathrms', 'stories', 'driveway', 'recroom', 'fullbase', 'gashw', 'airco'],
+                    response_column='homestyle',
+                    family='binomial',
+                    iter_max=1000,
+                    data_partition_column='partition_id',
+                    parameter_data = housing_parameter,
+                    parameter_data_partition_column = 'partition_id'
+                    )
+
+        # Print the result DataFrame.
+        print(glm_5.result)
+
     """
--- a/teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py
+++ b/teradataml/data/docs/sqle/docs_17_20/GetFutileColumns.py
@@ -1,4 +1,4 @@
-def GetFutileColumns(data=None, object=None, category_summary_column=
+def GetFutileColumns(data=None, object=None, category_summary_column='ColumnName', threshold_value=0.95, **generic_arguments):
     """
     DESCRIPTION:
     GetFutileColumns() function returns the futile column names if either
@@ -31,14 +31,16 @@ def GetFutileColumns(data=None, object=None, category_summary_column=None, thres
             Types: teradataml DataFrame or CategoricalSummary
 
         category_summary_column:
-
+            Optional Argument.
             Specifies the column from categorical summary DataFrame which provides names of
             the columns in "data".
+            Default Value: 'ColumnName'
             Types: str
 
         threshold_value:
-
+            Optional Argument.
             Specifies the threshold value for the columns in "data".
+            Default Value: 0.95
             Types: float
 
         **generic_arguments:
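With both defaults in place, a minimal call reduces to the data plus the CategoricalSummary output. A hypothetical sketch with placeholder inputs:

    from teradataml import CategoricalSummary, GetFutileColumns

    cat_summary = CategoricalSummary(data=input_df,
                                     target_columns=["col_a", "col_b"])  # placeholders
    futile = GetFutileColumns(data=input_df, object=cat_summary)
    # same as category_summary_column='ColumnName', threshold_value=0.95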
--- a/teradataml/data/docs/sqle/docs_17_20/KMeans.py
+++ b/teradataml/data/docs/sqle/docs_17_20/KMeans.py
@@ -1,6 +1,6 @@
 def KMeans(data=None, centroids_data=None, id_column=None, target_columns=None, num_clusters=None,
            seed=None, threshold=0.0395, iter_max=10, num_init=1, output_cluster_assignment=False,
-           **generic_arguments):
+           initialcentroids_method="RANDOM", **generic_arguments):
     """
     DESCRIPTION:
     The K-means() function groups a set of observations into k clusters
@@ -45,6 +45,10 @@ def KMeans(data=None, centroids_data=None, id_column=None, target_columns=None,
             Optional Argument.
             Specifies the input teradataml DataFrame containing
             set of initial centroids.
+            Note:
+                * This argument is not required if "num_clusters" provided.
+                * If provided, the function uses the initial centroids
+                  from this input.
             Types: teradataml DataFrame
 
         id_column:
@@ -105,6 +109,15 @@ def KMeans(data=None, centroids_data=None, id_column=None, target_columns=None,
             Specifies whether to output Cluster Assignment information.
             Default Value: False
             Types: bool
+
+        initialcentroids_method:
+            Optional Argument.
+            Specifies the initialization method to be used for selecting initial set of centroids.
+            Permitted Values: 'RANDOM', 'KMEANS++'
+            Default Value: 'RANDOM'
+            Note:
+                * This argument is not required if "centroids_data" is provided.
+            Types: str
 
         **generic_arguments:
             Specifies the generic keyword arguments SQLE functions accept. Below
@@ -167,9 +180,11 @@ def KMeans(data=None, centroids_data=None, id_column=None, target_columns=None,
 
         # Load the example data.
         load_example_data("kmeans", "computers_train1")
+        load_example_data("kmeans",'kmeans_table')
 
         # Create teradataml DataFrame objects.
         computers_train1 = DataFrame.from_table("computers_train1")
+        kmeans_tab = DataFrame('kmeans_table')
 
         # Check the list of available analytic functions.
         display_analytic_functions()
@@ -191,6 +206,7 @@ def KMeans(data=None, centroids_data=None, id_column=None, target_columns=None,
         # Get the set of initial centroids by accessing the group of rows
         # from input data.
         kmeans_initial_centroids_table = computers_train1.loc[[19, 97]]
+        kmeans_initial_centroids = kmeans_tab.loc[[2, 4]]
 
         KMeans_out_1 = KMeans(id_column="id",
                               target_columns=['price', 'speed'],
@@ -201,4 +217,35 @@ def KMeans(data=None, centroids_data=None, id_column=None, target_columns=None,
         print(KMeans_out_1.result)
         print(KMeans_out_1.model_data)
 
+        # Example 3 : Grouping a set of observations into 2 clusters by
+        #             specifying the number of clusters and seed value
+        #             with output cluster assignment information.
+        obj = KMeans(data=kmeans_tab,
+                     id_column='id',
+                     target_columns=['c1', 'c2'],
+                     threshold=0.0395,
+                     iter_max=3,
+                     centroids_data=kmeans_initial_centroids,
+                     output_cluster_assignment=True
+                     )
+
+        # Print the result DataFrames.
+        print(obj.result)
+
+        # Example 4 : Grouping a set of observations into 3 clusters by
+        #             specifying the number of clusters for initial centroids
+        #             method as KMEANS++.
+        obj = KMeans(data=kmeans_tab,
+                     id_column='id',
+                     target_columns=['c1', 'c2'],
+                     threshold=0.0395,
+                     iter_max=3,
+                     num_clusters=3,
+                     output_cluster_assignment=True,
+                     initialcentroids_method="KMEANS++"
+                     )
+
+        # Print the result DataFrames.
+        print(obj.result)
+
     """
--- a/teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py
+++ b/teradataml/data/docs/sqle/docs_17_20/NonLinearCombineFit.py
@@ -1,5 +1,5 @@
 def NonLinearCombineFit(data=None, target_columns=None, formula=None,
-                        result_column=
+                        result_column='TD_CombinedValue', **generic_arguments):
     """
     DESCRIPTION:
     The NonLinearCombineFit() function returns the target columns and a
@@ -31,9 +31,10 @@ def NonLinearCombineFit(data=None, target_columns=None, formula=None,
             Types: str
 
         result_column:
-
+            Optional Argument.
             Specifies the name of the new feature column generated by the Transform function.
             This function saves the specified formula in this column.
+            Default Value: 'TD_CombinedValue'
             Types: str
 
         **generic_arguments:
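A hypothetical sketch of the new default, with placeholder columns and formula: omitting "result_column" now names the generated feature 'TD_CombinedValue'.

    from teradataml import NonLinearCombineFit

    fit = NonLinearCombineFit(data=input_df,
                              target_columns=["x1", "x2"],   # placeholders
                              formula="Y = (X0 + X1)")       # placeholder formula
    # same as passing result_column='TD_CombinedValue'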
--- a/teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py
+++ b/teradataml/data/docs/sqle/docs_17_20/OneHotEncodingFit.py
@@ -12,6 +12,11 @@ def OneHotEncodingFit(data=None, category_data=None, target_column=None,
         * This function requires the UTF8 client character set for UNICODE data.
         * This function does not support Pass Through Characters (PTCs).
         * This function does not support KanjiSJIS or Graphic data types.
+        * For input to be considered as sparse input, column names should be
+          provided for 'data_partition_column' argument.
+        * In case of dense input, only allowed value for 'data_partition_column'
+          is PartitionKind.ANY and that for 'category_data_partition_column' is
+          PartitionKind.DIMENSION.
 
     PARAMETERS:
     data:
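A hypothetical sketch of the dense-input rule above; names are placeholders and the PartitionKind import path is assumed, as in the earlier OneHotEncodingTransform sketch:

    from teradataml import OneHotEncodingFit
    from teradataml.common.constants import PartitionKind  # assumed import path

    fit = OneHotEncodingFit(data=dense_df,                  # dense-format DataFrame
                            category_data=categories_df,    # categories per target column
                            target_column="col_a",          # placeholder
                            data_partition_column=PartitionKind.ANY,
                            category_data_partition_column=PartitionKind.DIMENSION)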
--- a/teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py
+++ b/teradataml/data/docs/sqle/docs_17_20/OneHotEncodingTransform.py
@@ -3,6 +3,12 @@ def OneHotEncodingTransform(data=None, object=None, is_input_dense=None, **gener
     DESCRIPTION:
     Function encodes specified attributes and categorical values as one-hot numeric vectors,
     using OneHotEncodingFit() function output.
+    Notes:
+        * In case of sparse input, neither 'data_partition_column' nor
+          'object_partition_column' can be used independently.
+        * In case of dense input, if 'data_partition_column' is having value
+          PartitionKind.ANY, then 'object_partition_column' should have value
+          PartitionKind.DIMENSION.
 
 
     PARAMETERS: