teradataml 20.0.0.4__py3-none-any.whl → 20.0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic. Click here for more details.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +86 -13
- teradataml/__init__.py +2 -1
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +7 -12
- teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
- teradataml/analytics/sqle/__init__.py +16 -1
- teradataml/analytics/utils.py +15 -1
- teradataml/automl/__init__.py +290 -106
- teradataml/automl/autodataprep/__init__.py +471 -0
- teradataml/automl/data_preparation.py +29 -10
- teradataml/automl/data_transformation.py +11 -0
- teradataml/automl/feature_engineering.py +64 -4
- teradataml/automl/feature_exploration.py +639 -25
- teradataml/automl/model_training.py +1 -1
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/constants.py +61 -26
- teradataml/common/messagecodes.py +2 -1
- teradataml/common/messages.py +5 -4
- teradataml/common/utils.py +255 -37
- teradataml/context/context.py +225 -87
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
- teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +1 -1
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/teradataml_example.json +13 -0
- teradataml/data/textmorph_example.json +5 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/dataframe/copy_to.py +37 -26
- teradataml/dataframe/data_transfer.py +61 -45
- teradataml/dataframe/dataframe.py +130 -50
- teradataml/dataframe/dataframe_utils.py +15 -2
- teradataml/dataframe/functions.py +109 -9
- teradataml/dataframe/sql.py +328 -76
- teradataml/dbutils/dbutils.py +33 -13
- teradataml/dbutils/filemgr.py +14 -10
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/_base.py +6 -157
- teradataml/options/configure.py +4 -5
- teradataml/scriptmgmt/UserEnv.py +305 -38
- teradataml/scriptmgmt/lls_utils.py +376 -130
- teradataml/store/__init__.py +1 -1
- teradataml/table_operators/Apply.py +16 -1
- teradataml/table_operators/Script.py +20 -1
- teradataml/table_operators/table_operator_util.py +58 -9
- teradataml/utils/dtypes.py +2 -1
- teradataml/utils/internal_buffer.py +22 -2
- teradataml/utils/validators.py +313 -57
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +89 -14
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +107 -77
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.4.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
def Apriori(data=None, target_column=None, id_column=None, partition_columns=None,
|
|
2
|
+
max_len=2, delimiter=",", is_dense_input=False, patterns_or_rules=None,
|
|
3
|
+
support=0.01, **generic_arguments):
|
|
4
|
+
"""
|
|
5
|
+
DESCRIPTION:
|
|
6
|
+
The Apriori() function finds patterns and calculates different statistical metrics to
|
|
7
|
+
understand the influence of the occurrence of a set of items on others.
|
|
8
|
+
|
|
9
|
+
PARAMETERS:
|
|
10
|
+
data:
|
|
11
|
+
Required Argument.
|
|
12
|
+
Specifies the input teradataml DataFrame.
|
|
13
|
+
Types: teradataml DataFrame
|
|
14
|
+
|
|
15
|
+
target_column:
|
|
16
|
+
Required Argument.
|
|
17
|
+
Specifies the input teradataml DataFrame column which contains the data to filter.
|
|
18
|
+
Types: str
|
|
19
|
+
|
|
20
|
+
id_column:
|
|
21
|
+
Optional Argument.
|
|
22
|
+
Specifies the name of the column that uniquely groups the items that are purchased together.
|
|
23
|
+
Applicable only when `is_dense_input` is False.
|
|
24
|
+
Types: str
|
|
25
|
+
|
|
26
|
+
partition_columns:
|
|
27
|
+
Optional Argument.
|
|
28
|
+
Specifies the column name(s) in the "data" to partition the input.
|
|
29
|
+
Types: str or list of str
|
|
30
|
+
|
|
31
|
+
max_len:
|
|
32
|
+
Optional Argument.
|
|
33
|
+
Specifies the maximum number of items in the item set.
|
|
34
|
+
"max_len" must be greater than or equal to 1 and less than or equal to 20.
|
|
35
|
+
Default Value: 2
|
|
36
|
+
Types: int
|
|
37
|
+
|
|
38
|
+
delimiter:
|
|
39
|
+
Optional Argument, Required when "is_dense_input" is set to True.
|
|
40
|
+
Specifies a character or string that separates words in the input text.
|
|
41
|
+
Default Value: ","
|
|
42
|
+
Types: str
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
is_dense_input:
|
|
46
|
+
Optional Argument.
|
|
47
|
+
Specifies whether input data is in dense format or not.
|
|
48
|
+
When set to True, function considers the data is in dense format.
|
|
49
|
+
Otherwise function considers data is not in dense format.
|
|
50
|
+
Default Value: False
|
|
51
|
+
Types: bool
|
|
52
|
+
|
|
53
|
+
patterns_or_rules:
|
|
54
|
+
Optional Argument.
|
|
55
|
+
Specifies whether to emit PATTERNS or RULES as output.
|
|
56
|
+
Permitted Values: "PATTERNS", "RULES"
|
|
57
|
+
Types: str
|
|
58
|
+
|
|
59
|
+
support:
|
|
60
|
+
Optional Argument.
|
|
61
|
+
Specifies the support value (minimum occurrence threshold) of the itemset.
|
|
62
|
+
Default Value: 0.01
|
|
63
|
+
Types: float
|
|
64
|
+
|
|
65
|
+
**generic_arguments:
|
|
66
|
+
Optional Argument.
|
|
67
|
+
Specifies the generic keyword arguments SQLE functions accept. Below are the generic
|
|
68
|
+
keyword arguments:
|
|
69
|
+
persist:
|
|
70
|
+
Optional Argument.
|
|
71
|
+
Specifies whether to persist the results of the function in a table or not.
|
|
72
|
+
When set to True, results are persisted in a table; otherwise, results are
|
|
73
|
+
garbage collected at the end of the session.
|
|
74
|
+
Default Value: False
|
|
75
|
+
Types: bool
|
|
76
|
+
|
|
77
|
+
volatile:
|
|
78
|
+
Optional Argument.
|
|
79
|
+
Specifies whether to put the results of the function in a volatile table or not.
|
|
80
|
+
When set to True, results are stored in a volatile table; otherwise not.
|
|
81
|
+
Default Value: False
|
|
82
|
+
Types: bool
|
|
83
|
+
|
|
84
|
+
Function allows the user to partition, hash, order or local order the input
|
|
85
|
+
data. These generic arguments are available for each argument that accepts
|
|
86
|
+
teradataml DataFrame as input and can be accessed as:
|
|
87
|
+
* "<input_data_arg_name>_partition_column" accepts str or list of str (Strings)
|
|
88
|
+
* "<input_data_arg_name>_hash_column" accepts str or list of str (Strings)
|
|
89
|
+
* "<input_data_arg_name>_order_column" accepts str or list of str (Strings)
|
|
90
|
+
* "local_order_<input_data_arg_name>" accepts boolean
|
|
91
|
+
Note:
|
|
92
|
+
These generic arguments are supported by teradataml if the underlying SQLE Engine
|
|
93
|
+
function supports, else an exception is raised.
|
|
94
|
+
|
|
95
|
+
RETURNS:
|
|
96
|
+
Instance of Apriori.
|
|
97
|
+
Output teradataml DataFrames can be accessed using attribute references, such as AprioriObj.<attribute_name>.
|
|
98
|
+
Output teradataml DataFrame attribute name is:
|
|
99
|
+
result
|
|
100
|
+
|
|
101
|
+
RAISES:
|
|
102
|
+
TeradataMlException, TypeError, ValueError
|
|
103
|
+
|
|
104
|
+
EXAMPLES:
|
|
105
|
+
# Notes:
|
|
106
|
+
# 1. Get the connection to Vantage to execute the function.
|
|
107
|
+
# 2. One must import the required functions mentioned in the example from teradataml.
|
|
108
|
+
# 3. Function will raise an error if not supported on the Vantage system the user is connected to.
|
|
109
|
+
|
|
110
|
+
# Load the example data.
|
|
111
|
+
load_example_data("apriori", ["trans_dense","trans_sparse"])
|
|
112
|
+
|
|
113
|
+
# Create teradataml DataFrame objects.
|
|
114
|
+
dense_table = DataFrame.from_table("trans_dense")
|
|
115
|
+
sparse_table = DataFrame.from_table("trans_sparse")
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# Check the list of available analytic functions.
|
|
119
|
+
display_analytic_functions()
|
|
120
|
+
|
|
121
|
+
# Import function Apriori.
|
|
122
|
+
from teradataml import Apriori
|
|
123
|
+
|
|
124
|
+
# Example 1: Find patterns in the input data with DENSE DATA, PARTITION, RULES.
|
|
125
|
+
Apriori_out = Apriori(data=dense_table, target_column="item",
|
|
126
|
+
partition_columns=["location"], max_len=2,
|
|
127
|
+
patterns_or_rules="rules", support=0.01)
|
|
128
|
+
|
|
129
|
+
# Print the result DataFrame.
|
|
130
|
+
print(Apriori_out.result)
|
|
131
|
+
|
|
132
|
+
# Example 2: Find patterns in the input data with SPARSE DATA, NO PARTITIONS, PATTERNS.
|
|
133
|
+
Apriori_out = Apriori(data=sparse_table, target_column="item",
|
|
134
|
+
id_column="tranid", max_len=3)
|
|
135
|
+
|
|
136
|
+
# Print the result DataFrame.
|
|
137
|
+
print(Apriori_out.result)
|
|
138
|
+
"""
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
def NERExtractor(data=None, user_defined_data=None, rules_data=None, text_column=None,
|
|
2
|
+
input_language="EN", show_context=0, accumulate=None,
|
|
3
|
+
**generic_arguments):
|
|
4
|
+
"""
|
|
5
|
+
DESCRIPTION:
|
|
6
|
+
NERExtractor() performs Named Entity Recognition (NER) on input text
|
|
7
|
+
according to user-defined dictionary words or regular expression (regex) patterns.
|
|
8
|
+
|
|
9
|
+
PARAMETERS:
|
|
10
|
+
data:
|
|
11
|
+
Required Argument.
|
|
12
|
+
Specifies the input teradataml DataFrame.
|
|
13
|
+
Types: teradataml DataFrame
|
|
14
|
+
|
|
15
|
+
user_defined_data:
|
|
16
|
+
Required Argument.
|
|
17
|
+
Specifies the teradataml DataFrame which contains user defined words and the corresponding entity label.
|
|
18
|
+
Types: teradataml DataFrame
|
|
19
|
+
|
|
20
|
+
rules_data:
|
|
21
|
+
Required Argument.
|
|
22
|
+
Specifies the teradataml DataFrame which contains user-defined regex patterns and the corresponding entity label.
|
|
23
|
+
Types: teradataml DataFrame
|
|
24
|
+
|
|
25
|
+
text_column:
|
|
26
|
+
Required Argument.
|
|
27
|
+
Specifies the name of the teradataml DataFrame column that will be used for NER search.
|
|
28
|
+
Types: str
|
|
29
|
+
|
|
30
|
+
input_language:
|
|
31
|
+
Optional Argument.
|
|
32
|
+
Specifies the language of input text.
|
|
33
|
+
Default Value: "EN"
|
|
34
|
+
Types: str
|
|
35
|
+
|
|
36
|
+
show_context:
|
|
37
|
+
Optional Argument.
|
|
38
|
+
Specifies the number of words before and after the matched entity. If leading or trailing
|
|
39
|
+
words are less than "show_context", then an ellipsis (...) is added. Must be a positive value
|
|
40
|
+
less than 10.
|
|
41
|
+
Default Value: 0
|
|
42
|
+
Types: int
|
|
43
|
+
|
|
44
|
+
accumulate:
|
|
45
|
+
Optional Argument.
|
|
46
|
+
Specifies the name(s) of the input teradataml DataFrame column(s) to copy to the
|
|
47
|
+
output table.
|
|
48
|
+
Types: str or list of str
|
|
49
|
+
|
|
50
|
+
**generic_arguments:
|
|
51
|
+
Optional Argument.
|
|
52
|
+
Specifies the generic keyword arguments SQLE functions accept. Below are the generic
|
|
53
|
+
keyword arguments:
|
|
54
|
+
persist:
|
|
55
|
+
Optional Argument.
|
|
56
|
+
Specifies whether to persist the results of the function in a table or not.
|
|
57
|
+
When set to True, results are persisted in a table; otherwise, results are
|
|
58
|
+
garbage collected at the end of the session.
|
|
59
|
+
Default Value: False
|
|
60
|
+
Types: bool
|
|
61
|
+
|
|
62
|
+
volatile:
|
|
63
|
+
Optional Argument.
|
|
64
|
+
Specifies whether to put the results of the function in a volatile table or not.
|
|
65
|
+
When set to True, results are stored in a volatile table; otherwise not.
|
|
66
|
+
Default Value: False
|
|
67
|
+
Types: bool
|
|
68
|
+
|
|
69
|
+
Function allows the user to partition, hash, order or local order the input
|
|
70
|
+
data. These generic arguments are available for each argument that accepts
|
|
71
|
+
teradataml DataFrame as input and can be accessed as:
|
|
72
|
+
* "<input_data_arg_name>_partition_column" accepts str or list of str (Strings)
|
|
73
|
+
* "<input_data_arg_name>_hash_column" accepts str or list of str (Strings)
|
|
74
|
+
* "<input_data_arg_name>_order_column" accepts str or list of str (Strings)
|
|
75
|
+
* "local_order_<input_data_arg_name>" accepts boolean
|
|
76
|
+
Note:
|
|
77
|
+
These generic arguments are supported by teradataml if the underlying SQLE Engine
|
|
78
|
+
function supports, else an exception is raised.
|
|
79
|
+
|
|
80
|
+
RETURNS:
|
|
81
|
+
Instance of NERExtractor.
|
|
82
|
+
Output teradataml DataFrames can be accessed using attribute references, such as TDNERExtractorObj.<attribute_name>.
|
|
83
|
+
Output teradataml DataFrame attribute name is:
|
|
84
|
+
result
|
|
85
|
+
|
|
86
|
+
RAISES:
|
|
87
|
+
TeradataMlException, TypeError, ValueError
|
|
88
|
+
|
|
89
|
+
EXAMPLES:
|
|
90
|
+
# Notes:
|
|
91
|
+
# 1. Get the connection to Vantage to execute the function.
|
|
92
|
+
# 2. One must import the required functions mentioned in the example from teradataml.
|
|
93
|
+
# 3. Function will raise an error if not supported on the Vantage system the user is connected to.
|
|
94
|
+
|
|
95
|
+
# Load the example data.
|
|
96
|
+
load_example_data("tdnerextractor", ["ner_input_eng", "ner_dict", "ner_rule"])
|
|
97
|
+
|
|
98
|
+
# Create teradataml DataFrame objects.
|
|
99
|
+
df = DataFrame.from_table("ner_input_eng")
|
|
100
|
+
user_defined_words = DataFrame.from_table("ner_dict")
|
|
101
|
+
rules = DataFrame.from_table("ner_rule")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# Check the list of available analytic functions.
|
|
105
|
+
display_analytic_functions()
|
|
106
|
+
|
|
107
|
+
# Import function NERExtractor.
|
|
108
|
+
from teradataml import NERExtractor
|
|
109
|
+
|
|
110
|
+
# Example 1: Perform Named Entity Recognition (NER) using Rules and Dict with Accumulate.
|
|
111
|
+
NER_out = NERExtractor(data=df,
|
|
112
|
+
user_defined_data=user_defined_words,
|
|
113
|
+
rules_data=rules,
|
|
114
|
+
text_column=["txt"],
|
|
115
|
+
input_language="en",
|
|
116
|
+
show_context=3,
|
|
117
|
+
accumulate=["id"])
|
|
118
|
+
|
|
119
|
+
# Print the result DataFrame.
|
|
120
|
+
print(NER_out.result)
|
|
121
|
+
"""
|
|
@@ -33,7 +33,7 @@ def NGramSplitter(data=None, text_column=None, delimiter=" ", grams=None, overla
|
|
|
33
33
|
|
|
34
34
|
delimiter:
|
|
35
35
|
Optional Argument.
|
|
36
|
-
Specifies a character or string that separates words in the input text. The
|
|
36
|
+
Specifies a character or string or a regular expression that separates words in the input text. The
|
|
37
37
|
default value is the set of all whitespace characters which includes
|
|
38
38
|
the characters for space, tab, newline, carriage return and some
|
|
39
39
|
others.
|
|
@@ -66,14 +66,14 @@ def NGramSplitter(data=None, text_column=None, delimiter=" ", grams=None, overla
|
|
|
66
66
|
|
|
67
67
|
punctuation:
|
|
68
68
|
Optional Argument.
|
|
69
|
-
Specifies a string that specifies the punctuation characters for the function
|
|
69
|
+
Specifies a string or a regular expression that specifies the punctuation characters for the function
|
|
70
70
|
to remove before evaluating the input text.
|
|
71
71
|
Default Value: "`~#^&*()-"
|
|
72
72
|
Types: str
|
|
73
73
|
|
|
74
74
|
reset:
|
|
75
75
|
Optional Argument.
|
|
76
|
-
Specifies a string that specifies the character or string that ends a sentence.
|
|
76
|
+
Specifies a string or a regular expression that specifies the character or string that ends a sentence.
|
|
77
77
|
At the end of a sentence, the function discards any partial n-grams and searches
|
|
78
78
|
for the next n-gram at the beginning of the next sentence. An n-gram
|
|
79
79
|
cannot span two sentences.
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
def SMOTE(data = None, encoding_data = None, id_column = None,
|
|
2
|
+
response_column = None, input_columns = None, categorical_columns = None,
|
|
3
|
+
median_standard_deviation = None, minority_class = None,
|
|
4
|
+
oversampling_factor = 5, sampling_strategy = "smote",
|
|
5
|
+
fill_sampleid = True, noninput_columns_value = "sample", n_neighbors = 5,
|
|
6
|
+
seed = None, **generic_arguments):
|
|
7
|
+
"""
|
|
8
|
+
DESCRIPTION:
|
|
9
|
+
SMOTE() function generates data by oversampling a minority class using
|
|
10
|
+
smote, adasyn, borderline-2 or smote-nc algorithms.
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
PARAMETERS:
|
|
14
|
+
data:
|
|
15
|
+
Required Argument.
|
|
16
|
+
Specifies the input teradataml DataFrame.
|
|
17
|
+
Types: teradataml DataFrame
|
|
18
|
+
|
|
19
|
+
encoding_data:
|
|
20
|
+
Optional Argument, Required when "sampling_strategy" is set to 'smotenc' algorithm.
|
|
21
|
+
Specifies the teradataml dataframe containing the ordinal encoding information.
|
|
22
|
+
Types: teradataml DataFrame
|
|
23
|
+
|
|
24
|
+
id_column:
|
|
25
|
+
Required Argument.
|
|
26
|
+
Specifies the name of the column in "data" that
|
|
27
|
+
uniquely identifies a data sample.
|
|
28
|
+
Types: str
|
|
29
|
+
|
|
30
|
+
response_column:
|
|
31
|
+
Optional Argument.
|
|
32
|
+
Specifies the name of the column in "data" that contains the
|
|
33
|
+
numeric value to be used as the response value for a sample.
|
|
34
|
+
Types: str
|
|
35
|
+
|
|
36
|
+
input_columns:
|
|
37
|
+
Required Argument.
|
|
38
|
+
Specifies the name of the input columns in "data" for oversampling.
|
|
39
|
+
Types: str OR list of Strings (str)
|
|
40
|
+
|
|
41
|
+
categorical_columns:
|
|
42
|
+
Optional Argument, Required when "sampling_strategy" is set to 'smotenc' algorithm.
|
|
43
|
+
Specifies the name of the categorical columns in the "data" that
|
|
44
|
+
the function uses for oversampling with smotenc.
|
|
45
|
+
Types: str OR list of Strings (str)
|
|
46
|
+
|
|
47
|
+
median_standard_deviation:
|
|
48
|
+
Optional Argument, Required when "sampling_strategy" is set to 'smotenc' algorithm.
|
|
49
|
+
Specifies the median of the standard deviations computed over the
|
|
50
|
+
numerical input columns.
|
|
51
|
+
Types: float
|
|
52
|
+
|
|
53
|
+
minority_class:
|
|
54
|
+
Required Argument.
|
|
55
|
+
Specifies the minority class for which synthetic samples need to be
|
|
56
|
+
generated.
|
|
57
|
+
Note:
|
|
58
|
+
* The label for minority class under response column must be numeric integer.
|
|
59
|
+
Types: str
|
|
60
|
+
|
|
61
|
+
oversampling_factor:
|
|
62
|
+
Optional Argument.
|
|
63
|
+
Specifies the factor for oversampling the minority class.
|
|
64
|
+
Default Value: 5
|
|
65
|
+
Types: float
|
|
66
|
+
|
|
67
|
+
sampling_strategy:
|
|
68
|
+
Optional Argument.
|
|
69
|
+
Specifies the oversampling algorithm to be used to create synthetic samples.
|
|
70
|
+
Default Value: "smote"
|
|
71
|
+
Permitted Values: "smote", "adasyn", "borderline", "smotenc"
|
|
72
|
+
Types: str
|
|
73
|
+
|
|
74
|
+
fill_sampleid:
|
|
75
|
+
Optional Argument.
|
|
76
|
+
Specifies whether to include the id of the original observation used
|
|
77
|
+
to generate each synthetic observation.
|
|
78
|
+
Default Value: True
|
|
79
|
+
Types: bool
|
|
80
|
+
|
|
81
|
+
noninput_columns_value:
|
|
82
|
+
Optional Argument.
|
|
83
|
+
Specifies the value to put in a sample column for columns not
|
|
84
|
+
specified as input columns.
|
|
85
|
+
Default Value: "sample"
|
|
86
|
+
Permitted Values: "sample", "neighbor", "null"
|
|
87
|
+
Types: str
|
|
88
|
+
|
|
89
|
+
n_neighbors:
|
|
90
|
+
Optional Argument.
|
|
91
|
+
Specifies the number of nearest neighbors for choosing the sample to
|
|
92
|
+
be used in oversampling.
|
|
93
|
+
Default Value: 5
|
|
94
|
+
Types: int
|
|
95
|
+
|
|
96
|
+
seed:
|
|
97
|
+
Optional Argument.
|
|
98
|
+
Specifies the random seed the algorithm uses for repeatable results.
|
|
99
|
+
The function uses the seed for random interpolation to generate the
|
|
100
|
+
synthetic sample.
|
|
101
|
+
Types: int
|
|
102
|
+
|
|
103
|
+
**generic_arguments:
|
|
104
|
+
Specifies the generic keyword arguments SQLE functions accept. Below
|
|
105
|
+
are the generic keyword arguments:
|
|
106
|
+
persist:
|
|
107
|
+
Optional Argument.
|
|
108
|
+
Specifies whether to persist the results of the
|
|
109
|
+
function in a table or not. When set to True,
|
|
110
|
+
results are persisted in a table; otherwise,
|
|
111
|
+
results are garbage collected at the end of the
|
|
112
|
+
session.
|
|
113
|
+
Default Value: False
|
|
114
|
+
Types: bool
|
|
115
|
+
|
|
116
|
+
volatile:
|
|
117
|
+
Optional Argument.
|
|
118
|
+
Specifies whether to put the results of the
|
|
119
|
+
function in a volatile table or not. When set to
|
|
120
|
+
True, results are stored in a volatile table,
|
|
121
|
+
otherwise not.
|
|
122
|
+
Default Value: False
|
|
123
|
+
Types: bool
|
|
124
|
+
|
|
125
|
+
Function allows the user to partition, hash, order or local
|
|
126
|
+
order the input data. These generic arguments are available
|
|
127
|
+
for each argument that accepts teradataml DataFrame as
|
|
128
|
+
input and can be accessed as:
|
|
129
|
+
* "<input_data_arg_name>_partition_column" accepts str or
|
|
130
|
+
list of str (Strings)
|
|
131
|
+
* "<input_data_arg_name>_hash_column" accepts str or list
|
|
132
|
+
of str (Strings)
|
|
133
|
+
* "<input_data_arg_name>_order_column" accepts str or list
|
|
134
|
+
of str (Strings)
|
|
135
|
+
* "local_order_<input_data_arg_name>" accepts boolean
|
|
136
|
+
Note:
|
|
137
|
+
These generic arguments are supported by teradataml if
|
|
138
|
+
the underlying SQL Engine function supports, else an
|
|
139
|
+
exception is raised.
|
|
140
|
+
|
|
141
|
+
RETURNS:
|
|
142
|
+
Instance of SMOTE.
|
|
143
|
+
Output teradataml DataFrames can be accessed using attribute
|
|
144
|
+
references, such as SMOTEObj.<attribute_name>.
|
|
145
|
+
Output teradataml DataFrame attribute name is:
|
|
146
|
+
result
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
RAISES:
|
|
150
|
+
TeradataMlException, TypeError, ValueError
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
EXAMPLES:
|
|
154
|
+
# Notes:
|
|
155
|
+
# 1. Get the connection to Vantage, before importing the
|
|
156
|
+
# function in user space.
|
|
157
|
+
# 2. User can import the function, if it is available on
|
|
158
|
+
# Vantage user is connected to.
|
|
159
|
+
# 3. To check the list of analytic functions available on
|
|
160
|
+
# Vantage user connected to, use
|
|
161
|
+
# "display_analytic_functions()".
|
|
162
|
+
|
|
163
|
+
# Load the example data.
|
|
164
|
+
load_example_data("dataframe", "iris_test")
|
|
165
|
+
load_example_data("teradataml", "titanic")
|
|
166
|
+
|
|
167
|
+
# Create teradataml DataFrame objects.
|
|
168
|
+
iris_input = DataFrame.from_table("iris_test").iloc[:25]
|
|
169
|
+
titanic_input = DataFrame("titanic").iloc[:50]
|
|
170
|
+
|
|
171
|
+
# Create Encoding DataFrame objects.
|
|
172
|
+
encoded_data = OrdinalEncodingFit(data=titanic_input,
|
|
173
|
+
target_column=['sex','embarked'],
|
|
174
|
+
approach="AUTO")
|
|
175
|
+
|
|
176
|
+
# Check the list of available analytic functions.
|
|
177
|
+
display_analytic_functions()
|
|
178
|
+
|
|
179
|
+
# Import function SMOTE.
|
|
180
|
+
from teradataml import SMOTE
|
|
181
|
+
|
|
182
|
+
# Example 1 : Generate synthetic samples using smote algorithm.
|
|
183
|
+
smote_out = SMOTE(data = iris_input,
|
|
184
|
+
n_neighbors = 5,
|
|
185
|
+
id_column='id',
|
|
186
|
+
minority_class='3',
|
|
187
|
+
response_column='species',
|
|
188
|
+
input_columns =['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
|
|
189
|
+
oversampling_factor=2,
|
|
190
|
+
sampling_strategy='smote',
|
|
191
|
+
seed=10)
|
|
192
|
+
|
|
193
|
+
# Print the result DataFrame.
|
|
194
|
+
print(smote_out.result)
|
|
195
|
+
|
|
196
|
+
# Example 2 : Generate synthetic samples using smotenc algorithm with categorical columns.
|
|
197
|
+
smote_out2 = SMOTE(data = titanic_input,
|
|
198
|
+
encoding_data = encoded_data.result,
|
|
199
|
+
id_column = 'passenger',
|
|
200
|
+
response_column = 'survived',
|
|
201
|
+
input_columns = ['parch', 'age', 'sibsp'],
|
|
202
|
+
categorical_columns = ['sex', 'embarked'],
|
|
203
|
+
median_standard_deviation = 31.47806044604718,
|
|
204
|
+
minority_class = '1',
|
|
205
|
+
oversampling_factor = 5,
|
|
206
|
+
sampling_strategy = "smotenc",
|
|
207
|
+
noninput_columns_value = "null",
|
|
208
|
+
n_neighbors = 5)
|
|
209
|
+
|
|
210
|
+
# Print the result DataFrame.
|
|
211
|
+
print(smote_out2.result)
|
|
212
|
+
"""
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
def TextMorph(data=None, word_column=None, pos=None,
|
|
2
|
+
single_output=False, postag_column=None,
|
|
3
|
+
accumulate=None, **generic_arguments):
|
|
4
|
+
"""
|
|
5
|
+
DESCRIPTION:
|
|
6
|
+
TextMorph() function generates morphs of the given words in the input dataset.
|
|
7
|
+
|
|
8
|
+
PARAMETERS:
|
|
9
|
+
data:
|
|
10
|
+
Required Argument.
|
|
11
|
+
Specifies the input teradataml DataFrame.
|
|
12
|
+
Types: teradataml DataFrame
|
|
13
|
+
|
|
14
|
+
word_column:
|
|
15
|
+
Required Argument.
|
|
16
|
+
Specifies the name of the input column that contains words for which morphs are to be generated.
|
|
17
|
+
Types: str
|
|
18
|
+
|
|
19
|
+
pos:
|
|
20
|
+
Optional Argument.
|
|
21
|
+
Specifies the part of speech (POS) to output.
|
|
22
|
+
Permitted Values: "NOUN", "VERB", "ADV", "ADJ"
|
|
23
|
+
Types: str or list of str
|
|
24
|
+
|
|
25
|
+
single_output:
|
|
26
|
+
Optional Argument.
|
|
27
|
+
Specifies whether to output only one morph for each word. If set to `False`,
|
|
28
|
+
the function outputs all morphs for each word.
|
|
29
|
+
Default Value: False
|
|
30
|
+
Types: bool
|
|
31
|
+
|
|
32
|
+
postag_column:
|
|
33
|
+
Optional Argument.
|
|
34
|
+
Specifies the name of the column in data that contains the part-of-speech (POS)
|
|
35
|
+
tags of the words, output by the function TD_POSTagger.
|
|
36
|
+
Types: str
|
|
37
|
+
|
|
38
|
+
accumulate:
|
|
39
|
+
Optional Argument.
|
|
40
|
+
Specifies the names of the input columns to copy to the output table.
|
|
41
|
+
Types: str or list of str
|
|
42
|
+
|
|
43
|
+
**generic_arguments:
|
|
44
|
+
Optional Argument.
|
|
45
|
+
Specifies the generic keyword arguments SQLE functions accept. Below are the generic
|
|
46
|
+
keyword arguments:
|
|
47
|
+
persist:
|
|
48
|
+
Optional Argument.
|
|
49
|
+
Specifies whether to persist the results of the function in a table or not.
|
|
50
|
+
When set to True, results are persisted in a table; otherwise, results are
|
|
51
|
+
garbage collected at the end of the session.
|
|
52
|
+
Default Value: False
|
|
53
|
+
Types: bool
|
|
54
|
+
|
|
55
|
+
volatile:
|
|
56
|
+
Optional Argument.
|
|
57
|
+
Specifies whether to put the results of the function in a volatile table or not.
|
|
58
|
+
When set to True, results are stored in a volatile table; otherwise not.
|
|
59
|
+
Default Value: False
|
|
60
|
+
Types: bool
|
|
61
|
+
|
|
62
|
+
Function allows the user to partition, hash, order or local order the input
|
|
63
|
+
data. These generic arguments are available for each argument that accepts
|
|
64
|
+
teradataml DataFrame as input and can be accessed as:
|
|
65
|
+
* "<input_data_arg_name>_partition_column" accepts str or list of str (Strings)
|
|
66
|
+
* "<input_data_arg_name>_hash_column" accepts str or list of str (Strings)
|
|
67
|
+
* "<input_data_arg_name>_order_column" accepts str or list of str (Strings)
|
|
68
|
+
* "local_order_<input_data_arg_name>" accepts boolean
|
|
69
|
+
Note:
|
|
70
|
+
These generic arguments are supported by teradataml if the underlying SQLE Engine
|
|
71
|
+
function supports, else an exception is raised.
|
|
72
|
+
|
|
73
|
+
RETURNS:
|
|
74
|
+
Instance of TextMorph.
|
|
75
|
+
Output teradataml DataFrames can be accessed using attribute references, such as TDTextMorphObj.<attribute_name>.
|
|
76
|
+
Output teradataml DataFrame attribute name is:
|
|
77
|
+
result
|
|
78
|
+
|
|
79
|
+
RAISES:
|
|
80
|
+
TeradataMlException, TypeError, ValueError
|
|
81
|
+
|
|
82
|
+
EXAMPLES:
|
|
83
|
+
# Notes:
|
|
84
|
+
# 1. Get the connection to Vantage to execute the function.
|
|
85
|
+
# 2. One must import the required functions mentioned in the example from teradataml.
|
|
86
|
+
# 3. Function will raise an error if not supported on the Vantage system the user is connected to.
|
|
87
|
+
|
|
88
|
+
# Load the example data.
|
|
89
|
+
load_example_data("textmorph", ["words_input","pos_input"])
|
|
90
|
+
|
|
91
|
+
# Create teradataml DataFrame objects.
|
|
92
|
+
data1 = DataFrame.from_table("words_input")
|
|
93
|
+
data2 = DataFrame.from_table("pos_input")
|
|
94
|
+
|
|
95
|
+
# Check the list of available analytic functions.
|
|
96
|
+
display_analytic_functions()
|
|
97
|
+
|
|
98
|
+
# Import function TextMorph.
|
|
99
|
+
from teradataml import TextMorph
|
|
100
|
+
|
|
101
|
+
# Example 1: Generate morphs for words in the input dataset.
|
|
102
|
+
TextMorph_out = TextMorph(data=data1,
|
|
103
|
+
word_column="data2",
|
|
104
|
+
pos=["noun", "verb"],
|
|
105
|
+
single_output=True,
|
|
106
|
+
accumulate=["id"])
|
|
107
|
+
|
|
108
|
+
# Print the result DataFrame.
|
|
109
|
+
print(TextMorph_out.result)
|
|
110
|
+
|
|
111
|
+
# Example 2: Generate morphs for words in the input dataset with POS tags.
|
|
112
|
+
TextMorph_pos = TextMorph(data=data2,
|
|
113
|
+
word_column="word",
|
|
114
|
+
postag_column="pos_tag",
|
|
115
|
+
accumulate=["id","pos_tag"])
|
|
116
|
+
|
|
117
|
+
# Print the result DataFrame.
|
|
118
|
+
print(TextMorph_pos.result)
|
|
119
|
+
"""
|