teradataml-20.0.0.3-py3-none-any.whl → teradataml-20.0.0.5-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/README.md +193 -1
- teradataml/__init__.py +2 -1
- teradataml/_version.py +2 -2
- teradataml/analytics/analytic_function_executor.py +25 -18
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/analytic_functions_argument.py +4 -0
- teradataml/analytics/sqle/__init__.py +20 -2
- teradataml/analytics/utils.py +15 -1
- teradataml/analytics/valib.py +18 -4
- teradataml/automl/__init__.py +341 -112
- teradataml/automl/autodataprep/__init__.py +471 -0
- teradataml/automl/data_preparation.py +84 -42
- teradataml/automl/data_transformation.py +69 -33
- teradataml/automl/feature_engineering.py +76 -9
- teradataml/automl/feature_exploration.py +639 -25
- teradataml/automl/model_training.py +35 -14
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/__init__.py +1 -2
- teradataml/common/constants.py +122 -63
- teradataml/common/messagecodes.py +14 -3
- teradataml/common/messages.py +8 -4
- teradataml/common/sqlbundle.py +40 -10
- teradataml/common/utils.py +366 -74
- teradataml/common/warnings.py +11 -0
- teradataml/context/context.py +348 -86
- teradataml/data/amazon_reviews_25.csv +26 -0
- teradataml/data/apriori_example.json +22 -0
- teradataml/data/byom_example.json +11 -0
- teradataml/data/docs/byom/docs/DataRobotPredict.py +2 -2
- teradataml/data/docs/byom/docs/DataikuPredict.py +40 -1
- teradataml/data/docs/byom/docs/H2OPredict.py +2 -2
- teradataml/data/docs/byom/docs/ONNXEmbeddings.py +242 -0
- teradataml/data/docs/byom/docs/ONNXPredict.py +2 -2
- teradataml/data/docs/byom/docs/PMMLPredict.py +2 -2
- teradataml/data/docs/sqle/docs_17_20/Apriori.py +138 -0
- teradataml/data/docs/sqle/docs_17_20/NERExtractor.py +121 -0
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/SMOTE.py +212 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +28 -6
- teradataml/data/docs/sqle/docs_17_20/TextMorph.py +119 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +54 -3
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/DFFT.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DFFTConv.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +6 -6
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/MultivarRegr.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PACF.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerTransform.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Resample.py +5 -5
- teradataml/data/docs/uaf/docs_17_20/SAX.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/SimpleExp.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Smoothma.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/UNDIFF.py +1 -1
- teradataml/data/hnsw_alter_data.csv +5 -0
- teradataml/data/hnsw_data.csv +10 -0
- teradataml/data/jsons/byom/h2opredict.json +1 -1
- teradataml/data/jsons/byom/onnxembeddings.json +266 -0
- teradataml/data/jsons/sqle/17.20/NGramSplitter.json +6 -6
- teradataml/data/jsons/sqle/17.20/TD_Apriori.json +181 -0
- teradataml/data/jsons/sqle/17.20/TD_NERExtractor.json +145 -0
- teradataml/data/jsons/sqle/17.20/TD_SMOTE.json +267 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +0 -1
- teradataml/data/jsons/sqle/17.20/TD_TextMorph.json +134 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +114 -9
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +420 -0
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +328 -0
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +359 -0
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +360 -0
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +343 -0
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +343 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSW.json +296 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWPredict.json +206 -0
- teradataml/data/jsons/sqle/20.00/TD_HNSWSummary.json +32 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +2 -2
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +3 -3
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +6 -6
- teradataml/data/ner_dict.csv +8 -0
- teradataml/data/ner_input_eng.csv +7 -0
- teradataml/data/ner_rule.csv +5 -0
- teradataml/data/pos_input.csv +40 -0
- teradataml/data/tdnerextractor_example.json +14 -0
- teradataml/data/teradataml_example.json +21 -0
- teradataml/data/textmorph_example.json +5 -0
- teradataml/data/to_num_data.csv +4 -0
- teradataml/data/tochar_data.csv +5 -0
- teradataml/data/trans_dense.csv +16 -0
- teradataml/data/trans_sparse.csv +55 -0
- teradataml/data/vectordistance_example.json +1 -1
- teradataml/dataframe/copy_to.py +45 -29
- teradataml/dataframe/data_transfer.py +72 -46
- teradataml/dataframe/dataframe.py +642 -166
- teradataml/dataframe/dataframe_utils.py +167 -22
- teradataml/dataframe/functions.py +135 -20
- teradataml/dataframe/setop.py +11 -6
- teradataml/dataframe/sql.py +330 -78
- teradataml/dbutils/dbutils.py +556 -140
- teradataml/dbutils/filemgr.py +14 -10
- teradataml/hyperparameter_tuner/optimizer.py +12 -1
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/{sklearn/_sklearn_wrapper.py → _base.py} +168 -1013
- teradataml/opensource/_class.py +141 -17
- teradataml/opensource/{constants.py → _constants.py} +7 -3
- teradataml/opensource/_lightgbm.py +52 -53
- teradataml/opensource/_sklearn.py +1008 -0
- teradataml/opensource/_wrapper_utils.py +5 -5
- teradataml/options/__init__.py +47 -15
- teradataml/options/configure.py +103 -26
- teradataml/options/display.py +13 -2
- teradataml/plot/axis.py +47 -8
- teradataml/plot/figure.py +33 -0
- teradataml/plot/plot.py +63 -13
- teradataml/scriptmgmt/UserEnv.py +307 -40
- teradataml/scriptmgmt/lls_utils.py +428 -145
- teradataml/store/__init__.py +2 -3
- teradataml/store/feature_store/feature_store.py +102 -7
- teradataml/table_operators/Apply.py +48 -19
- teradataml/table_operators/Script.py +23 -2
- teradataml/table_operators/TableOperator.py +3 -1
- teradataml/table_operators/table_operator_util.py +58 -9
- teradataml/utils/dtypes.py +49 -1
- teradataml/utils/internal_buffer.py +38 -0
- teradataml/utils/validators.py +377 -62
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/METADATA +200 -4
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/RECORD +146 -112
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/libaed_0_1.dylib +0 -0
- teradataml/libaed_0_1.so +0 -0
- teradataml/opensource/sklearn/__init__.py +0 -0
- teradataml/store/vector_store/__init__.py +0 -1586
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.3.dist-info → teradataml-20.0.0.5.dist-info}/zip-safe +0 -0
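A quick way to confirm which of the two releases is installed in an environment (a minimal sketch; it assumes teradataml exposes the conventional __version__ attribute, whose backing module teradataml/_version.py is one of the files changed here):

    import teradataml

    # Print the installed package version; expect "20.0.0.5" after upgrading.
    print(teradataml.__version__)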
--- /dev/null
+++ b/teradataml/data/docs/sqle/docs_17_20/NERExtractor.py
@@ -0,0 +1,121 @@
+def NERExtractor(data=None, user_defined_data=None, rules_data=None, text_column=None,
+                 input_language="EN", show_context=0, accumulate=None,
+                 **generic_arguments):
+    """
+    DESCRIPTION:
+        NERExtractor() performs Named Entity Recognition (NER) on input text
+        according to user-defined dictionary words or regular expression (regex) patterns.
+
+    PARAMETERS:
+        data:
+            Required Argument.
+            Specifies the input teradataml DataFrame.
+            Types: teradataml DataFrame
+
+        user_defined_data:
+            Required Argument.
+            Specifies the teradataml DataFrame which contains user-defined words and the corresponding entity label.
+            Types: teradataml DataFrame
+
+        rules_data:
+            Required Argument.
+            Specifies the teradataml DataFrame which contains user-defined regex patterns and the corresponding entity label.
+            Types: teradataml DataFrame
+
+        text_column:
+            Required Argument.
+            Specifies the name of the teradataml DataFrame column to be used for the NER search.
+            Types: str
+
+        input_language:
+            Optional Argument.
+            Specifies the language of the input text.
+            Default Value: "EN"
+            Types: str
+
+        show_context:
+            Optional Argument.
+            Specifies the number of words before and after the matched entity. If the leading or trailing
+            words are fewer than "show_context", an ellipsis (...) is added. Must be a positive value
+            less than 10.
+            Default Value: 0
+            Types: int
+
+        accumulate:
+            Optional Argument.
+            Specifies the name(s) of teradataml DataFrame column(s) to copy from the input
+            table to the output.
+            Types: str or list of str
+
+        **generic_arguments:
+            Optional Argument.
+            Specifies the generic keyword arguments SQLE functions accept. Below are the generic
+            keyword arguments:
+                persist:
+                    Optional Argument.
+                    Specifies whether to persist the results of the function in a table or not.
+                    When set to True, results are persisted in a table; otherwise, results are
+                    garbage collected at the end of the session.
+                    Default Value: False
+                    Types: bool
+
+                volatile:
+                    Optional Argument.
+                    Specifies whether to put the results of the function in a volatile table or not.
+                    When set to True, results are stored in a volatile table; otherwise not.
+                    Default Value: False
+                    Types: bool
+
+            Function allows the user to partition, hash, order or local order the input
+            data. These generic arguments are available for each argument that accepts a
+            teradataml DataFrame as input and can be accessed as:
+                * "<input_data_arg_name>_partition_column" accepts str or list of str (Strings)
+                * "<input_data_arg_name>_hash_column" accepts str or list of str (Strings)
+                * "<input_data_arg_name>_order_column" accepts str or list of str (Strings)
+                * "local_order_<input_data_arg_name>" accepts boolean
+            Note:
+                These generic arguments are supported by teradataml if the underlying SQLE Engine
+                function supports them; otherwise an exception is raised.
+
+    RETURNS:
+        Instance of NERExtractor.
+        Output teradataml DataFrames can be accessed using attribute references, such as TDNERExtractorObj.<attribute_name>.
+        Output teradataml DataFrame attribute name is:
+            result
+
+    RAISES:
+        TeradataMlException, TypeError, ValueError
+
+    EXAMPLES:
+        # Notes:
+        #     1. Get the connection to Vantage to execute the function.
+        #     2. One must import the required functions mentioned in the example from teradataml.
+        #     3. The function raises an error if it is not supported on the Vantage system the user is connected to.
+
+        # Load the example data.
+        load_example_data("tdnerextractor", ["ner_input_eng", "ner_dict", "ner_rule"])
+
+        # Create teradataml DataFrame objects.
+        df = DataFrame.from_table("ner_input_eng")
+        user_defined_words = DataFrame.from_table("ner_dict")
+        rules = DataFrame.from_table("ner_rule")
+
+        # Check the list of available analytic functions.
+        display_analytic_functions()
+
+        # Import function NERExtractor.
+        from teradataml import NERExtractor
+
+        # Example 1: Perform Named Entity Recognition (NER) using Rules and Dict with Accumulate.
+        NER_out = NERExtractor(data=df,
+                               user_defined_data=user_defined_words,
+                               rules_data=rules,
+                               text_column=["txt"],
+                               input_language="en",
+                               show_context=3,
+                               accumulate=["id"])
+
+        # Print the result DataFrame.
+        print(NER_out.result)
+    """
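The "show_context" windowing described in the docstring is easy to picture with a plain-Python sketch (illustrative only, not teradataml's implementation; context_window is a hypothetical helper):

    # Hypothetical helper mimicking "show_context": keep n words on each side
    # of the matched entity and add "..." where the window is clipped.
    def context_window(words, match_idx, n):
        lo, hi = max(0, match_idx - n), min(len(words), match_idx + n + 1)
        window = words[lo:hi]
        if lo > 0:
            window = ["..."] + window
        if hi < len(words):
            window = window + ["..."]
        return " ".join(window)

    print(context_window("Teradata was founded in California".split(), 4, 2))
    # prints: ... founded in California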
--- a/teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py
+++ b/teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py
@@ -33,7 +33,7 @@ def NGramSplitter(data=None, text_column=None, delimiter=" ", grams=None, overla
 
         delimiter:
             Optional Argument.
-            Specifies a character or string that separates words in the input text. The
+            Specifies a character, string, or regular expression that separates words in the input text. The
             default value is the set of all whitespace characters which includes
             the characters for space, tab, newline, carriage return and some
             others.
@@ -66,14 +66,14 @@ def NGramSplitter(data=None, text_column=None, delimiter=" ", grams=None, overla
 
         punctuation:
             Optional Argument.
-            Specifies a string that specifies the punctuation characters for the function
+            Specifies a string or regular expression giving the punctuation characters for the function
             to remove before evaluating the input text.
             Default Value: "`~#^&*()-"
             Types: str
 
         reset:
             Optional Argument.
-            Specifies a string that specifies the character or string that ends a sentence.
+            Specifies a character, string, or regular expression that ends a sentence.
             At the end of a sentence, the function discards any partial n-grams and searches
             for the next n-gram at the beginning of the next sentence. An n-gram
             cannot span two sentences.
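Since "delimiter" (along with "punctuation" and "reset") now accepts a regular expression, a call along these lines becomes possible (a sketch; the DataFrame and column names are illustrative, not from the diff):

    # Illustrative use of the relaxed "delimiter" argument: split on commas,
    # semicolons, or runs of whitespace instead of the default whitespace set.
    from teradataml import NGramSplitter

    ngrams = NGramSplitter(data=df,               # df: any teradataml DataFrame
                           text_column="txt",     # column name is illustrative
                           grams="2",
                           delimiter="[,;\\s]+")  # regular expression
    print(ngrams.result)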
--- /dev/null
+++ b/teradataml/data/docs/sqle/docs_17_20/SMOTE.py
@@ -0,0 +1,212 @@
+def SMOTE(data=None, encoding_data=None, id_column=None,
+          response_column=None, input_columns=None, categorical_columns=None,
+          median_standard_deviation=None, minority_class=None,
+          oversampling_factor=5, sampling_strategy="smote",
+          fill_sampleid=True, noninput_columns_value="sample", n_neighbors=5,
+          seed=None, **generic_arguments):
+    """
+    DESCRIPTION:
+        SMOTE() generates data by oversampling a minority class using the
+        smote, adasyn, borderline-2 or smote-nc algorithms.
+
+
+    PARAMETERS:
+        data:
+            Required Argument.
+            Specifies the input teradataml DataFrame.
+            Types: teradataml DataFrame
+
+        encoding_data:
+            Optional Argument. Required when "sampling_strategy" is set to 'smotenc'.
+            Specifies the teradataml DataFrame containing the ordinal encoding information.
+            Types: teradataml DataFrame
+
+        id_column:
+            Required Argument.
+            Specifies the name of the column in "data" that
+            uniquely identifies a data sample.
+            Types: str
+
+        response_column:
+            Optional Argument.
+            Specifies the name of the column in "data" that contains the
+            numeric value to be used as the response value for a sample.
+            Types: str
+
+        input_columns:
+            Required Argument.
+            Specifies the names of the input columns in "data" for oversampling.
+            Types: str OR list of Strings (str)
+
+        categorical_columns:
+            Optional Argument. Required when "sampling_strategy" is set to 'smotenc'.
+            Specifies the names of the categorical columns in "data" that
+            the function uses for oversampling with smotenc.
+            Types: str OR list of Strings (str)
+
+        median_standard_deviation:
+            Optional Argument. Required when "sampling_strategy" is set to 'smotenc'.
+            Specifies the median of the standard deviations computed over the
+            numerical input columns.
+            Types: float
+
+        minority_class:
+            Required Argument.
+            Specifies the minority class for which synthetic samples need to be
+            generated.
+            Note:
+                * The label for the minority class under the response column must be a numeric integer.
+            Types: str
+
+        oversampling_factor:
+            Optional Argument.
+            Specifies the factor for oversampling the minority class.
+            Default Value: 5
+            Types: float
+
+        sampling_strategy:
+            Optional Argument.
+            Specifies the oversampling algorithm to be used to create synthetic samples.
+            Default Value: "smote"
+            Permitted Values: "smote", "adasyn", "borderline", "smotenc"
+            Types: str
+
+        fill_sampleid:
+            Optional Argument.
+            Specifies whether to include the id of the original observation used
+            to generate each synthetic observation.
+            Default Value: True
+            Types: bool
+
+        noninput_columns_value:
+            Optional Argument.
+            Specifies the value to put in a sample column for columns not
+            specified as input columns.
+            Default Value: "sample"
+            Permitted Values: "sample", "neighbor", "null"
+            Types: str
+
+        n_neighbors:
+            Optional Argument.
+            Specifies the number of nearest neighbors for choosing the sample to
+            be used in oversampling.
+            Default Value: 5
+            Types: int
+
+        seed:
+            Optional Argument.
+            Specifies the random seed the algorithm uses for repeatable results.
+            The function uses the seed for the random interpolation that generates
+            the synthetic samples.
+            Types: int
+
+        **generic_arguments:
+            Specifies the generic keyword arguments SQLE functions accept. Below
+            are the generic keyword arguments:
+                persist:
+                    Optional Argument.
+                    Specifies whether to persist the results of the
+                    function in a table or not. When set to True,
+                    results are persisted in a table; otherwise,
+                    results are garbage collected at the end of the
+                    session.
+                    Default Value: False
+                    Types: bool
+
+                volatile:
+                    Optional Argument.
+                    Specifies whether to put the results of the
+                    function in a volatile table or not. When set to
+                    True, results are stored in a volatile table,
+                    otherwise not.
+                    Default Value: False
+                    Types: bool
+
+            Function allows the user to partition, hash, order or local
+            order the input data. These generic arguments are available
+            for each argument that accepts a teradataml DataFrame as
+            input and can be accessed as:
+                * "<input_data_arg_name>_partition_column" accepts str or
+                  list of str (Strings)
+                * "<input_data_arg_name>_hash_column" accepts str or list
+                  of str (Strings)
+                * "<input_data_arg_name>_order_column" accepts str or list
+                  of str (Strings)
+                * "local_order_<input_data_arg_name>" accepts boolean
+            Note:
+                These generic arguments are supported by teradataml if
+                the underlying SQL Engine function supports them, else an
+                exception is raised.
+
+    RETURNS:
+        Instance of SMOTE.
+        Output teradataml DataFrames can be accessed using attribute
+        references, such as SMOTEObj.<attribute_name>.
+        Output teradataml DataFrame attribute name is:
+            result
+
+
+    RAISES:
+        TeradataMlException, TypeError, ValueError
+
+
+    EXAMPLES:
+        # Notes:
+        #     1. Get the connection to Vantage before importing the
+        #        function in the user space.
+        #     2. The user can import the function if it is available on
+        #        the Vantage system the user is connected to.
+        #     3. To check the list of analytic functions available on
+        #        the Vantage system the user is connected to, use
+        #        "display_analytic_functions()".
+
+        # Load the example data.
+        load_example_data("dataframe", "iris_test")
+        load_example_data("teradataml", "titanic")
+
+        # Create teradataml DataFrame objects.
+        iris_input = DataFrame.from_table("iris_test").iloc[:25]
+        titanic_input = DataFrame("titanic").iloc[:50]
+
+        # Create the encoding DataFrame.
+        encoded_data = OrdinalEncodingFit(data=titanic_input,
+                                          target_column=['sex', 'embarked'],
+                                          approach="AUTO")
+
+        # Check the list of available analytic functions.
+        display_analytic_functions()
+
+        # Import function SMOTE.
+        from teradataml import SMOTE
+
+        # Example 1: Generate synthetic samples using the smote algorithm.
+        smote_out = SMOTE(data=iris_input,
+                          n_neighbors=5,
+                          id_column='id',
+                          minority_class='3',
+                          response_column='species',
+                          input_columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
+                          oversampling_factor=2,
+                          sampling_strategy='smote',
+                          seed=10)
+
+        # Print the result DataFrame.
+        print(smote_out.result)
+
+        # Example 2: Generate synthetic samples using the smotenc algorithm with categorical columns.
+        smote_out2 = SMOTE(data=titanic_input,
+                           encoding_data=encoded_data.result,
+                           id_column='passenger',
+                           response_column='survived',
+                           input_columns=['parch', 'age', 'sibsp'],
+                           categorical_columns=['sex', 'embarked'],
+                           median_standard_deviation=31.47806044604718,
+                           minority_class='1',
+                           oversampling_factor=5,
+                           sampling_strategy="smotenc",
+                           noninput_columns_value="null",
+                           n_neighbors=5)
+
+        # Print the result DataFrame.
+        print(smote_out2.result)
+    """
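For readers unfamiliar with the algorithm family named in the DESCRIPTION: classic SMOTE places each synthetic point on the line segment between a minority sample and one of its nearest minority-class neighbors, and the "seed" argument controls exactly this kind of random interpolation. A NumPy sketch of the textbook step (illustrative, not the Vantage implementation):

    import numpy as np

    # Textbook SMOTE interpolation step (illustrative values).
    rng = np.random.default_rng(seed=10)        # mirrors the "seed" argument
    x = np.array([5.1, 3.5, 1.4, 0.2])          # a minority-class sample
    neighbor = np.array([4.9, 3.0, 1.4, 0.1])   # one of its n_neighbors
    gap = rng.random()                          # uniform draw in [0, 1)
    synthetic = x + gap * (neighbor - x)        # lies between x and neighbor
    print(synthetic)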
--- a/teradataml/data/docs/sqle/docs_17_20/Shap.py
+++ b/teradataml/data/docs/sqle/docs_17_20/Shap.py
@@ -1,4 +1,4 @@
-def Shap(data = None, object = None, id_column=None, training_function = "TD_GLM",
+def Shap(data = None, object = None, id_column=None, training_function = None,
          model_type = "Regression", input_columns = None, detailed = False,
          accumulate = None, num_parallel_trees = 1000, num_boost_rounds = 10,
          **generic_arguments):
@@ -29,7 +29,6 @@ def Shap(data = None, object = None, id_column=None, training_function = "TD_GLM
         training_function:
             Required Argument.
             Specifies the model type name.
-            Default Value: "TD_GLM"
             Permitted Values: TD_GLM, TD_DECISIONFOREST, TD_XGBOOST
             Types: str
 
@@ -50,6 +49,9 @@ def Shap(data = None, object = None, id_column=None, training_function = "TD_GLM
             Optional Argument.
             Specifies whether to output detailed shap information about the
             forest trees.
+            Note:
+                * It is only supported for the "TD_XGBOOST" and "TD_DECISIONFOREST"
+                  training functions.
             Default Value: False
             Types: bool
 
@@ -151,10 +153,10 @@ def Shap(data = None, object = None, id_column=None, training_function = "TD_GLM
 
         # Example 1: Shap for classification model.
         XGBoost_out = XGBoost(data=iris_input,
-
-
-
-
+                              input_columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
+                              response_column='species',
+                              model_type='Classification',
+                              iter_num=25)
 
         Shap_out = Shap(data=iris_input,
                         object=XGBoost_out.result,
@@ -200,4 +202,24 @@ def Shap(data = None, object = None, id_column=None, training_function = "TD_GLM
 
         # Print the result DataFrame.
         print(Shap_out2.output_data)
+
+        # Example 3: Shap for GLM model.
+        from teradataml import GLM
+        GLM_out = GLM(data=transform_obj.result,
+                      input_columns=['MedInc', 'HouseAge', 'AveRooms',
+                                     'AveBedrms', 'Population', 'AveOccup',
+                                     'Latitude', 'Longitude'],
+                      response_column="MedHouseVal",
+                      family="GAUSSIAN")
+
+        Shap_out3 = Shap(data=transform_obj.result,
+                         object=GLM_out.result,
+                         id_column='id',
+                         training_function="TD_GLM",
+                         model_type="Regression",
+                         input_columns=['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'],
+                         detailed=False)
+
+        # Print the result DataFrame.
+        print(Shap_out3.output_data)
     """
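The net effect of the signature change is that "training_function" no longer defaults to "TD_GLM": callers that previously omitted it must now pass it explicitly. A migration sketch reusing the names from Example 1 above:

    # Before 20.0.0.5 this call could omit training_function; now it is
    # required (names reused from Example 1 of the docstring).
    Shap_out = Shap(data=iris_input,
                    object=XGBoost_out.result,
                    id_column='id',
                    training_function='TD_XGBOOST',   # must be stated explicitly
                    model_type='Classification',
                    input_columns=['sepal_length', 'sepal_width',
                                   'petal_length', 'petal_width'])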
--- /dev/null
+++ b/teradataml/data/docs/sqle/docs_17_20/TextMorph.py
@@ -0,0 +1,119 @@
+def TextMorph(data=None, word_column=None, pos=None,
+              single_output=False, postag_column=None,
+              accumulate=None, **generic_arguments):
+    """
+    DESCRIPTION:
+        TextMorph() generates morphs of the given words in the input dataset.
+
+    PARAMETERS:
+        data:
+            Required Argument.
+            Specifies the input teradataml DataFrame.
+            Types: teradataml DataFrame
+
+        word_column:
+            Required Argument.
+            Specifies the name of the input column that contains the words for which morphs are to be generated.
+            Types: str
+
+        pos:
+            Optional Argument.
+            Specifies the part of speech (POS) to output.
+            Permitted Values: "NOUN", "VERB", "ADV", "ADJ"
+            Types: str or list of str
+
+        single_output:
+            Optional Argument.
+            Specifies whether to output only one morph for each word. If set to False,
+            the function outputs all morphs for each word.
+            Default Value: False
+            Types: bool
+
+        postag_column:
+            Optional Argument.
+            Specifies the name of the column in "data" that contains the part-of-speech (POS)
+            tags of the words, as output by the TD_POSTagger function.
+            Types: str
+
+        accumulate:
+            Optional Argument.
+            Specifies the names of the input columns to copy to the output table.
+            Types: str or list of str
+
+        **generic_arguments:
+            Optional Argument.
+            Specifies the generic keyword arguments SQLE functions accept. Below are the generic
+            keyword arguments:
+                persist:
+                    Optional Argument.
+                    Specifies whether to persist the results of the function in a table or not.
+                    When set to True, results are persisted in a table; otherwise, results are
+                    garbage collected at the end of the session.
+                    Default Value: False
+                    Types: bool
+
+                volatile:
+                    Optional Argument.
+                    Specifies whether to put the results of the function in a volatile table or not.
+                    When set to True, results are stored in a volatile table; otherwise not.
+                    Default Value: False
+                    Types: bool
+
+            Function allows the user to partition, hash, order or local order the input
+            data. These generic arguments are available for each argument that accepts a
+            teradataml DataFrame as input and can be accessed as:
+                * "<input_data_arg_name>_partition_column" accepts str or list of str (Strings)
+                * "<input_data_arg_name>_hash_column" accepts str or list of str (Strings)
+                * "<input_data_arg_name>_order_column" accepts str or list of str (Strings)
+                * "local_order_<input_data_arg_name>" accepts boolean
+            Note:
+                These generic arguments are supported by teradataml if the underlying SQLE Engine
+                function supports them; otherwise an exception is raised.
+
+    RETURNS:
+        Instance of TextMorph.
+        Output teradataml DataFrames can be accessed using attribute references, such as TDTextMorphObj.<attribute_name>.
+        Output teradataml DataFrame attribute name is:
+            result
+
+    RAISES:
+        TeradataMlException, TypeError, ValueError
+
+    EXAMPLES:
+        # Notes:
+        #     1. Get the connection to Vantage to execute the function.
+        #     2. One must import the required functions mentioned in the example from teradataml.
+        #     3. The function raises an error if it is not supported on the Vantage system the user is connected to.
+
+        # Load the example data.
+        load_example_data("textmorph", ["words_input", "pos_input"])
+
+        # Create teradataml DataFrame objects.
+        data1 = DataFrame.from_table("words_input")
+        data2 = DataFrame.from_table("pos_input")
+
+        # Check the list of available analytic functions.
+        display_analytic_functions()
+
+        # Import function TextMorph.
+        from teradataml import TextMorph
+
+        # Example 1: Generate morphs for words in the input dataset.
+        TextMorph_out = TextMorph(data=data1,
+                                  word_column="data2",
+                                  pos=["noun", "verb"],
+                                  single_output=True,
+                                  accumulate=["id"])
+
+        # Print the result DataFrame.
+        print(TextMorph_out.result)
+
+        # Example 2: Generate morphs for words in the input dataset with POS tags.
+        TextMorph_pos = TextMorph(data=data2,
+                                  word_column="word",
+                                  postag_column="pos_tag",
+                                  accumulate=["id", "pos_tag"])
+
+        # Print the result DataFrame.
+        print(TextMorph_pos.result)
+    """
--- a/teradataml/data/docs/sqle/docs_17_20/TextParser.py
+++ b/teradataml/data/docs/sqle/docs_17_20/TextParser.py
@@ -1,6 +1,9 @@
-def TextParser(data=None, object=None, text_column=None,
-
-
+def TextParser(data=None, object=None, text_column=None, enforce_token_limit=False,
+               convert_to_lowercase=True, stem_tokens=False, remove_stopwords=False,
+               accumulate=None, delimiter=" \t\n\f\r", delimiter_regex=None,
+               punctuation="!#$%&()*+,-./:;?@\^_`{|}~", token_col_name=None,
+               doc_id_column=None, list_positions=False, token_frequency=False,
+               output_by_word=True, **generic_arguments):
     """
     DESCRIPTION:
         The TextParser() function can parse text and perform the following operations:
@@ -38,6 +41,13 @@ def TextParser(data=None, object=None, text_column=None, convert_to_lowercase=Tr
             Specifies the name of the input data column whose contents are to be tokenized.
             Types: str
 
+        enforce_token_limit:
+            Optional Argument.
+            Specifies whether to raise an informative error when a token larger than
+            64K/32K is found, instead of silently discarding such tokens.
+            Default Value: False
+            Types: bool
+
         convert_to_lowercase:
             Optional Argument.
             Specifies whether to convert the text in "text_column" to lowercase.
@@ -71,6 +81,11 @@ def TextParser(data=None, object=None, text_column=None, convert_to_lowercase=Tr
             Default Value: " \\t\\n\\f\\r"
             Types: str
 
+        delimiter_regex:
+            Optional Argument.
+            Specifies a Perl Compatible regular expression that represents the word delimiter.
+            Types: str
+
         punctuation:
             Optional Argument.
             Specifies the punctuation characters to replace with a space in the input text.
@@ -83,6 +98,29 @@ def TextParser(data=None, object=None, text_column=None, convert_to_lowercase=Tr
             the text of the specified column in the "text_column" element.
             Types: str
 
+        doc_id_column:
+            Optional Argument.
+            Specifies the name of the column that uniquely identifies a row in the input table.
+            Types: str
+
+        list_positions:
+            Optional Argument.
+            Specifies whether to output the positions of a word in list form.
+            Default Value: False
+            Types: bool
+
+        token_frequency:
+            Optional Argument.
+            Specifies whether to output the frequency of each token.
+            Default Value: False
+            Types: bool
+
+        output_by_word:
+            Optional Argument.
+            Specifies whether to output each token in a separate row or all tokens in one row.
+            Default Value: True
+            Types: bool
+
         **generic_arguments:
             Specifies the generic keyword arguments SQLE functions accept. Below
             are the generic keyword arguments:
@@ -170,4 +208,17 @@ def TextParser(data=None, object=None, text_column=None, convert_to_lowercase=Tr
 
         # Print the result DataFrame.
         print(TextParser_out.result)
+
+        # Example 3: Tokenize words in the "text_data" column using a delimiter regex,
+        # convert tokens to lowercase, and output token positions in list format.
+        TextParser_out = TextParser(data=complaints,
+                                    text_column="text_data",
+                                    doc_id_column="doc_id",
+                                    delimiter_regex="[ \t\f\r\n]+",
+                                    list_positions=True,
+                                    convert_to_lowercase=True,
+                                    output_by_word=False)
+
+        # Print the result DataFrame.
+        print(TextParser_out.result)
     """
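The added Example 3 exercises "delimiter_regex"; the other new flags compose the same way. An extra sketch (not part of the package docs) that reuses Example 3's table and adds per-token frequencies:

    # Hypothetical follow-on to Example 3: emit one row per token together
    # with its frequency within the document.
    tp_freq = TextParser(data=complaints,
                         text_column="text_data",
                         doc_id_column="doc_id",
                         token_frequency=True,    # new in this release
                         output_by_word=True)
    print(tp_freq.result)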
--- a/teradataml/data/docs/uaf/docs_17_20/ACF.py
+++ b/teradataml/data/docs/uaf/docs_17_20/ACF.py
@@ -94,7 +94,7 @@ def ACF(data=None, data_filter_expr=None, max_lags=None,
             Default behavior when "alpha" is omitted or not a positive
             float:
                 * The function does not return confidence intervals.
-            Types: float
+            Types: int OR float
 
         **generic_arguments:
             Specifies the generic keyword arguments of UAF functions.
--- a/teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py
+++ b/teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py
@@ -169,7 +169,7 @@ def ArimaEstimate(data1=None, data1_filter_expr=None, data2=None,
             at the end to specify the intercept coefficient initial
             value, then the formula is as follows:
                 p+q+P+Q+constant
-            Types: float, list of float
+            Types: int, list of int, float, list of float
 
         fixed:
             Optional Argument.
@@ -183,7 +183,7 @@ def ArimaEstimate(data1=None, data1_filter_expr=None, data2=None,
             at the end to specify the intercept coefficient initial
             value, then the formula is as follows:
                 p+q+P+Q+constant
-            Types: float, list of float
+            Types: int, list of int, float, list of float
 
         constant:
             Optional Argument.
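The "p+q+P+Q+constant" length rule quoted in both hunks is easy to check by hand, and with this release the list entries may be ints as well as floats. A worked illustration (the list name "init" is assumed from the docstring context, not shown in the diff):

    # Worked check of the "p+q+P+Q+constant" rule for a seasonal
    # ARIMA(p,d,q)(P,D,Q) spec with an intercept; ints now accepted.
    p, q, P, Q = 1, 1, 1, 1
    init = [0, 0, 0, 0, 0]                  # one entry per coefficient
    assert len(init) == p + q + P + Q + 1   # +1 for the intercept/constant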
--- a/teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py
+++ b/teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py
@@ -95,7 +95,7 @@ def ArimaXEstimate(data1=None, data1_filter_expr=None, data2=None,
             MA coefficients, the seasonal SAR regression
             coefficients and the SMA coefficients. The formula is
             as follows: 'p+q+P+Q+CONSTANT-length-init-list'
-            Types: float, list of float
+            Types: int, list of int, float, list of float
 
         fixed:
             Optional Argument.
@@ -107,7 +107,7 @@ def ArimaXEstimate(data1=None, data1_filter_expr=None, data2=None,
             If an intercept is needed, one more value is added at
             the end to specify the intercept coefficient initial value.
             The formula is as follows: 'p+q+P+Q+CONSTANT-length-fixed-list'
-            Types: float, list of float
+            Types: int, list of int, float, list of float
 
         constant:
             Optional Argument.
|