teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of teradataml might be problematic.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
@@ -13,273 +13,450 @@ from enum import Enum

 # Template for creating the triggers on
 # corresponding tables.
-
-
-
-
-
-
-
-
-
+
+# Tables for storing the data domains.
+EFS_DATA_DOMAINS="""
+CREATE MULTISET TABLE {0}.{1}
+(
+name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+created_time TIMESTAMP(6)
+)
+UNIQUE PRIMARY INDEX (name);
 """

-# Table for storing the features.
-EFS_FEATURES_SPEC = {
-    "table_name": "_efs_features",
-    "columns": {
-        "name": VARCHAR(200),
-        "column_name": VARCHAR(200),
-        "description": VARCHAR(1024),
-        "tags": VARCHAR(2000),
-        "data_type": VARCHAR(1024),
-        "feature_type": VARCHAR(100),
-        "status": VARCHAR(100),
-        "creation_time": TIMESTAMP,
-        "modified_time": TIMESTAMP
-    },
-    "primary_index": "name"
-}

-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Tables for storing the features.
+EFS_FEATURES = """
+CREATE MULTISET TABLE {0}.{1}
+(
+id INTEGER,
+name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+column_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+description VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
+tags VARCHAR(2000) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_type VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
+feature_type VARCHAR(100) CHARACTER SET LATIN NOT CASESPECIFIC,
+status VARCHAR(100) CHARACTER SET LATIN NOT CASESPECIFIC,
+creation_time TIMESTAMP(6),
+modified_time TIMESTAMP(6),
+CONSTRAINT data_domain_fk FOREIGN KEY (data_domain) REFERENCES _efs_data_domains (name)
+)
+UNIQUE PRIMARY INDEX (name, data_domain)
+UNIQUE INDEX (id);
+"""

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+EFS_FEATURES_STAGING="""
+CREATE MULTISET TABLE {0}.{1}
+(
+id INTEGER,
+name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+column_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+description VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
+tags VARCHAR(2000) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_type VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
+feature_type VARCHAR(100) CHARACTER SET LATIN NOT CASESPECIFIC,
+status VARCHAR(100) CHARACTER SET LATIN NOT CASESPECIFIC,
+creation_time TIMESTAMP(6),
+modified_time TIMESTAMP(6),
+archived_time TIMESTAMP(6)
+)
+NO PRIMARY INDEX ;
+"""

-
-
-
-
-
-
-
-
-
-
-}
+EFS_FEATURES_TRG="""
+CREATE TRIGGER {0}.{1}
+AFTER DELETE ON {0}.{2}
+REFERENCING OLD AS DeletedRow
+FOR EACH ROW
+INSERT INTO {3}
+VALUES (DeletedRow.id, DeletedRow.name, DeletedRow.data_domain, DeletedRow.column_name, DeletedRow.description, DeletedRow.tags, DeletedRow.data_type, DeletedRow.feature_type, DeletedRow.status, DeletedRow.creation_time, DeletedRow.modified_time,
+current_timestamp(6)
+);
+"""

-
-
-columns=", ".join(("DeletedRow.{}".format(col) for col in EFS_ENTITY_SPEC["columns"]))
-)
-
-EFS_ENTITY_XREF_SPEC = {
-    "table_name": "_efs_entity_xref",
-    "columns": {
-        "entity_name": VARCHAR(200),
-        "entity_column": VARCHAR(200)
-    },
-    "primary_index": ["entity_name", "entity_column"],
-    "foreign_keys": [
+EFS_GROUP_FEATURES = """
+CREATE MULTISET TABLE {0}.{1}
 (
-
-
-
+feature_name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+feature_data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+group_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+group_data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+creation_time TIMESTAMP(6),
+modified_time TIMESTAMP(6),
+CONSTRAINT feature_name_fk FOREIGN KEY (feature_name, feature_data_domain) REFERENCES {0}._efs_features (name, data_domain),
+CONSTRAINT group_name_fk FOREIGN KEY (group_name, group_data_domain) REFERENCES {0}._efs_feature_group (name, data_domain),
+CONSTRAINT data_domain_fk1 FOREIGN KEY (feature_data_domain) REFERENCES {0}._efs_data_domains (name),
+CONSTRAINT data_domain_fk2 FOREIGN KEY (group_data_domain) REFERENCES {0}._efs_data_domains (name)
 )
-
-
+UNIQUE PRIMARY INDEX (feature_name, feature_data_domain, group_name, group_data_domain);
+"""

-
-
-
-
-
-
-
-
-
+EFS_GROUP_FEATURES_STAGING = """
+CREATE MULTISET TABLE {0}.{1}
+(
+feature_name VARCHAR(255),
+feature_data_domain VARCHAR(255),
+group_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+group_data_domain VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+creation_time TIMESTAMP(6),
+modified_time TIMESTAMP(6),
+archived_time TIMESTAMP(6)
+)
+NO PRIMARY INDEX ;
+"""

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+EFS_GROUP_FEATURES_TRG = """
+CREATE TRIGGER {0}.{1}
+AFTER DELETE ON {0}.{2}
+REFERENCING OLD AS DeletedRow
+FOR EACH ROW
+INSERT INTO {3}
+VALUES (DeletedRow.feature_name, DeletedRow.feature_data_domain, DeletedRow.group_name, DeletedRow.group_data_domain, DeletedRow.creation_time, DeletedRow.modified_time,
+current_timestamp(6)
+);
+"""
+
+# Tables for Entities.
+
+EFS_ENTITY = """
+CREATE MULTISET TABLE {0}.{1}
+(
+name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+description VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+creation_time TIMESTAMP(6),
+modified_time TIMESTAMP(6),
+CONSTRAINT data_domain_fk FOREIGN KEY (data_domain) REFERENCES {0}._efs_data_domains (name)
+)
+UNIQUE PRIMARY INDEX (name, data_domain);
+"""

-
-
-
-
-
-
-
-
-
-
-
-
-
+EFS_ENTITY_STAGING= """
+CREATE MULTISET TABLE {0}.{1}
+(
+name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+description VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+creation_time TIMESTAMP(6),
+modified_time TIMESTAMP(6),
+archived_time TIMESTAMP(6))
+NO PRIMARY INDEX ;
+"""
+
+EFS_ENTITY_TRG = """
+CREATE TRIGGER {0}.{1}
+AFTER DELETE ON {0}.{2}
+REFERENCING OLD AS DeletedRow
+FOR EACH ROW
+INSERT INTO {3}
+VALUES (DeletedRow.name, DeletedRow.data_domain, DeletedRow.description, DeletedRow.creation_time, DeletedRow.modified_time,
+current_timestamp(6)
+);
+"""

-
-
-columns=", ".join(("DeletedRow.{}".format(col) for col in EFS_DATA_SOURCE_SPEC["columns"]))
-)
-
-# Table for storing the feature groups. This table holds all the required
-# parameters for creating DataFrame.
-EFS_FEATURE_GROUP_SPEC = {
-    "table_name": "_efs_feature_group",
-    "columns": {
-        "name": VARCHAR(200),
-        "description": VARCHAR(200),
-        "data_source_name": VARCHAR(200),
-        "entity_name": VARCHAR(200),
-        "creation_time": TIMESTAMP,
-        "modified_time": TIMESTAMP
-    },
-    "primary_index": "name",
-    "foreign_keys": [
+EFS_ENTITY_XREF= """
+CREATE MULTISET TABLE {0}.{1}
 (
-
-
-
-
+entity_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+entity_column VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+CONSTRAINT entity_xref_fk FOREIGN KEY (entity_name, data_domain) REFERENCES {0}._efs_entity (name, data_domain),
+CONSTRAINT data_domain_fk FOREIGN KEY (data_domain) REFERENCES {0}._efs_data_domains (name)
+)
+UNIQUE PRIMARY INDEX (entity_name, data_domain, entity_column);
+"""
+
+EFS_ENTITY_XREF_STAGING = """
+CREATE MULTISET TABLE {0}.{1}
 (
-
-
-
-
+entity_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+entity_column VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+archived_time TIMESTAMP(6)
+)
+NO PRIMARY INDEX ;
+"""

-
-}
+EFS_ENTITY_XREF_TRG = """
+CREATE TRIGGER {0}.{1}
+AFTER DELETE ON {0}.{2}
+REFERENCING OLD AS DeletedRow
+FOR EACH ROW
+INSERT INTO {3}
+VALUES (DeletedRow.entity_name, DeletedRow.data_domain, DeletedRow.entity_column,
+current_timestamp(6)
+);
+"""

-
-
-
-
-
-
-
-
-
-
-
-
-}
+# Table for Data sources.
+
+EFS_DATA_SOURCE = """
+CREATE MULTISET TABLE {0}.{1}
+(
+name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+description VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
+timestamp_column VARCHAR(50) CHARACTER SET LATIN NOT CASESPECIFIC,
+source VARCHAR(5000) CHARACTER SET LATIN NOT CASESPECIFIC,
+creation_time TIMESTAMP(6),
+modified_time TIMESTAMP(6),
+CONSTRAINT data_domain_fk FOREIGN KEY (data_domain) REFERENCES {0}._efs_data_domains (name)
+)
+UNIQUE PRIMARY INDEX (name, data_domain);
+"""

-
-
-columns=", ".join(("DeletedRow.{}".format(col) for col in EFS_FEATURE_GROUP_SPEC["columns"]))
-)
-
-
-# Table for storing the feature names and associated group names.
-EFS_GROUP_FEATURES_SPEC = {
-    "table_name": "_efs_group_features",
-    "columns": {
-        "feature_name": VARCHAR(200),
-        "group_name": VARCHAR(200),
-        "creation_time": TIMESTAMP,
-        "modified_time": TIMESTAMP
-    },
-    "primary_index": ["feature_name", "group_name"],
-    "foreign_keys": [
+EFS_DATA_SOURCE_STAGING = """
+CREATE MULTISET TABLE {0}.{1}
 (
-
-
-
-),
+name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_domain VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+description VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
+timestamp_column VARCHAR(50) CHARACTER SET LATIN NOT CASESPECIFIC,
+source VARCHAR(5000) CHARACTER SET LATIN NOT CASESPECIFIC,
+creation_time TIMESTAMP(6),
+modified_time TIMESTAMP(6),
+archived_time TIMESTAMP(6))
+NO PRIMARY INDEX;
+"""
+
+EFS_DATA_SOURCE_TRG = """
+CREATE TRIGGER {0}.{1}
+AFTER DELETE ON {0}.{2}
+REFERENCING OLD AS DeletedRow
+FOR EACH ROW
+INSERT INTO {3}
+VALUES (DeletedRow.name, DeletedRow.data_domain, DeletedRow.description, DeletedRow.timestamp_column, DeletedRow.source, DeletedRow.creation_time, DeletedRow.modified_time,
+current_timestamp(6)
+);
+"""
+
+# Table for Feature groups.
+
+EFS_FEATURE_GROUP = """
+CREATE MULTISET TABLE {0}.{1}
 (
-
-
-
-
+name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+description VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_source_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+entity_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+creation_time TIMESTAMP(6),
+modified_time TIMESTAMP(6),
+CONSTRAINT data_source_name_fk FOREIGN KEY (data_source_name, data_domain) REFERENCES {0}._efs_data_source (name, data_domain),
+CONSTRAINT entity_fk FOREIGN KEY (entity_name, data_domain) REFERENCES {0}._efs_entity (name, data_domain),
+CONSTRAINT data_domain_fk FOREIGN KEY (data_domain) REFERENCES {0}._efs_data_domains (name)
+)
+UNIQUE PRIMARY INDEX (name, data_domain);
+"""

-
-}
+EFS_FEATURE_GROUP_STAGING = """
+CREATE MULTISET TABLE {0}.{1}
+(
+name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+description VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_source_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+entity_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+creation_time TIMESTAMP(6),
+modified_time TIMESTAMP(6),
+archived_time TIMESTAMP(6))
+NO PRIMARY INDEX ;
+"""
+
+EFS_FEATURE_GROUP_TRG = """
+CREATE TRIGGER {0}.{1}
+AFTER DELETE ON {0}.{2}
+REFERENCING OLD AS DeletedRow
+FOR EACH ROW
+INSERT INTO {3}
+VALUES (DeletedRow.name, DeletedRow.data_domain, DeletedRow.description, DeletedRow.data_source_name, DeletedRow.entity_name, DeletedRow.creation_time, DeletedRow.modified_time,
+current_timestamp(6)
+);
+"""
+
+# Table for feature process.
+EFS_FEATURE_PROCESS = """
+CREATE MULTISET TABLE {0}.{1}
+(
+process_id VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+description VARCHAR(2000) CHARACTER SET LATIN CASESPECIFIC,
+data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+process_type VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_source VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+entity_id VARCHAR(255) CHARACTER SET LATIN CASESPECIFIC,
+feature_names VARCHAR(2000) CHARACTER SET LATIN CASESPECIFIC,
+feature_ids VARCHAR(2000) CHARACTER SET LATIN CASESPECIFIC,
+valid_start TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+valid_end TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+PERIOD FOR ValidPeriod (valid_start, valid_end) AS VALIDTIME)
+PRIMARY INDEX (process_id);
+"""

-EFS_GROUP_FEATURES_STAGING_SPEC = {
-    "table_name": "{}_staging".format(EFS_GROUP_FEATURES_SPEC["table_name"]),
-    "columns": {
-        "feature_name": VARCHAR(200),
-        "group_name": VARCHAR(200),
-        "creation_time": TIMESTAMP,
-        "modified_time": TIMESTAMP,
-        "archived_time": TIMESTAMP
-    },
-    "primary_index": None
-}

-
-
-
-)
+EFS_FEATURE_RUNS = """
+CREATE MULTISET TABLE {0}.{1}
+(
+run_id BIGINT GENERATED ALWAYS AS IDENTITY (START WITH 1 INCREMENT BY 1 MINVALUE 1 NO MAXVALUE NO CYCLE) NOT NULL,
+process_id VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+start_time TIMESTAMP(6),
+end_time TIMESTAMP(6),
+status VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC,
+filter VARCHAR(2000) CHARACTER SET LATIN NOT CASESPECIFIC,
+as_of_start TIMESTAMP(6) WITH TIME ZONE,
+as_of_end TIMESTAMP(6) WITH TIME ZONE,
+failure_reason VARCHAR(2000) CHARACTER SET LATIN CASESPECIFIC)
+UNIQUE PRIMARY INDEX (run_id);
+"""
+
+# Table for storing the features metadata.
+EFS_FEATURES_METADATA = """
+CREATE MULTISET TABLE {0}.{1}
+(
+entity_name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+data_domain VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+feature_id BIGINT NOT NULL,
+table_name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+valid_start TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+valid_end TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+PERIOD FOR ValidPeriod (valid_start, valid_end) AS VALIDTIME)
+PRIMARY INDEX (entity_name);
+"""
+
+EFS_DATASET_CATALOG = """
+CREATE MULTISET TABLE {0}.{1}
+(
+id VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+data_domain VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+entity_name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+database_name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+description VARCHAR(2000) CHARACTER SET LATIN NOT CASESPECIFIC,
+valid_start TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+valid_end TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+PERIOD FOR ValidPeriod (valid_start, valid_end) AS VALIDTIME)
+PRIMARY INDEX (id);
+"""
+
+EFS_DATASET_FEATURES = """
+CREATE MULTISET TABLE {0}.{1}
+(
+dataset_id VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+data_domain VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+feature_id BIGINT,
+feature_name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+feature_version VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+feature_repo VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+feature_view VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+valid_start TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+valid_end TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+PERIOD FOR ValidPeriod (valid_start, valid_end) AS VALIDTIME)
+PRIMARY INDEX (dataset_id);
+"""
+
+EFS_FEATURE_VERSION = """
+CREATE VIEW {}.{} AS
+LOCK ROW FOR ACCESS
+SELECT
+data_domain,
+entity_id,
+trim(NGRAM) AS feature_name,
+PROCESS_ID as feature_version
+FROM NGramSplitter (
+ON (
+SELECT * FROM {}.{}
+) as paragraphs_input
+USING
+TextColumn ('FEATURE_NAMES')
+ConvertToLowerCase ('false')
+Grams ('1')
+Delimiter(',')
+) AS dt;
+"""
+
+# Select the archived records.
+EFS_ARCHIVED_RECORDS = """
+SELECT {},
+CASE WHEN valid_end < current_timestamp then 1 else 0 end as is_archived
+FROM {}
+WHERE {}"""

 # Table to store the version of feature store. This is very important.
 # When teradataml incrementally adds functionality for feature store, this
 # version will be deciding factor whether teradataml should automatically
 # update metadata or not.
-EFS_VERSION_SPEC = {
-    "table_name": "_efs_version",
-    "columns": {
-        "version": VARCHAR(20),
-        "creation_time": TIMESTAMP
-    }
-}

-EFS_VERSION = "
+EFS_VERSION = """
+CREATE MULTISET TABLE {0}.{1} (
+version VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC,
+creation_time TIMESTAMP(6)
+);
+"""
+
+EFS_VERSION_ = "2.0.0"
+
+EFS_DB_COMPONENTS = {
+    "data_domain": "_efs_data_domains",
+    "feature": "_efs_features",
+    "feature_staging": "_efs_features_staging",
+    "feature_trg": "_efs_features_trg",
+    "group_features": "_efs_group_features",
+    "group_features_staging": "_efs_group_features_staging",
+    "group_features_trg": "_efs_group_features_trg",
+    "entity": "_efs_entity",
+    "entity_staging": "_efs_entity_staging",
+    "entity_trg": "_efs_entity_trg",
+    "entity_xref": "_efs_entity_xref",
+    "entity_staging_xref": "_efs_entity_xref_staging",
+    "entity_xref_trg": "_efs_entity_xref_trg",
+    "data_source": "_efs_data_source",
+    "data_source_staging": "_efs_data_source_staging",
+    "data_source_trg": "_efs_data_source_trg",
+    "feature_group": "_efs_feature_group",
+    "feature_group_staging": "_efs_feature_group_staging",
+    "feature_group_trg": "_efs_feature_group_trg",
+    "feature_process": "_efs_feature_process",
+    "feature_runs": "_efs_feature_runs",
+    "feature_metadata": "_efs_features_metadata",
+    "dataset_catalog": "_efs_dataset_catalog",
+    "dataset_features": "_efs_dataset_features",
+    "feature_version": "_efs_feature_version",
+    "version": "_efs_version"
+}


 EFS_TABLES = {
-
-
-
-
-
-
-
-
-
-
-
-
-
+    EFS_DATA_DOMAINS: "_efs_data_domains",
+    EFS_FEATURES: "_efs_features",
+    EFS_FEATURES_STAGING: "_efs_features_staging",
+    EFS_GROUP_FEATURES: "_efs_group_features",
+    EFS_GROUP_FEATURES_STAGING: "_efs_group_features_staging",
+    EFS_ENTITY: "_efs_entity",
+    EFS_ENTITY_STAGING: "_efs_entity_staging",
+    EFS_ENTITY_XREF: "_efs_entity_xref",
+    EFS_ENTITY_XREF_STAGING: "_efs_entity_xref_staging",
+    EFS_DATA_SOURCE: "_efs_data_source",
+    EFS_DATA_SOURCE_STAGING: "_efs_data_source_staging",
+    EFS_FEATURE_GROUP: "_efs_feature_group",
+    EFS_FEATURE_RUNS: "_efs_feature_runs",
+    EFS_FEATURE_GROUP_STAGING: "_efs_feature_group_staging",
+    EFS_FEATURE_PROCESS: "_efs_feature_process",
+    EFS_FEATURES_METADATA: "_efs_features_metadata",
+    EFS_DATASET_CATALOG: "_efs_dataset_catalog",
+    EFS_DATASET_FEATURES: "_efs_dataset_features",
+    EFS_VERSION: "_efs_version"
 }

+EFS_TRIGGERS = {
+    EFS_FEATURES_TRG: "_efs_features_trg",
+    EFS_GROUP_FEATURES_TRG: "_efs_group_features_trg",
+    EFS_ENTITY_TRG: "_efs_entity_trg",
+    EFS_ENTITY_XREF_TRG: "_efs_entity_xref_trg",
+    EFS_DATA_SOURCE_TRG: "_efs_data_source_trg",
+    EFS_FEATURE_GROUP_TRG: "_efs_feature_group_trg"
+}

 class FeatureStatus(Enum):
     ACTIVE = 1
@@ -289,3 +466,193 @@ class FeatureStatus(Enum):
 class FeatureType(Enum):
     CONTINUOUS = 1
     CATEGORICAL = 2
+    NUMERICAL = 3
+
+class ProcessType(Enum):
+    DENORMALIZED_VIEW = 'denormalized view'
+    FEATURE_GROUP = 'feature group'
+    NEW = 'new'
+    EXISTING = 'existing'
+
+
+class ProcessStatus(Enum):
+    NOT_STARTED = 'not started'
+    RUNNING = 'running'
+    COMPLETED = 'completed'
+    FAILED = 'failed'
+
+
+class _FeatureStoreDFContainer:
+    """
+    Utility class for FeatureStore DataFrame operations.
+
+    This class provides static methods for creating and managing DataFrames
+    used across different FeatureStore components, eliminating code duplication
+    and providing a centralized, efficient approach to DataFrame handling.
+    """
+    __df_container = {}
+
+    @staticmethod
+    def get_df(obj_type, repo, data_domain):
+        """
+        DESCRIPTION:
+            Generic static method to create and manage DataFrames for different object types
+            in FeatureStore. Handles joins and special object type processing.
+
+        PARAMETERS:
+            obj_type:
+                Required Argument.
+                Specifies the type of DataFrame to return.
+                Supported types: 'feature', 'feature_staging', 'entity', 'entity_staging',
+                'feature_wog', 'feature_info', 'feature_catalog', 'entity_info', and all
+                other types defined in EFS_DB_COMPONENTS.
+                Types: str
+
+            repo:
+                Required Argument.
+                Specifies the repository name.
+                Types: str
+
+            data_domain:
+                Required Argument.
+                Specifies the data domain for filtering operations.
+                Types: str
+
+        RETURNS:
+            teradataml DataFrame.
+
+        RAISES:
+            TeradataMlException
+
+        EXAMPLES:
+            >>> # Basic DataFrame retrieval
+            >>> df = _FeatureStoreDFContainer.get_df(
+            ...     obj_type='feature',
+            ...     repo='my_repo',
+            ...     data_domain='analytics'
+            ... )
+
+            >>> # Complex join for feature info
+            >>> df = _FeatureStoreDFContainer.get_df(
+            ...     obj_type='feature_info',
+            ...     repo='my_repo',
+            ...     data_domain='analytics'
+            ... )
+        """
+        from teradataml.dataframe.dataframe import DataFrame, in_schema
+        repo_obj = repo + '.' + data_domain + '.' + obj_type
+
+        if repo_obj not in _FeatureStoreDFContainer.__df_container:
+
+            # Handle complex FeatureStore-specific patterns with joins
+            if obj_type in ["feature", "feature_staging"]:
+                # Join features with group_features for group name
+                map_ = {"feature": "group_features", "feature_staging": "group_features_staging"}
+                features = DataFrame(in_schema(repo, EFS_DB_COMPONENTS[obj_type]))
+                features_xref = DataFrame(in_schema(repo, EFS_DB_COMPONENTS[map_[obj_type]]))
+                features = features[features.data_domain == data_domain]
+                features_xref = features_xref[features_xref.feature_data_domain == data_domain].select(["feature_name", "group_name"])
+                df = features.join(features_xref, on=["name==feature_name"], how='left')
+                _FeatureStoreDFContainer.__df_container[repo_obj] = df.select(features.columns + ["group_name"])
+
+            elif obj_type in ["entity", "entity_staging"]:
+                # Join entity with entity_xref for entity columns
+                ent_df = DataFrame(in_schema(repo, EFS_DB_COMPONENTS[obj_type]))
+                xref_df = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["{}_xref".format(obj_type)]))
+                ent_df = ent_df[ent_df.data_domain == data_domain]
+                xref_df = xref_df[xref_df.data_domain == data_domain].select(['entity_name', 'entity_column'])
+                df = ent_df.join(xref_df, on=["name==entity_name"], how="inner")
+                _FeatureStoreDFContainer.__df_container[repo_obj] = df.select(ent_df.columns + ["entity_column"])
+
+            elif obj_type == "feature_wog":
+                # Feature without group - direct access to feature table
+                _FeatureStoreDFContainer.__df_container[repo_obj] = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["feature"]))
+
+            elif obj_type == "feature_info":
+                # join: features + metadata
+                # Use feature_wog (without group)
+                feature = _FeatureStoreDFContainer.get_df('feature_wog', repo, data_domain)
+
+                # Get metadata DataFrame
+                feature_metadata = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["feature_metadata"]))
+
+                # Drop ValidPeriod column if it exists
+                if 'ValidPeriod' in feature_metadata.columns:
+                    feature_metadata = feature_metadata.drop(columns=["ValidPeriod"])
+
+                df = feature_metadata.join(feature,
+                                           how="inner",
+                                           on=[feature_metadata.feature_id == feature.id,
+                                               feature_metadata.data_domain == feature.data_domain,
+                                               feature_metadata.data_domain == data_domain],
+                                           lsuffix="_meta",
+                                           rsuffix="_feat")
+                _FeatureStoreDFContainer.__df_container[repo_obj] = df
+
+            elif obj_type == "feature_catalog":
+                # join: features + metadata + version
+                # Get the required DataFrames directly
+                fv = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["feature_version"]))
+                f_ = _FeatureStoreDFContainer.get_df("feature", repo, data_domain)
+
+                # Feature can be mapped to more than one feature group. So, 'f_' can have duplicate rows
+                # which propagates these duplicates to final result.
+                f_ = f_.drop_duplicate(['id', 'data_domain', 'name'])
+                fm = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["feature_metadata"]))
+
+                ndf = fm.select(['entity_name', 'data_domain', 'feature_id', 'table_name', 'valid_end'])
+                hdf = ndf.join(
+                    f_, on=((f_.id == ndf.feature_id) & (ndf.data_domain == f_.data_domain)),
+                    how='inner',
+                    lprefix='l'
+                ).select(['entity_name', 'data_domain', 'id', 'name', 'table_name', 'valid_end'])
+
+                vdf = hdf.join(fv,
+                               on=(
+                                   (hdf.data_domain == fv.data_domain) &
+                                   (hdf.entity_name == fv.entity_id) &
+                                   (fv.feature_name == hdf.name) &
+                                   (fv.data_domain == data_domain)
+                               ),
+                               how='inner',
+                               lprefix='l'
+                               )
+
+                _FeatureStoreDFContainer.__df_container[repo_obj] = vdf.select(
+                    ['entity_id', 'data_domain', 'id', 'name', 'table_name', 'feature_version', 'valid_end']
+                )
+
+            elif obj_type == "entity_info":
+                # join: entity + entity_xref
+                entity_df = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["entity"]))
+                entity_xref_df = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["entity_xref"]))
+
+                # Build join conditions
+                join_conditions = [
+                    entity_df.name == entity_xref_df.entity_name,
+                    entity_df.data_domain == entity_xref_df.data_domain,
+                    entity_df.data_domain == data_domain
+                ]
+
+                df = entity_df.join(
+                    other=entity_xref_df,
+                    on=join_conditions,
+                    lsuffix="l"
+                )
+
+                _FeatureStoreDFContainer.__df_container[repo_obj] = df.select(
+                    ['entity_name', 'data_domain', 'entity_column', 'description']
+                )
+
+            elif obj_type == 'data_domain':
+                _FeatureStoreDFContainer.__df_container[repo_obj] = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["data_domain"]))
+
+            # Default case: simple DataFrame creation
+            else:
+                df = DataFrame(in_schema(repo, EFS_DB_COMPONENTS[obj_type]))
+                if 'data_domain' in df.columns:
+                    df = df[df.data_domain == data_domain]
+
+                _FeatureStoreDFContainer.__df_container[repo_obj] = df
+
+        return _FeatureStoreDFContainer.__df_container[repo_obj]