teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of teradataml has been flagged as potentially problematic.

Files changed (96)
  1. teradataml/README.md +210 -0
  2. teradataml/__init__.py +1 -1
  3. teradataml/_version.py +1 -1
  4. teradataml/analytics/analytic_function_executor.py +162 -76
  5. teradataml/analytics/byom/__init__.py +1 -1
  6. teradataml/analytics/json_parser/__init__.py +2 -0
  7. teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
  8. teradataml/analytics/json_parser/metadata.py +22 -4
  9. teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
  10. teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
  11. teradataml/analytics/sqle/__init__.py +3 -0
  12. teradataml/analytics/utils.py +4 -1
  13. teradataml/automl/__init__.py +2369 -464
  14. teradataml/automl/autodataprep/__init__.py +15 -0
  15. teradataml/automl/custom_json_utils.py +184 -112
  16. teradataml/automl/data_preparation.py +113 -58
  17. teradataml/automl/data_transformation.py +154 -53
  18. teradataml/automl/feature_engineering.py +113 -53
  19. teradataml/automl/feature_exploration.py +548 -25
  20. teradataml/automl/model_evaluation.py +260 -32
  21. teradataml/automl/model_training.py +399 -206
  22. teradataml/clients/auth_client.py +2 -2
  23. teradataml/common/aed_utils.py +11 -2
  24. teradataml/common/bulk_exposed_utils.py +4 -2
  25. teradataml/common/constants.py +62 -2
  26. teradataml/common/garbagecollector.py +50 -21
  27. teradataml/common/messagecodes.py +47 -2
  28. teradataml/common/messages.py +19 -1
  29. teradataml/common/sqlbundle.py +23 -6
  30. teradataml/common/utils.py +116 -10
  31. teradataml/context/aed_context.py +16 -10
  32. teradataml/data/Employee.csv +5 -0
  33. teradataml/data/Employee_Address.csv +4 -0
  34. teradataml/data/Employee_roles.csv +5 -0
  35. teradataml/data/JulesBelvezeDummyData.csv +100 -0
  36. teradataml/data/byom_example.json +5 -0
  37. teradataml/data/creditcard_data.csv +284618 -0
  38. teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
  39. teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
  40. teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
  41. teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
  42. teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
  43. teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
  44. teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
  45. teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
  46. teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
  47. teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
  48. teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
  49. teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
  50. teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
  51. teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
  52. teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
  53. teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
  54. teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
  55. teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
  56. teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
  57. teradataml/data/load_example_data.py +29 -11
  58. teradataml/data/payment_fraud_dataset.csv +10001 -0
  59. teradataml/data/teradataml_example.json +67 -0
  60. teradataml/dataframe/copy_to.py +714 -54
  61. teradataml/dataframe/dataframe.py +1153 -33
  62. teradataml/dataframe/dataframe_utils.py +8 -3
  63. teradataml/dataframe/functions.py +168 -1
  64. teradataml/dataframe/setop.py +4 -1
  65. teradataml/dataframe/sql.py +141 -9
  66. teradataml/dbutils/dbutils.py +470 -35
  67. teradataml/dbutils/filemgr.py +1 -1
  68. teradataml/hyperparameter_tuner/optimizer.py +456 -142
  69. teradataml/lib/aed_0_1.dll +0 -0
  70. teradataml/lib/libaed_0_1.dylib +0 -0
  71. teradataml/lib/libaed_0_1.so +0 -0
  72. teradataml/lib/libaed_0_1_aarch64.so +0 -0
  73. teradataml/scriptmgmt/UserEnv.py +234 -34
  74. teradataml/scriptmgmt/lls_utils.py +43 -17
  75. teradataml/sdk/_json_parser.py +1 -1
  76. teradataml/sdk/api_client.py +9 -6
  77. teradataml/sdk/modelops/_client.py +3 -0
  78. teradataml/series/series.py +12 -7
  79. teradataml/store/feature_store/constants.py +601 -234
  80. teradataml/store/feature_store/feature_store.py +2886 -616
  81. teradataml/store/feature_store/mind_map.py +639 -0
  82. teradataml/store/feature_store/models.py +5831 -214
  83. teradataml/store/feature_store/utils.py +390 -0
  84. teradataml/table_operators/table_operator_util.py +1 -1
  85. teradataml/table_operators/templates/dataframe_register.template +6 -2
  86. teradataml/table_operators/templates/dataframe_udf.template +6 -2
  87. teradataml/utils/docstring.py +527 -0
  88. teradataml/utils/dtypes.py +93 -0
  89. teradataml/utils/internal_buffer.py +2 -2
  90. teradataml/utils/utils.py +41 -2
  91. teradataml/utils/validators.py +694 -17
  92. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
  93. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
  94. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
  95. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
  96. {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
teradataml/store/feature_store/constants.py
@@ -13,273 +13,450 @@ from enum import Enum
 
  # Template for creating the triggers on
  # corresponding tables.
- _EFS_TRIGGER_TEMPLATE = """
- CREATE TRIGGER {{schema_name}}.{table}_trg
- AFTER DELETE ON {{schema_name}}.{table}
- REFERENCING OLD AS DeletedRow
- FOR EACH ROW
- INSERT INTO {{schema_name}}.{table}_staging
- VALUES ({columns},
-     current_timestamp(6)
- )
+
+ # Tables for storing the data domains.
+ EFS_DATA_DOMAINS="""
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     created_time TIMESTAMP(6)
+ )
+ UNIQUE PRIMARY INDEX (name);
  """
 
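Note: where 20.0.0.6 built these objects from *_SPEC dictionaries of SQLAlchemy types, 20.0.0.7 ships raw DDL templates in which {0} is the repository (schema) name and {1} is the object name. A minimal sketch of how such a template would be materialized, assuming a connected teradataml session and a hypothetical repository schema "repo_db" (the package's actual call sites are not shown in this diff):

    # Fill the schema/table placeholders of the template and run the DDL.
    from teradataml import create_context, execute_sql
    from teradataml.store.feature_store.constants import EFS_DATA_DOMAINS

    create_context(host="<host>", username="<user>", password="<pwd>")
    ddl = EFS_DATA_DOMAINS.format("repo_db", "_efs_data_domains")
    execute_sql(ddl)  # CREATE MULTISET TABLE repo_db._efs_data_domains (...)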
- # Table for storing the features.
- EFS_FEATURES_SPEC = {
-     "table_name": "_efs_features",
-     "columns": {
-         "name": VARCHAR(200),
-         "column_name": VARCHAR(200),
-         "description": VARCHAR(1024),
-         "tags": VARCHAR(2000),
-         "data_type": VARCHAR(1024),
-         "feature_type": VARCHAR(100),
-         "status": VARCHAR(100),
-         "creation_time": TIMESTAMP,
-         "modified_time": TIMESTAMP
-     },
-     "primary_index": "name"
- }
 
- # Table for storing the features.
- EFS_FEATURES_STAGING_SPEC = {
-     "table_name": "{}_staging".format(EFS_FEATURES_SPEC["table_name"]),
-     "columns": {
-         "name": VARCHAR(200),
-         "column_name": VARCHAR(200),
-         "description": VARCHAR(1024),
-         "tags": VARCHAR(2000),
-         "data_type": VARCHAR(1024),
-         "feature_type": VARCHAR(100),
-         "status": VARCHAR(100),
-         "creation_time": TIMESTAMP,
-         "modified_time": TIMESTAMP,
-         "archived_time": TIMESTAMP
-     },
-     "primary_index": None
- }
+ # Tables for storing the features.
+ EFS_FEATURES = """
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     id INTEGER,
+     name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     column_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     description VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
+     tags VARCHAR(2000) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_type VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
+     feature_type VARCHAR(100) CHARACTER SET LATIN NOT CASESPECIFIC,
+     status VARCHAR(100) CHARACTER SET LATIN NOT CASESPECIFIC,
+     creation_time TIMESTAMP(6),
+     modified_time TIMESTAMP(6),
+     CONSTRAINT data_domain_fk FOREIGN KEY (data_domain) REFERENCES _efs_data_domains (name)
+ )
+ UNIQUE PRIMARY INDEX (name, data_domain)
+ UNIQUE INDEX (id);
+ """
 
- EFS_FEATURES_TRG = _EFS_TRIGGER_TEMPLATE.format(
-     table=EFS_FEATURES_SPEC["table_name"],
-     columns=", ".join(("DeletedRow.{}".format(col) for col in EFS_FEATURES_SPEC["columns"]))
- )
-
- # Table for storing the entities. Every Dataset has column(s) that are unique.
- # This table holds all such columns.
- EFS_ENTITY_SPEC = {
-     "table_name": "_efs_entity",
-     "columns": {
-         "name": VARCHAR(200),
-         "description": VARCHAR(200),
-         "creation_time": TIMESTAMP,
-         "modified_time": TIMESTAMP
-     },
-     "primary_index": ["name"]
- }
+ EFS_FEATURES_STAGING="""
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     id INTEGER,
+     name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     column_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     description VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
+     tags VARCHAR(2000) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_type VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
+     feature_type VARCHAR(100) CHARACTER SET LATIN NOT CASESPECIFIC,
+     status VARCHAR(100) CHARACTER SET LATIN NOT CASESPECIFIC,
+     creation_time TIMESTAMP(6),
+     modified_time TIMESTAMP(6),
+     archived_time TIMESTAMP(6)
+ )
+ NO PRIMARY INDEX ;
+ """
 
- EFS_ENTITY_STAGING_SPEC = {
-     "table_name": "{}_staging".format(EFS_ENTITY_SPEC["table_name"]),
-     "columns": {
-         "name": VARCHAR(200),
-         "description": VARCHAR(200),
-         "creation_time": TIMESTAMP,
-         "modified_time": TIMESTAMP,
-         "archived_time": TIMESTAMP
-     },
-     "primary_index": None
- }
+ EFS_FEATURES_TRG="""
+ CREATE TRIGGER {0}.{1}
+ AFTER DELETE ON {0}.{2}
+ REFERENCING OLD AS DeletedRow
+ FOR EACH ROW
+ INSERT INTO {3}
+ VALUES (DeletedRow.id, DeletedRow.name, DeletedRow.data_domain, DeletedRow.column_name, DeletedRow.description, DeletedRow.tags, DeletedRow.data_type, DeletedRow.feature_type, DeletedRow.status, DeletedRow.creation_time, DeletedRow.modified_time,
+     current_timestamp(6)
+ );
+ """
 
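The trigger DDL now spells out its column list instead of deriving it from the removed _EFS_TRIGGER_TEMPLATE. Each trigger template takes four positional arguments: {0} schema, {1} trigger name, {2} watched table, {3} staging target, and copies the deleted row plus current_timestamp(6) (the archive time) into staging. A sketch with assumed names, continuing the session from the earlier example:

    # On DELETE from repo_db._efs_features, the old row is preserved in the
    # staging table with its archived_time set by current_timestamp(6).
    execute_sql(EFS_FEATURES_TRG.format(
        "repo_db",                        # {0}: schema
        "_efs_features_trg",              # {1}: trigger name
        "_efs_features",                  # {2}: watched table
        "repo_db._efs_features_staging"   # {3}: staging target
    ))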
- EFS_ENTITY_TRG = _EFS_TRIGGER_TEMPLATE.format(
-     table=EFS_ENTITY_SPEC["table_name"],
-     columns=", ".join(("DeletedRow.{}".format(col) for col in EFS_ENTITY_SPEC["columns"]))
- )
-
- EFS_ENTITY_XREF_SPEC = {
-     "table_name": "_efs_entity_xref",
-     "columns": {
-         "entity_name": VARCHAR(200),
-         "entity_column": VARCHAR(200)
-     },
-     "primary_index": ["entity_name", "entity_column"],
-     "foreign_keys": [
+ EFS_GROUP_FEATURES = """
+ CREATE MULTISET TABLE {0}.{1}
  (
-         ["entity_name"],
-         ["{}.name".format(EFS_ENTITY_SPEC["table_name"])],
-         "entity_xref_fk"
+     feature_name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     feature_data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     group_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     group_data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     creation_time TIMESTAMP(6),
+     modified_time TIMESTAMP(6),
+     CONSTRAINT feature_name_fk FOREIGN KEY (feature_name, feature_data_domain) REFERENCES {0}._efs_features (name, data_domain),
+     CONSTRAINT group_name_fk FOREIGN KEY (group_name, group_data_domain) REFERENCES {0}._efs_feature_group (name, data_domain),
+     CONSTRAINT data_domain_fk1 FOREIGN KEY (feature_data_domain) REFERENCES {0}._efs_data_domains (name),
+     CONSTRAINT data_domain_fk2 FOREIGN KEY (group_data_domain) REFERENCES {0}._efs_data_domains (name)
  )
-     ]
- }
+ UNIQUE PRIMARY INDEX (feature_name, feature_data_domain, group_name, group_data_domain);
+ """
 
- EFS_ENTITY_XREF_STAGING_SPEC = {
-     "table_name": "{}_staging".format(EFS_ENTITY_XREF_SPEC["table_name"]),
-     "columns": {
-         "entity_name": VARCHAR(200),
-         "entity_column": VARCHAR(200),
-         "archived_time": TIMESTAMP
-     },
-     "primary_index": None
- }
+ EFS_GROUP_FEATURES_STAGING = """
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     feature_name VARCHAR(255),
+     feature_data_domain VARCHAR(255),
+     group_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     group_data_domain VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     creation_time TIMESTAMP(6),
+     modified_time TIMESTAMP(6),
+     archived_time TIMESTAMP(6)
+ )
+ NO PRIMARY INDEX ;
+ """
 
- EFS_ENTITY_XREF_TRG = _EFS_TRIGGER_TEMPLATE.format(
-     table=EFS_ENTITY_XREF_SPEC["table_name"],
-     columns=", ".join(("DeletedRow.{}".format(col) for col in EFS_ENTITY_XREF_SPEC["columns"]))
- )
-
- # Table for storing the Data sources. Column source stores
- # the corresponding Query.
- EFS_DATA_SOURCE_SPEC = {
-     "table_name": "_efs_data_source",
-     "columns": {
-         "name": VARCHAR(200),
-         "description": VARCHAR(1024),
-         "timestamp_col_name": VARCHAR(50),
-         "source": VARCHAR(5000),
-         "creation_time": TIMESTAMP,
-         "modified_time": TIMESTAMP
-     },
-     "primary_index": "name"
- }
+ EFS_GROUP_FEATURES_TRG = """
+ CREATE TRIGGER {0}.{1}
+ AFTER DELETE ON {0}.{2}
+ REFERENCING OLD AS DeletedRow
+ FOR EACH ROW
+ INSERT INTO {3}
+ VALUES (DeletedRow.feature_name, DeletedRow.feature_data_domain, DeletedRow.group_name, DeletedRow.group_data_domain, DeletedRow.creation_time, DeletedRow.modified_time,
+     current_timestamp(6)
+ );
+ """
+
+ # Tables for Entities.
+
+ EFS_ENTITY = """
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     description VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     creation_time TIMESTAMP(6),
+     modified_time TIMESTAMP(6),
+     CONSTRAINT data_domain_fk FOREIGN KEY (data_domain) REFERENCES {0}._efs_data_domains (name)
+ )
+ UNIQUE PRIMARY INDEX (name, data_domain);
+ """
 
- EFS_DATA_SOURCE_STAGING_SPEC = {
-     "table_name": "{}_staging".format(EFS_DATA_SOURCE_SPEC["table_name"]),
-     "columns": {
-         "name": VARCHAR(200),
-         "description": VARCHAR(1024),
-         "timestamp_col_name": VARCHAR(50),
-         "source": VARCHAR(5000),
-         "creation_time": TIMESTAMP,
-         "modified_time": TIMESTAMP,
-         "archived_time": TIMESTAMP
-     },
-     "primary_index": None
- }
+ EFS_ENTITY_STAGING= """
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     description VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     creation_time TIMESTAMP(6),
+     modified_time TIMESTAMP(6),
+     archived_time TIMESTAMP(6))
+ NO PRIMARY INDEX ;
+ """
+
+ EFS_ENTITY_TRG = """
+ CREATE TRIGGER {0}.{1}
+ AFTER DELETE ON {0}.{2}
+ REFERENCING OLD AS DeletedRow
+ FOR EACH ROW
+ INSERT INTO {3}
+ VALUES (DeletedRow.name, DeletedRow.data_domain, DeletedRow.description, DeletedRow.creation_time, DeletedRow.modified_time,
+     current_timestamp(6)
+ );
+ """
 
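As in 20.0.0.6, an entity's key columns live in a separate cross-reference table (defined just below): one _efs_entity row per entity, one _efs_entity_xref row per key column. Illustrative inserts with made-up values, again via the assumed session:

    # A composite two-column key becomes one entity row plus two xref rows.
    execute_sql("INSERT INTO repo_db._efs_entity VALUES "
                "('txn', 'sales', 'transaction key', "
                "current_timestamp(6), current_timestamp(6));")
    execute_sql("INSERT INTO repo_db._efs_entity_xref VALUES ('txn', 'sales', 'store_id');")
    execute_sql("INSERT INTO repo_db._efs_entity_xref VALUES ('txn', 'sales', 'txn_id');")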
- EFS_DATA_SOURCE_TRG = _EFS_TRIGGER_TEMPLATE.format(
-     table=EFS_DATA_SOURCE_SPEC["table_name"],
-     columns=", ".join(("DeletedRow.{}".format(col) for col in EFS_DATA_SOURCE_SPEC["columns"]))
- )
-
- # Table for storing the feature groups. This table holds all the required
- # parameters for creating DataFrame.
- EFS_FEATURE_GROUP_SPEC = {
-     "table_name": "_efs_feature_group",
-     "columns": {
-         "name": VARCHAR(200),
-         "description": VARCHAR(200),
-         "data_source_name": VARCHAR(200),
-         "entity_name": VARCHAR(200),
-         "creation_time": TIMESTAMP,
-         "modified_time": TIMESTAMP
-     },
-     "primary_index": "name",
-     "foreign_keys": [
+ EFS_ENTITY_XREF= """
+ CREATE MULTISET TABLE {0}.{1}
  (
-         ["data_source_name"],
-         ["{}.name".format(EFS_DATA_SOURCE_SPEC["table_name"])],
-         "data_source_name_fk"
-     ),
+     entity_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     entity_column VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     CONSTRAINT entity_xref_fk FOREIGN KEY (entity_name, data_domain) REFERENCES {0}._efs_entity (name, data_domain),
+     CONSTRAINT data_domain_fk FOREIGN KEY (data_domain) REFERENCES {0}._efs_data_domains (name)
+ )
+ UNIQUE PRIMARY INDEX (entity_name, data_domain, entity_column);
+ """
+
+ EFS_ENTITY_XREF_STAGING = """
+ CREATE MULTISET TABLE {0}.{1}
  (
-         ["entity_name"],
-         ["{}.name".format(EFS_ENTITY_SPEC["table_name"])],
-         "entity_fk"
-     )
+     entity_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     entity_column VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     archived_time TIMESTAMP(6)
+ )
+ NO PRIMARY INDEX ;
+ """
 
-     ]
- }
+ EFS_ENTITY_XREF_TRG = """
+ CREATE TRIGGER {0}.{1}
+ AFTER DELETE ON {0}.{2}
+ REFERENCING OLD AS DeletedRow
+ FOR EACH ROW
+ INSERT INTO {3}
+ VALUES (DeletedRow.entity_name, DeletedRow.data_domain, DeletedRow.entity_column,
+     current_timestamp(6)
+ );
+ """
 
- EFS_FEATURE_GROUP_STAGING_SPEC = {
-     "table_name": "{}_staging".format(EFS_FEATURE_GROUP_SPEC["table_name"]),
-     "columns": {
-         "name": VARCHAR(200),
-         "description": VARCHAR(200),
-         "data_source_name": VARCHAR(200),
-         "entity_name": VARCHAR(200),
-         "creation_time": TIMESTAMP,
-         "modified_time": TIMESTAMP,
-         "archived_time": TIMESTAMP
-     },
-     "primary_index": None
- }
+ # Table for Data sources.
+
+ EFS_DATA_SOURCE = """
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     description VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
+     timestamp_column VARCHAR(50) CHARACTER SET LATIN NOT CASESPECIFIC,
+     source VARCHAR(5000) CHARACTER SET LATIN NOT CASESPECIFIC,
+     creation_time TIMESTAMP(6),
+     modified_time TIMESTAMP(6),
+     CONSTRAINT data_domain_fk FOREIGN KEY (data_domain) REFERENCES {0}._efs_data_domains (name)
+ )
+ UNIQUE PRIMARY INDEX (name, data_domain);
+ """
 
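As the removed 20.0.0.6 comment put it, the source column stores the corresponding query; timestamp_column (renamed from timestamp_col_name) names the event-time column of that query. An illustrative row, with made-up values:

    # A data source is a named query plus the column used for time-based reads.
    execute_sql(
        "INSERT INTO repo_db._efs_data_source VALUES "
        "('sales_ds', 'sales', 'daily sales extract', 'txn_ts', "
        "'SELECT * FROM sales_db.transactions', "
        "current_timestamp(6), current_timestamp(6));"
    )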
- EFS_FEATURE_GROUP_TRG = _EFS_TRIGGER_TEMPLATE.format(
-     table=EFS_FEATURE_GROUP_SPEC["table_name"],
-     columns=", ".join(("DeletedRow.{}".format(col) for col in EFS_FEATURE_GROUP_SPEC["columns"]))
- )
-
-
- # Table for storing the feature names and associated group names.
- EFS_GROUP_FEATURES_SPEC = {
-     "table_name": "_efs_group_features",
-     "columns": {
-         "feature_name": VARCHAR(200),
-         "group_name": VARCHAR(200),
-         "creation_time": TIMESTAMP,
-         "modified_time": TIMESTAMP
-     },
-     "primary_index": ["feature_name", "group_name"],
-     "foreign_keys": [
+ EFS_DATA_SOURCE_STAGING = """
+ CREATE MULTISET TABLE {0}.{1}
  (
-         ["feature_name"],
-         ["{}.name".format(EFS_FEATURES_SPEC["table_name"])],
-         "feature_name_fk"
-     ),
+     name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_domain VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     description VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
+     timestamp_column VARCHAR(50) CHARACTER SET LATIN NOT CASESPECIFIC,
+     source VARCHAR(5000) CHARACTER SET LATIN NOT CASESPECIFIC,
+     creation_time TIMESTAMP(6),
+     modified_time TIMESTAMP(6),
+     archived_time TIMESTAMP(6))
+ NO PRIMARY INDEX;
+ """
+
+ EFS_DATA_SOURCE_TRG = """
+ CREATE TRIGGER {0}.{1}
+ AFTER DELETE ON {0}.{2}
+ REFERENCING OLD AS DeletedRow
+ FOR EACH ROW
+ INSERT INTO {3}
+ VALUES (DeletedRow.name, DeletedRow.data_domain, DeletedRow.description, DeletedRow.timestamp_column, DeletedRow.source, DeletedRow.creation_time, DeletedRow.modified_time,
+     current_timestamp(6)
+ );
+ """
+
+ # Table for Feature groups.
+
+ EFS_FEATURE_GROUP = """
+ CREATE MULTISET TABLE {0}.{1}
  (
-         ["group_name"],
-         ["{}.name".format(EFS_FEATURE_GROUP_SPEC["table_name"])],
-         "group_name_fk"
-     )
+     name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     description VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_source_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     entity_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     creation_time TIMESTAMP(6),
+     modified_time TIMESTAMP(6),
+     CONSTRAINT data_source_name_fk FOREIGN KEY (data_source_name, data_domain) REFERENCES {0}._efs_data_source (name, data_domain),
+     CONSTRAINT entity_fk FOREIGN KEY (entity_name, data_domain) REFERENCES {0}._efs_entity (name, data_domain),
+     CONSTRAINT data_domain_fk FOREIGN KEY (data_domain) REFERENCES {0}._efs_data_domains (name)
+ )
+ UNIQUE PRIMARY INDEX (name, data_domain);
+ """
 
-     ]
- }
+ EFS_FEATURE_GROUP_STAGING = """
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     description VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_source_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     entity_name VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     creation_time TIMESTAMP(6),
+     modified_time TIMESTAMP(6),
+     archived_time TIMESTAMP(6))
+ NO PRIMARY INDEX ;
+ """
+
+ EFS_FEATURE_GROUP_TRG = """
+ CREATE TRIGGER {0}.{1}
+ AFTER DELETE ON {0}.{2}
+ REFERENCING OLD AS DeletedRow
+ FOR EACH ROW
+ INSERT INTO {3}
+ VALUES (DeletedRow.name, DeletedRow.data_domain, DeletedRow.description, DeletedRow.data_source_name, DeletedRow.entity_name, DeletedRow.creation_time, DeletedRow.modified_time,
+     current_timestamp(6)
+ );
+ """
+
+ # Table for feature process.
+ EFS_FEATURE_PROCESS = """
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     process_id VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     description VARCHAR(2000) CHARACTER SET LATIN CASESPECIFIC,
+     data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     process_type VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_source VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     entity_id VARCHAR(255) CHARACTER SET LATIN CASESPECIFIC,
+     feature_names VARCHAR(2000) CHARACTER SET LATIN CASESPECIFIC,
+     feature_ids VARCHAR(2000) CHARACTER SET LATIN CASESPECIFIC,
+     valid_start TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+     valid_end TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+     PERIOD FOR ValidPeriod (valid_start, valid_end) AS VALIDTIME)
+ PRIMARY INDEX (process_id);
+ """
 
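_efs_feature_process is a Teradata temporal table: PERIOD FOR ValidPeriod ... AS VALIDTIME makes (valid_start, valid_end) the valid-time dimension, so process rows can be read as of a past instant. Roughly (schema name still assumed, and the temporal qualifier shown is standard Teradata syntax rather than anything specific to this package):

    # A plain SELECT sees only currently valid rows; an AS OF qualifier pins
    # the read to the given instant within each row's ValidPeriod.
    execute_sql(
        "VALIDTIME AS OF TIMESTAMP '2025-01-01 00:00:00' "
        "SELECT process_id, feature_names FROM repo_db._efs_feature_process;"
    )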
- EFS_GROUP_FEATURES_STAGING_SPEC = {
-     "table_name": "{}_staging".format(EFS_GROUP_FEATURES_SPEC["table_name"]),
-     "columns": {
-         "feature_name": VARCHAR(200),
-         "group_name": VARCHAR(200),
-         "creation_time": TIMESTAMP,
-         "modified_time": TIMESTAMP,
-         "archived_time": TIMESTAMP
-     },
-     "primary_index": None
- }
 
- EFS_GROUP_FEATURES_TRG = _EFS_TRIGGER_TEMPLATE.format(
-     table=EFS_GROUP_FEATURES_SPEC["table_name"],
-     columns=", ".join(("DeletedRow.{}".format(col) for col in EFS_GROUP_FEATURES_SPEC["columns"]))
- )
+ EFS_FEATURE_RUNS = """
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     run_id BIGINT GENERATED ALWAYS AS IDENTITY (START WITH 1 INCREMENT BY 1 MINVALUE 1 NO MAXVALUE NO CYCLE) NOT NULL,
+     process_id VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     data_domain VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     start_time TIMESTAMP(6),
+     end_time TIMESTAMP(6),
+     status VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC,
+     filter VARCHAR(2000) CHARACTER SET LATIN NOT CASESPECIFIC,
+     as_of_start TIMESTAMP(6) WITH TIME ZONE,
+     as_of_end TIMESTAMP(6) WITH TIME ZONE,
+     failure_reason VARCHAR(2000) CHARACTER SET LATIN CASESPECIFIC)
+ UNIQUE PRIMARY INDEX (run_id);
+ """
+
+ # Table for storing the features metadata.
+ EFS_FEATURES_METADATA = """
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     entity_name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     data_domain VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     feature_id BIGINT NOT NULL,
+     table_name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
+     valid_start TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+     valid_end TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+     PERIOD FOR ValidPeriod (valid_start, valid_end) AS VALIDTIME)
+ PRIMARY INDEX (entity_name);
+ """
+
+ EFS_DATASET_CATALOG = """
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     id VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     data_domain VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     entity_name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     database_name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     description VARCHAR(2000) CHARACTER SET LATIN NOT CASESPECIFIC,
+     valid_start TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+     valid_end TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+     PERIOD FOR ValidPeriod (valid_start, valid_end) AS VALIDTIME)
+ PRIMARY INDEX (id);
+ """
+
+ EFS_DATASET_FEATURES = """
+ CREATE MULTISET TABLE {0}.{1}
+ (
+     dataset_id VARCHAR(36) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     data_domain VARCHAR(200) CHARACTER SET LATIN NOT CASESPECIFIC,
+     feature_id BIGINT,
+     feature_name VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     feature_version VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     feature_repo VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     feature_view VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     valid_start TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+     valid_end TIMESTAMP(6) WITH TIME ZONE NOT NULL,
+     PERIOD FOR ValidPeriod (valid_start, valid_end) AS VALIDTIME)
+ PRIMARY INDEX (dataset_id);
+ """
+
+ EFS_FEATURE_VERSION = """
+ CREATE VIEW {}.{} AS
+ LOCK ROW FOR ACCESS
+ SELECT
+     data_domain,
+     entity_id,
+     trim(NGRAM) AS feature_name,
+     PROCESS_ID as feature_version
+ FROM NGramSplitter (
+     ON (
+         SELECT * FROM {}.{}
+     ) as paragraphs_input
+     USING
+     TextColumn ('FEATURE_NAMES')
+     ConvertToLowerCase ('false')
+     Grams ('1')
+     Delimiter(',')
+ ) AS dt;
+ """
+
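EFS_FEATURE_VERSION defines a view that uses the NGramSplitter SQLE function with Grams('1') and Delimiter(',') to explode the comma-separated FEATURE_NAMES column into one row per feature name, with PROCESS_ID serving as the feature version. The template has four anonymous {} placeholders; pairing them as view schema/name and source schema/table is an assumption, and the source table is presumably _efs_feature_process, which carries all four referenced columns:

    # Hypothetical instantiation of the feature-version view.
    execute_sql(EFS_FEATURE_VERSION.format(
        "repo_db", "_efs_feature_version",   # view {}.{}
        "repo_db", "_efs_feature_process"    # source {}.{}
    ))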
+ # Select the archived records.
+ EFS_ARCHIVED_RECORDS = """
+ SELECT {},
+     CASE WHEN valid_end < current_timestamp then 1 else 0 end as is_archived
+ FROM {}
+ WHERE {}"""
 
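EFS_ARCHIVED_RECORDS takes three placeholders: a projected column list, a table, and a WHERE predicate; any row whose valid_end already lies in the past comes back with is_archived = 1. An illustrative fill-in (names are assumptions):

    # Flag past-valid_end rows of the process table as archived.
    sql = EFS_ARCHIVED_RECORDS.format(
        "process_id, data_domain",        # {0}: columns to project
        "repo_db._efs_feature_process",   # {1}: table to scan
        "data_domain = 'sales'"           # {2}: WHERE predicate
    )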
  # Table to store the version of feature store. This is very important.
  # When teradataml incrementally adds functionality for feature store, this
  # version will be deciding factor whether teradataml should automatically
  # update metadata or not.
- EFS_VERSION_SPEC = {
-     "table_name": "_efs_version",
-     "columns": {
-         "version": VARCHAR(20),
-         "creation_time": TIMESTAMP
-     }
- }
 
- EFS_VERSION = "1.0.0"
+ EFS_VERSION = """
+ CREATE MULTISET TABLE {0}.{1} (
+     version VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC,
+     creation_time TIMESTAMP(6)
+ );
+ """
+
+ EFS_VERSION_ = "2.0.0"
+
+ EFS_DB_COMPONENTS = {
+     "data_domain": "_efs_data_domains",
+     "feature": "_efs_features",
+     "feature_staging": "_efs_features_staging",
+     "feature_trg": "_efs_features_trg",
+     "group_features": "_efs_group_features",
+     "group_features_staging": "_efs_group_features_staging",
+     "group_features_trg": "_efs_group_features_trg",
+     "entity": "_efs_entity",
+     "entity_staging": "_efs_entity_staging",
+     "entity_trg": "_efs_entity_trg",
+     "entity_xref": "_efs_entity_xref",
+     "entity_staging_xref": "_efs_entity_xref_staging",
+     "entity_xref_trg": "_efs_entity_xref_trg",
+     "data_source": "_efs_data_source",
+     "data_source_staging": "_efs_data_source_staging",
+     "data_source_trg": "_efs_data_source_trg",
+     "feature_group": "_efs_feature_group",
+     "feature_group_staging": "_efs_feature_group_staging",
+     "feature_group_trg": "_efs_feature_group_trg",
+     "feature_process": "_efs_feature_process",
+     "feature_runs": "_efs_feature_runs",
+     "feature_metadata": "_efs_features_metadata",
+     "dataset_catalog": "_efs_dataset_catalog",
+     "dataset_features": "_efs_dataset_features",
+     "feature_version": "_efs_feature_version",
+     "version": "_efs_version"
+ }
 
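EFS_DB_COMPONENTS resolves a logical component key to its physical object name; the new _FeatureStoreDFContainer further down uses it to open DataFrames without hard-coding table names. For example:

    from teradataml.store.feature_store.constants import EFS_DB_COMPONENTS

    EFS_DB_COMPONENTS["feature"]          # '_efs_features'
    EFS_DB_COMPONENTS["feature_staging"]  # '_efs_features_staging'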
 
  EFS_TABLES = {
-     "feature": EFS_FEATURES_SPEC["table_name"],
-     "feature_staging": EFS_FEATURES_STAGING_SPEC["table_name"],
-     "feature_group": EFS_FEATURE_GROUP_SPEC["table_name"],
-     "feature_group_staging": EFS_FEATURE_GROUP_STAGING_SPEC["table_name"],
-     "entity": EFS_ENTITY_SPEC["table_name"],
-     "entity_staging": EFS_ENTITY_STAGING_SPEC["table_name"],
-     "entity_xref": EFS_ENTITY_XREF_SPEC["table_name"],
-     "entity_staging_xref": EFS_ENTITY_XREF_STAGING_SPEC["table_name"],
-     "data_source": EFS_DATA_SOURCE_SPEC["table_name"],
-     "data_source_staging": EFS_DATA_SOURCE_STAGING_SPEC["table_name"],
-     "group_features": EFS_GROUP_FEATURES_SPEC["table_name"],
-     "group_features_staging": EFS_GROUP_FEATURES_STAGING_SPEC["table_name"],
-     "version": EFS_VERSION_SPEC["table_name"]
+     EFS_DATA_DOMAINS: "_efs_data_domains",
+     EFS_FEATURES: "_efs_features",
+     EFS_FEATURES_STAGING: "_efs_features_staging",
+     EFS_GROUP_FEATURES: "_efs_group_features",
+     EFS_GROUP_FEATURES_STAGING: "_efs_group_features_staging",
+     EFS_ENTITY: "_efs_entity",
+     EFS_ENTITY_STAGING: "_efs_entity_staging",
+     EFS_ENTITY_XREF: "_efs_entity_xref",
+     EFS_ENTITY_XREF_STAGING: "_efs_entity_xref_staging",
+     EFS_DATA_SOURCE: "_efs_data_source",
+     EFS_DATA_SOURCE_STAGING: "_efs_data_source_staging",
+     EFS_FEATURE_GROUP: "_efs_feature_group",
+     EFS_FEATURE_RUNS: "_efs_feature_runs",
+     EFS_FEATURE_GROUP_STAGING: "_efs_feature_group_staging",
+     EFS_FEATURE_PROCESS: "_efs_feature_process",
+     EFS_FEATURES_METADATA: "_efs_features_metadata",
+     EFS_DATASET_CATALOG: "_efs_dataset_catalog",
+     EFS_DATASET_FEATURES: "_efs_dataset_features",
+     EFS_VERSION: "_efs_version"
  }
 
+ EFS_TRIGGERS = {
+     EFS_FEATURES_TRG: "_efs_features_trg",
+     EFS_GROUP_FEATURES_TRG: "_efs_group_features_trg",
+     EFS_ENTITY_TRG: "_efs_entity_trg",
+     EFS_ENTITY_XREF_TRG: "_efs_entity_xref_trg",
+     EFS_DATA_SOURCE_TRG: "_efs_data_source_trg",
+     EFS_FEATURE_GROUP_TRG: "_efs_feature_group_trg"
+ }
 
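Note the inversion relative to 20.0.0.6: EFS_TABLES (and the new EFS_TRIGGERS) now map each DDL template to the object name it creates, so a repository can in principle be bootstrapped with one loop. A sketch under the same assumptions as above; the "_trg"-suffix convention used to derive the watched and staging tables is inferred, and a real bootstrap would also have to order the table DDL to satisfy the foreign keys (e.g. _efs_group_features references _efs_feature_group):

    # Create every metadata table, then every delete-archival trigger.
    for ddl, table_name in EFS_TABLES.items():
        execute_sql(ddl.format("repo_db", table_name))

    for ddl, trg_name in EFS_TRIGGERS.items():
        base = trg_name[:-len("_trg")]    # e.g. "_efs_features"
        execute_sql(ddl.format("repo_db", trg_name, base,
                               "repo_db.{}_staging".format(base)))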
  class FeatureStatus(Enum):
      ACTIVE = 1
@@ -289,3 +466,193 @@ class FeatureStatus(Enum):
  class FeatureType(Enum):
      CONTINUOUS = 1
      CATEGORICAL = 2
+     NUMERICAL = 3
+
+ class ProcessType(Enum):
+     DENORMALIZED_VIEW = 'denormalized view'
+     FEATURE_GROUP = 'feature group'
+     NEW = 'new'
+     EXISTING = 'existing'
+
+
+ class ProcessStatus(Enum):
+     NOT_STARTED = 'not started'
+     RUNNING = 'running'
+     COMPLETED = 'completed'
+     FAILED = 'failed'
+
+
+ class _FeatureStoreDFContainer:
+     """
+     Utility class for FeatureStore DataFrame operations.
+
+     This class provides static methods for creating and managing DataFrames
+     used across different FeatureStore components, eliminating code duplication
+     and providing a centralized, efficient approach to DataFrame handling.
+     """
+     __df_container = {}
+
+     @staticmethod
+     def get_df(obj_type, repo, data_domain):
+         """
+         DESCRIPTION:
+             Generic static method to create and manage DataFrames for different object types
+             in FeatureStore. Handles joins and special object type processing.
+
+         PARAMETERS:
+             obj_type:
+                 Required Argument.
+                 Specifies the type of DataFrame to return.
+                 Supported types: 'feature', 'feature_staging', 'entity', 'entity_staging',
+                 'feature_wog', 'feature_info', 'feature_catalog', 'entity_info', and all
+                 other types defined in EFS_DB_COMPONENTS.
+                 Types: str
+
+             repo:
+                 Required Argument.
+                 Specifies the repository name.
+                 Types: str
+
+             data_domain:
+                 Required Argument.
+                 Specifies the data domain for filtering operations.
+                 Types: str
+
+         RETURNS:
+             teradataml DataFrame.
+
+         RAISES:
+             TeradataMlException
+
+         EXAMPLES:
+             >>> # Basic DataFrame retrieval
+             >>> df = _FeatureStoreDFContainer.get_df(
+             ...     obj_type='feature',
+             ...     repo='my_repo',
+             ...     data_domain='analytics'
+             ... )
+
+             >>> # Complex join for feature info
+             >>> df = _FeatureStoreDFContainer.get_df(
+             ...     obj_type='feature_info',
+             ...     repo='my_repo',
+             ...     data_domain='analytics'
+             ... )
+         """
+         from teradataml.dataframe.dataframe import DataFrame, in_schema
+         repo_obj = repo + '.' + data_domain + '.' + obj_type
+
+         if repo_obj not in _FeatureStoreDFContainer.__df_container:
+
+             # Handle complex FeatureStore-specific patterns with joins
+             if obj_type in ["feature", "feature_staging"]:
+                 # Join features with group_features for group name
+                 map_ = {"feature": "group_features", "feature_staging": "group_features_staging"}
+                 features = DataFrame(in_schema(repo, EFS_DB_COMPONENTS[obj_type]))
+                 features_xref = DataFrame(in_schema(repo, EFS_DB_COMPONENTS[map_[obj_type]]))
+                 features = features[features.data_domain == data_domain]
+                 features_xref = features_xref[features_xref.feature_data_domain == data_domain].select(["feature_name", "group_name"])
+                 df = features.join(features_xref, on=["name==feature_name"], how='left')
+                 _FeatureStoreDFContainer.__df_container[repo_obj] = df.select(features.columns + ["group_name"])
+
+             elif obj_type in ["entity", "entity_staging"]:
+                 # Join entity with entity_xref for entity columns
+                 ent_df = DataFrame(in_schema(repo, EFS_DB_COMPONENTS[obj_type]))
+                 xref_df = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["{}_xref".format(obj_type)]))
+                 ent_df = ent_df[ent_df.data_domain == data_domain]
+                 xref_df = xref_df[xref_df.data_domain == data_domain].select(['entity_name', 'entity_column'])
+                 df = ent_df.join(xref_df, on=["name==entity_name"], how="inner")
+                 _FeatureStoreDFContainer.__df_container[repo_obj] = df.select(ent_df.columns + ["entity_column"])
+
+             elif obj_type == "feature_wog":
+                 # Feature without group - direct access to feature table
+                 _FeatureStoreDFContainer.__df_container[repo_obj] = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["feature"]))
+
+             elif obj_type == "feature_info":
+                 # join: features + metadata
+                 # Use feature_wog (without group)
+                 feature = _FeatureStoreDFContainer.get_df('feature_wog', repo, data_domain)
+
+                 # Get metadata DataFrame
+                 feature_metadata = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["feature_metadata"]))
+
+                 # Drop ValidPeriod column if it exists
+                 if 'ValidPeriod' in feature_metadata.columns:
+                     feature_metadata = feature_metadata.drop(columns=["ValidPeriod"])
+
+                 df = feature_metadata.join(feature,
+                                            how="inner",
+                                            on=[feature_metadata.feature_id == feature.id,
+                                                feature_metadata.data_domain == feature.data_domain,
+                                                feature_metadata.data_domain == data_domain],
+                                            lsuffix="_meta",
+                                            rsuffix="_feat")
+                 _FeatureStoreDFContainer.__df_container[repo_obj] = df
+
+             elif obj_type == "feature_catalog":
+                 # join: features + metadata + version
+                 # Get the required DataFrames directly
+                 fv = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["feature_version"]))
+                 f_ = _FeatureStoreDFContainer.get_df("feature", repo, data_domain)
+
+                 # Feature can be mapped to more than one feature group. So, 'f_' can have duplicate rows
+                 # which propagates these duplicates to final result.
+                 f_ = f_.drop_duplicate(['id', 'data_domain', 'name'])
+                 fm = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["feature_metadata"]))
+
+                 ndf = fm.select(['entity_name', 'data_domain', 'feature_id', 'table_name', 'valid_end'])
+                 hdf = ndf.join(
+                     f_, on=((f_.id == ndf.feature_id) & (ndf.data_domain == f_.data_domain)),
+                     how='inner',
+                     lprefix='l'
+                 ).select(['entity_name', 'data_domain', 'id', 'name', 'table_name', 'valid_end'])
+
+                 vdf = hdf.join(fv,
+                                on=(
+                                    (hdf.data_domain == fv.data_domain) &
+                                    (hdf.entity_name == fv.entity_id) &
+                                    (fv.feature_name == hdf.name) &
+                                    (fv.data_domain == data_domain)
+                                ),
+                                how='inner',
+                                lprefix='l'
+                                )
+
+                 _FeatureStoreDFContainer.__df_container[repo_obj] = vdf.select(
+                     ['entity_id', 'data_domain', 'id', 'name', 'table_name', 'feature_version', 'valid_end']
+                 )
+
+             elif obj_type == "entity_info":
+                 # join: entity + entity_xref
+                 entity_df = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["entity"]))
+                 entity_xref_df = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["entity_xref"]))
+
+                 # Build join conditions
+                 join_conditions = [
+                     entity_df.name == entity_xref_df.entity_name,
+                     entity_df.data_domain == entity_xref_df.data_domain,
+                     entity_df.data_domain == data_domain
+                 ]
+
+                 df = entity_df.join(
+                     other=entity_xref_df,
+                     on=join_conditions,
+                     lsuffix="l"
+                 )
+
+                 _FeatureStoreDFContainer.__df_container[repo_obj] = df.select(
+                     ['entity_name', 'data_domain', 'entity_column', 'description']
+                 )
+
+             elif obj_type == 'data_domain':
+                 _FeatureStoreDFContainer.__df_container[repo_obj] = DataFrame(in_schema(repo, EFS_DB_COMPONENTS["data_domain"]))
+
+             # Default case: simple DataFrame creation
+             else:
+                 df = DataFrame(in_schema(repo, EFS_DB_COMPONENTS[obj_type]))
+                 if 'data_domain' in df.columns:
+                     df = df[df.data_domain == data_domain]
+
+                 _FeatureStoreDFContainer.__df_container[repo_obj] = df
+
+         return _FeatureStoreDFContainer.__df_container[repo_obj]
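One behavioral note on get_df: results are memoized in the class-level __df_container dict under the dotted key repo.data_domain.obj_type, so repeated calls hand back the same DataFrame object. Roughly (the class is internal to the package; this only illustrates the caching):

    # The second call returns the object cached by the first call.
    a = _FeatureStoreDFContainer.get_df('feature', 'my_repo', 'analytics')
    b = _FeatureStoreDFContainer.get_df('feature', 'my_repo', 'analytics')
    assert a is b   # cached under 'my_repo.analytics.feature'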