tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +769 -571
- tdfs4ds/feature_store/feature_data_processing.py +370 -300
- tdfs4ds/feature_store/feature_query_retrieval.py +105 -52
- tdfs4ds/feature_store/feature_store_management.py +226 -231
- tdfs4ds/genai/__init__.py +27 -0
- tdfs4ds/genai/documentation.py +1878 -0
- tdfs4ds/process_store/process_followup.py +113 -2
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/process_store/process_store_catalog_management.py +79 -26
- tdfs4ds/utils/filter_management.py +548 -138
- tdfs4ds/utils/query_management.py +18 -40
- tdfs4ds/utils/time_management.py +565 -98
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.5.1.dist-info}/METADATA +1 -1
- tdfs4ds-0.2.5.1.dist-info/RECORD +32 -0
- tdfs/__init__.py +0 -1
- tdfs/data/curves.csv +0 -5086
- tdfs/datasets.py +0 -27
- tdfs/feature_store.py +0 -723
- tdfs4ds/feature_engineering.py +0 -152
- tdfs4ds/feature_store.py +0 -1529
- tdfs4ds/process_store.py +0 -387
- tdfs4ds/utils.py +0 -579
- tdfs4ds-0.2.4.26.dist-info/RECORD +0 -38
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.5.1.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.5.1.dist-info}/top_level.txt +0 -0
tdfs/feature_store.py
DELETED
|
@@ -1,723 +0,0 @@
|
|
|
1
|
-
import teradataml as tdml
|
|
2
|
-
import pandas as pd
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
def feature_store_catalog_creation(schema, if_exists='replace', table_name='FS_FEATURE_CATALOG', comment='this table is a feature catalog'):
    """
    Create (or replace) the feature store catalog table in a Teradata database.

    The catalog table stores information about features such as their names,
    associated tables, databases, views and temporal validity periods.

    Parameters:
    - schema: The schema name in which the catalog table will be created.
    - if_exists (optional): Behavior if the catalog table already exists.
      'replace' (default) drops and re-creates the existing table.
    - table_name (optional): The name of the catalog table. Default 'FS_FEATURE_CATALOG'.
    - comment (optional): Comment attached to the table with COMMENT ON TABLE.

    Returns:
    The name of the created or replaced catalog table.
    """

    # SQL query to create the temporal (VALIDTIME) catalog table
    query = f"""
    CREATE MULTISET TABLE {schema}.{table_name},
    FALLBACK,
    NO BEFORE JOURNAL,
    NO AFTER JOURNAL,
    CHECKSUM = DEFAULT,
    DEFAULT MERGEBLOCKRATIO,
    MAP = TD_MAP1
    (
        FEATURE_ID BIGINT,
        FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
        FEATURE_TABLE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
        FEATURE_DATABASE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
        FEATURE_VIEW VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
        ENTITY_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
        ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
        ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
        PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
    )
    PRIMARY INDEX (FEATURE_ID);
    """

    # SQL query to create a secondary index on the feature name
    query2 = f"CREATE INDEX (FEATURE_NAME) ON {schema}.{table_name};"

    # SQL query to comment the table
    query3 = f"COMMENT ON TABLE {schema}.{table_name} IS '{comment}'"

    try:
        # Attempt to execute the create table query
        tdml.get_context().execute(query)
        if tdml.display.print_sqlmr_query:
            print(query)
        print(f'TABLE {schema}.{table_name} has been created')
        tdml.get_context().execute(query3)
    except Exception as e:
        # Teradata packs the useful message into the first line of the error text
        first_line = str(e).split('\n')[0]
        print(first_line)
        # If the table already exists and if_exists is 'replace', drop and recreate it
        if first_line.endswith('already exists.') and (if_exists == 'replace'):
            tdml.get_context().execute(f'DROP TABLE {schema}.{table_name}')
            print(f'TABLE {schema}.{table_name} has been dropped')
            try:
                # Attempt to recreate the table after dropping it
                tdml.get_context().execute(query)
                print(f'TABLE {schema}.{table_name} has been re-created')
                if tdml.display.print_sqlmr_query:
                    print(query)
                tdml.get_context().execute(query3)
            except Exception as e:
                print(str(e).split('\n')[0])

    try:
        # Attempt to create the secondary index
        tdml.get_context().execute(query2)
        if tdml.display.print_sqlmr_query:
            # BUG FIX: previously printed the CREATE TABLE query instead of
            # the CREATE INDEX query that was actually executed here.
            print(query2)
        print(f'SECONDARY INDEX ON TABLE {schema}.{table_name} has been created')
    except Exception as e:
        print(str(e).split('\n')[0])

    return table_name
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def get_feature_store_table_name(entity_id, feature_type):
    """
    Derive the feature store table and view names for an entity/feature-type pair.

    Parameters:
    - entity_id: A dictionary representing the entity ID. Its keys (in insertion
      order) are embedded in the generated names.
    - feature_type: The type of the feature, appended as the last name component.

    Returns:
    A tuple (table_name, view_name), e.g. ('FS_T_<keys>_<type>', 'FS_V_<keys>_<type>').
    """

    # Shared suffix: the entity key columns followed by the feature type.
    suffix_parts = list(entity_id.keys()) + [feature_type]

    # Tables are prefixed 'FS_T', views 'FS_V'; all components joined by '_'.
    table_name = '_'.join(['FS', 'T', *suffix_parts])
    view_name = '_'.join(['FS', 'V', *suffix_parts])

    return table_name, view_name
|
|
108
|
-
|
|
109
|
-
def feature_store_table_creation(entity_id, feature_type, schema, if_exists='replace', feature_catalog_name='FS_FEATURE_CATALOG'):
    """
    Create a feature store table and its companion view in a Teradata schema.

    The table stores (entity keys, FEATURE_ID, FEATURE_VALUE, FEATURE_VERSION)
    rows with a VALIDTIME period; the view joins the table with the feature
    catalog to expose feature names at the current valid time.

    Parameters:
    - entity_id: A dictionary representing the entity ID. Keys are column names,
      values are their SQL types; keys are used to construct the table/view names.
    - feature_type: The type of the feature.
    - schema: The schema name in which the table and view will be created.
    - if_exists (optional): Behavior if the table already exists. 'replace'
      (default) drops and re-creates it.
    - feature_catalog_name (optional): Name of the feature catalog table.
      Default 'FS_FEATURE_CATALOG'.

    Returns:
    The name of the created or replaced feature store table.
    """

    table_name, view_name = get_feature_store_table_name(entity_id, feature_type)

    # Column definitions and key lists derived from the entity ID mapping
    ENTITY_ID = ', \n'.join([k+' '+v for k, v in entity_id.items()])
    ENTITY_ID_ = ', \n'.join(['B.'+k for k, v in entity_id.items()])
    ENTITY_ID__ = ','.join([k for k, v in entity_id.items()])

    # SQL query to create the feature store table
    query = f"""
    CREATE MULTISET TABLE {schema}.{table_name},
    FALLBACK,
    NO BEFORE JOURNAL,
    NO AFTER JOURNAL,
    CHECKSUM = DEFAULT,
    DEFAULT MERGEBLOCKRATIO,
    MAP = TD_MAP1
    (
        {ENTITY_ID},
        FEATURE_ID BIGINT,
        FEATURE_VALUE FLOAT,
        FEATURE_VERSION VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
        ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
        ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
        PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
    )
    PRIMARY INDEX ({ENTITY_ID__},FEATURE_ID,FEATURE_VERSION);
    """

    # SQL query to create a secondary index on the feature ID
    query2 = f"CREATE INDEX (FEATURE_ID) ON {schema}.{table_name};"

    # SQL query to create the view joining the catalog and the feature table
    query_view = f"""
    REPLACE VIEW {schema}.{view_name} AS
    CURRENT VALIDTIME
    SELECT
        A.FEATURE_NAME,
        {ENTITY_ID_},
        B.FEATURE_VALUE,
        B.FEATURE_VERSION
    FROM {schema}.{feature_catalog_name} A
    , {schema}.{table_name} B
    WHERE A.FEATURE_ID = B.FEATURE_ID
    """

    try:
        # Attempt to execute the create table query
        tdml.get_context().execute(query)
        if tdml.display.print_sqlmr_query:
            print(query)
        print(f'TABLE {schema}.{table_name} has been created')
        tdml.get_context().execute(query2)
    except Exception as e:
        # Teradata packs the useful message into the first line of the error text
        first_line = str(e).split('\n')[0]
        print(first_line)
        # If the table already exists and if_exists is 'replace', drop and recreate it
        if first_line.endswith('already exists.') and (if_exists == 'replace'):
            tdml.get_context().execute(f'DROP TABLE {schema}.{table_name}')
            print(f'TABLE {schema}.{table_name} has been dropped')
            try:
                # Attempt to recreate the table after dropping it
                tdml.get_context().execute(query)
                print(f'TABLE {schema}.{table_name} has been re-created')
                if tdml.display.print_sqlmr_query:
                    print(query)
            except Exception as e:
                print(str(e).split('\n')[0])

    try:
        # Attempt to create the view
        tdml.get_context().execute(query_view)
        if tdml.display.print_sqlmr_query:
            # BUG FIX: previously printed the CREATE TABLE query instead of
            # the REPLACE VIEW query that was actually executed here.
            print(query_view)
        print(f'VIEW {schema}.{view_name} has been created')
    except Exception as e:
        print(str(e).split('\n')[0])

    return table_name
|
|
204
|
-
|
|
205
|
-
def register_features(entity_id, feature_names_types, schema, feature_catalog_name='FS_FEATURE_CATALOG'):
    """
    Register features in the feature catalog table of a Teradata database.

    Builds a pandas DataFrame describing the features, stages it in a 'temp'
    table in the target schema, then INSERTs catalog rows for features whose
    FEATURE_ID is not yet present and UPDATEs rows for IDs that already exist.

    Parameters:
    - entity_id: A dictionary representing the entity ID. The keys identify the entity.
    - feature_names_types: A dictionary mapping feature names to dicts with
      'type' and 'id' entries.
    - schema: The schema name in which the feature catalog table resides.
    - feature_catalog_name (optional): The name of the feature catalog table.
      Default 'FS_FEATURE_CATALOG'.

    Returns:
    A pandas DataFrame containing the registered features and their metadata,
    or None when feature_names_types is empty.
    """

    # Nothing to do for an empty mapping; returns None (not a DataFrame).
    if len(list(feature_names_types.keys())) == 0:
        print('no new feature to register')
        return

    # Create a comma-separated string of entity IDs
    ENTITY_ID__ = ','.join([k for k,v in entity_id.items()])

    # Create a DataFrame from the feature_names_types dictionary.
    # The two branches exist because transpose() of a single-entry dict would
    # not produce the same column layout as the multi-entry case.
    if len(feature_names_types.keys())>1:
        df = pd.DataFrame(feature_names_types).transpose().reset_index()
        df.columns = ['FEATURE_NAME','TYPE','FEATURE_ID']
    else:
        df = pd.DataFrame(columns=['FEATURE_NAME','TYPE','FEATURE_ID'])
        k = list(feature_names_types.keys())[0]
        df['FEATURE_NAME'] = [k]
        df['TYPE'] = [feature_names_types[k]['type']]
        df['FEATURE_ID'] = [feature_names_types[k]['id']]

    # Generate the feature table and view names based on the entity ID and
    # feature type. NOTE(review): row[1] is positional access to the 'TYPE'
    # column (second column after the renaming above) — presumably intended;
    # positional Series indexing is deprecated in recent pandas, verify.
    df['FEATURE_TABLE'] = df.apply(lambda row:get_feature_store_table_name(entity_id, row[1])[0], axis=1)
    df['FEATURE_VIEW'] = df.apply(lambda row:get_feature_store_table_name(entity_id, row[1])[1], axis=1)

    # Add additional columns to the DataFrame
    df['ENTITY_NAME'] = ENTITY_ID__
    df['FEATURE_DATABASE'] = schema

    # Copy the DataFrame to a staging table named 'temp' in Teradata
    # (replaced on every call).
    tdml.copy_to_sql(df,table_name = 'temp', schema_name = schema, if_exists = 'replace', primary_index = 'FEATURE_ID', types={'FEATURE_ID':tdml.BIGINT})

    # SQL query to update catalog entries whose FEATURE_ID already exists
    # (matched via LEFT JOIN with a NOT NULL filter on the existing side).
    query_update = f"""
    CURRENT VALIDTIME
    UPDATE {schema}.{feature_catalog_name}
    FROM (
        CURRENT VALIDTIME
        SELECT
            NEW_FEATURES.FEATURE_ID
        ,   NEW_FEATURES.FEATURE_NAME
        ,   NEW_FEATURES.FEATURE_TABLE
        ,   NEW_FEATURES.FEATURE_DATABASE
        ,   NEW_FEATURES.FEATURE_VIEW
        ,   NEW_FEATURES.ENTITY_NAME
        FROM {schema}.temp NEW_FEATURES
        LEFT JOIN {schema}.{feature_catalog_name} EXISTING_FEATURES
        ON NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
        WHERE EXISTING_FEATURES.FEATURE_NAME IS NOT NULL
    ) UPDATED_FEATURES
    SET
        FEATURE_NAME = UPDATED_FEATURES.FEATURE_NAME,
        FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
        FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
        FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW,
        ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME
    WHERE {feature_catalog_name}.FEATURE_ID = UPDATED_FEATURES.FEATURE_ID;
    """

    # SQL query to insert catalog entries for FEATURE_IDs not yet present
    # (LEFT JOIN with an IS NULL filter on the existing side).
    query_insert = f"""
    CURRENT VALIDTIME
    INSERT INTO {schema}.{feature_catalog_name} (FEATURE_ID, FEATURE_NAME, FEATURE_TABLE, FEATURE_DATABASE, FEATURE_VIEW, ENTITY_NAME)
    SELECT
        NEW_FEATURES.FEATURE_ID
    ,   NEW_FEATURES.FEATURE_NAME
    ,   NEW_FEATURES.FEATURE_TABLE
    ,   NEW_FEATURES.FEATURE_DATABASE
    ,   NEW_FEATURES.FEATURE_VIEW
    ,   NEW_FEATURES.ENTITY_NAME
    FROM {schema}.temp NEW_FEATURES
    LEFT JOIN {schema}.{feature_catalog_name} EXISTING_FEATURES
    ON NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
    WHERE EXISTING_FEATURES.FEATURE_NAME IS NULL;
    """

    # Execute insert first, then update; order matters because the update's
    # join would otherwise see rows the insert just added.
    tdml.get_context().execute(query_insert)
    tdml.get_context().execute(query_update)

    return df
|
|
299
|
-
|
|
300
|
-
def prepare_feature_ingestion(df, entity_id, feature_names, feature_version_default = 'dev.0.0', feature_versions = None, **kwargs):
    """
    Prepare feature data for ingestion into the feature store.

    Unpivots the given feature columns into (FEATURE_NAME, FEATURE_VALUE) rows,
    keeps the entity key columns, and tags every row with a FEATURE_VERSION
    resolved via a SQL CASE expression.

    Parameters:
    - df: The input tdml.DataFrame containing the feature data.
    - entity_id: A dictionary representing the entity ID; its keys are the
      entity key columns carried through the unpivot.
    - feature_names: A list of feature column names to unpivot.
    - feature_version_default (optional): Version assigned to features absent
      from feature_versions. Default 'dev.0.0'.
    - feature_versions (optional): Mapping of feature name -> version that
      overrides the default per feature.
    - **kwargs: Additional keyword arguments (unused).

    Returns:
    A transformed tdml.DataFrame containing the prepared feature data.
    """

    # UNPIVOT clause: each feature column becomes a labelled measure.
    unpivot_columns = ", \n".join(["("+col+") as '"+col+"'" for col in feature_names])

    # Projected columns: entity keys followed by the unpivoted name/value pair.
    output_columns = ', \n'.join(list(entity_id.keys()) + ['FEATURE_NAME','FEATURE_VALUE'])

    # Resolve per-feature versions: start from the default, then apply overrides.
    versions = dict.fromkeys(feature_names, feature_version_default)
    if feature_versions is not None:
        versions.update(feature_versions)

    # CASE expression mapping each feature name to its version string.
    case_parts = ["CASE"]
    case_parts += [f"WHEN FEATURE_NAME = '{k}' THEN '{v}' " for k, v in versions.items()]
    case_parts.append("END AS FEATURE_VERSION")
    version_query = '\n'.join(case_parts)

    # Full UNPIVOT query over the source table.
    query_unpivot = f"""
    SELECT
    {output_columns},
    {version_query}
    FROM {df._table_name} UNPIVOT ((FEATURE_VALUE) FOR FEATURE_NAME
    IN ({unpivot_columns})) Tmp;
    """

    if tdml.display.print_sqlmr_query:
        print(query_unpivot)

    return tdml.DataFrame.from_query(query_unpivot)
|
|
346
|
-
|
|
347
|
-
def store_feature(entity_id, prepared_features, schema, feature_catalog_name='FS_FEATURE_CATALOG', date_in_the_past = None, **kwargs):
    """
    Store prepared feature data in the corresponding feature tables.

    Joins the prepared features with the feature catalog to find each value's
    target table, then per target table UPDATEs existing (entity, feature,
    version) rows and INSERTs new combinations.

    Parameters:
    - entity_id: A dictionary representing the entity ID. The keys identify the entity.
    - prepared_features: A tdml.DataFrame containing the prepared feature data.
    - schema: The schema name in which the feature tables reside.
    - feature_catalog_name (optional): The name of the feature catalog table.
      Default 'FS_FEATURE_CATALOG'.
    - date_in_the_past (optional): When given, statements run with
      VALIDTIME AS OF that date instead of CURRENT VALIDTIME.
    - **kwargs: Additional keyword arguments (unused).

    Returns:
    None
    """

    # NOTE(review): this DataFrame is never used below; constructing it may be
    # intended as an existence check on the catalog table — confirm before removing.
    feature_catalog = tdml.DataFrame(tdml.in_schema(schema, feature_catalog_name))

    # FIX: identity comparison with None ('== None' relies on __eq__ and is
    # non-idiomatic); behavior is unchanged for the None/str values used here.
    if date_in_the_past is None:
        validtime_statement = 'CURRENT VALIDTIME'
    else:
        validtime_statement = f"VALIDTIME AS OF DATE '{date_in_the_past}'"

    # Select feature data together with its catalog metadata (ID, target table/db)
    query = f"""
    {validtime_statement}
    SELECT
        A.*
    ,   B.FEATURE_ID
    ,   B.FEATURE_TABLE
    ,   B.FEATURE_DATABASE
    FROM {prepared_features._table_name} A,
    {schema}.{feature_catalog_name} B
    WHERE A.FEATURE_NAME = B.FEATURE_NAME
    """

    df = tdml.DataFrame.from_query(query)

    # Group the target tables by feature table and database and count occurrences
    target_tables = df[['FEATURE_TABLE','FEATURE_DATABASE','FEATURE_ID']].groupby(['FEATURE_TABLE','FEATURE_DATABASE']).count().to_pandas()
    print(target_tables)

    # SQL fragments derived from the entity key columns
    ENTITY_ID = ', \n'.join([k for k,v in entity_id.items()])
    ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k,v in entity_id.items()])
    ENTITY_ID_WHERE_INS = ' OR '.join([f'EXISTING_FEATURES.{k} IS NOT NULL' for k,v in entity_id.items()])
    ENTITY_ID_WHERE_UP = ' OR '.join([f'EXISTING_FEATURES.{k} IS NULL' for k,v in entity_id.items()])

    # Iterate over target tables and perform update and insert operations.
    # row[0]/row[1] are positional: FEATURE_TABLE and FEATURE_DATABASE
    # respectively — presumably stable from the groupby layout; verify.
    for i,row in target_tables.iterrows():

        # SQL query to update existing feature values.
        # NOTE(review): the inner subquery uses CURRENT VALIDTIME even when
        # date_in_the_past is set — confirm this asymmetry is intentional.
        query_update = f"""
        {validtime_statement}
        UPDATE {row[1]}.{row[0]}
        FROM (
            CURRENT VALIDTIME
            SELECT
                NEW_FEATURES.{ENTITY_ID},
                NEW_FEATURES.FEATURE_ID,
                NEW_FEATURES.FEATURE_VALUE,
                NEW_FEATURES.FEATURE_VERSION
            FROM {df._table_name} NEW_FEATURES
            LEFT JOIN {row[1]}.{row[0]} EXISTING_FEATURES
            ON {ENTITY_ID_ON}
            AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
            AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
            WHERE ({ENTITY_ID_WHERE_INS})
            AND NEW_FEATURES.FEATURE_DATABASE = '{row[1]}'
            AND NEW_FEATURES.FEATURE_TABLE = '{row[0]}'
        ) UPDATED_FEATURES
        SET
            FEATURE_VALUE = UPDATED_FEATURES.FEATURE_VALUE
        WHERE {row[0]}.{ENTITY_ID} = UPDATED_FEATURES.{ENTITY_ID}
        AND {row[0]}.FEATURE_ID = UPDATED_FEATURES.FEATURE_ID
        AND {row[0]}.FEATURE_VERSION = UPDATED_FEATURES.FEATURE_VERSION;
        """

        # SQL query to insert new feature values
        query_insert = f"""
        {validtime_statement}
        INSERT INTO {row[1]}.{row[0]} ({ENTITY_ID}, FEATURE_ID, FEATURE_VALUE, FEATURE_VERSION)
        SELECT
            NEW_FEATURES.{ENTITY_ID},
            NEW_FEATURES.FEATURE_ID,
            NEW_FEATURES.FEATURE_VALUE,
            NEW_FEATURES.FEATURE_VERSION
        FROM {df._table_name} NEW_FEATURES
        LEFT JOIN {row[1]}.{row[0]} EXISTING_FEATURES
        ON {ENTITY_ID_ON}
        AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
        AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
        WHERE ({ENTITY_ID_WHERE_UP})
        AND NEW_FEATURES.FEATURE_DATABASE = '{row[1]}'
        AND NEW_FEATURES.FEATURE_TABLE = '{row[0]}'
        """

        print(f'insert feature values of new {ENTITY_ID} combinations in {row[1]}.{row[0]}')
        if tdml.display.print_sqlmr_query:
            print(query_insert)
        tdml.get_context().execute(query_insert)
        print(f'update feature values of existing {ENTITY_ID} combinations in {row[1]}.{row[0]}')
        if tdml.display.print_sqlmr_query:
            print(query_update)
        tdml.get_context().execute(query_update)

    return
|
|
460
|
-
|
|
461
|
-
def build_dataset(entity_id, selected_features, schema, view_name, feature_catalog_name='FS_FEATURE_CATALOG', comment = 'dataset', date_in_the_past = None, **kwargs):
    """
    Build a dataset view pivoting selected features for an entity.

    Looks up feature locations in the feature catalog, UNIONs the feature
    tables filtered by (FEATURE_ID, FEATURE_VERSION), and pivots the result
    into one column per feature. When view_name is not None the pivoted query
    is materialized as a view (with a COMMENT); otherwise the query result is
    returned directly.

    Parameters:
    - entity_id: A dictionary representing the entity ID. The keys identify the entity.
    - selected_features: A dictionary mapping feature names to feature versions.
    - schema: The schema name in which the dataset view will be created.
    - view_name: The name of the dataset view, or None to skip view creation.
    - feature_catalog_name (optional): Name of the feature catalog table.
      Default 'FS_FEATURE_CATALOG'.
    - comment (optional): COMMENT attached to the created view.
    - date_in_the_past (optional): When given, queries run with VALIDTIME AS OF
      that date instead of CURRENT VALIDTIME.
    - **kwargs: Additional keyword arguments (unused).

    Returns:
    A tdml.DataFrame over the created view, or over the pivot query when
    view_name is None.
    """

    feature_catalog = tdml.DataFrame.from_query(f'CURRENT VALIDTIME SELECT * FROM {schema}.{feature_catalog_name}')

    # FIX: identity comparisons with None ('== None' / '!= None' rely on __eq__
    # and are non-idiomatic); behavior is unchanged for the values used here.
    if date_in_the_past is None:
        validtime_statement = 'CURRENT VALIDTIME'
    else:
        validtime_statement = f"VALIDTIME AS OF DATE '{date_in_the_past}'"

    # Compose the entity names and retrieve the corresponding feature locations
    ENTITY_NAMES = ','.join([k for k in entity_id.keys()])
    feature_location = feature_catalog[(feature_catalog.FEATURE_NAME.isin(list(selected_features.keys()))) & (feature_catalog.ENTITY_NAME == ENTITY_NAMES)].to_pandas()
    feature_location['FEATURE_VERSION'] = feature_location['FEATURE_NAME'].map(selected_features)

    # Build the query retrieving the selected features from each feature table
    query = []
    for g,df in feature_location.groupby(['FEATURE_DATABASE','FEATURE_TABLE']):
        condition = ' \n OR '.join([f"(FEATURE_ID = {row['FEATURE_ID']} AND FEATURE_VERSION = '{row['FEATURE_VERSION']}')" for i,row in df.iterrows()])
        query_ = f"""
        SELECT * FROM {g[0]}.{g[1]}
        WHERE {condition}
        """
        query.append(query_)
    query = 'UNION ALL '.join(query)

    ENTITY_ID = ', \n'.join([k for k in entity_id.keys()])
    ENTITY_ID_ = ', \n'.join(['B.'+k for k in entity_id.keys()])

    # Join the feature catalog with the unioned feature data
    query_dataset = f"""
    {validtime_statement}
    SELECT
        A.FEATURE_NAME,
        {ENTITY_ID_},
        B.FEATURE_VALUE
    FROM {schema}.{feature_catalog_name} A
    , ({query}) B
    WHERE A.FEATURE_ID = B.FEATURE_ID
    """

    # Output columns of the pivot: one per selected feature, NULLed out when
    # the per-feature count is not exactly 1 (ambiguous/missing values).
    output_name = ',\n'.join([f"'{k}' as {k}" for k in selected_features.keys()])
    output_name_ = ',\n'.join([f'CASE WHEN {k}_cnt=1 THEN {k} END AS {k}' for k in selected_features.keys()])

    # Pivot the feature rows into one column per feature
    query_create_view = f'REPLACE VIEW {schema}.{view_name} AS'
    query_pivot = f"""
    SELECT
    {ENTITY_ID}
    , {output_name_}
    FROM ({query_dataset}) AA PIVOT (
        AVG(FEATURE_VALUE),
        COUNT(FEATURE_VALUE) as cnt
        FOR FEATURE_NAME IN (
            {output_name}
        )
    )Tmp;
    """
    if tdml.display.print_sqlmr_query:
        print(query_create_view+'\n'+query_pivot)

    if view_name is not None:
        tdml.get_context().execute(query_create_view+'\n'+query_pivot)
        tdml.get_context().execute(f"COMMENT ON VIEW {schema}.{view_name} IS '{comment}'")
        print(f'the dataset view {schema}.{view_name} has been created')

        return tdml.DataFrame(tdml.in_schema(schema, view_name))
    else:
        return tdml.DataFrame.from_query(query_pivot)
|
|
548
|
-
def GetTheLargestFeatureID(schema, table_name='FS_FEATURE_CATALOG'):
    """
    Retrieve the maximum feature ID from the feature catalog table.

    Parameters:
    - schema: The schema name in which the feature catalog table resides.
    - table_name (optional): The name of the feature catalog table.
      Default 'FS_FEATURE_CATALOG'.

    Returns:
    The maximum FEATURE_ID, or 0 when the table is empty (MAX returns NULL).
    """
    # Execute a SQL query to get the maximum feature ID from the catalog table.
    feature_id = tdml.get_context().execute(f'SEL MAX(FEATURE_ID) AS MAX_FEATURE_ID FROM {schema}.{table_name}').fetchall()[0][0]

    # FIX: identity comparison with None ('== None' relies on __eq__ and is
    # non-idiomatic). NULL from SQL MAX maps to Python None on an empty table.
    if feature_id is None:
        return 0
    return feature_id
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
def GetAlreadyExistingFeatureNames(feature_name, schema, table_name='FS_FEATURE_CATALOG'):
    """
    Return the feature names already present in the feature catalog table.

    The candidate names are staged in a temporary table and inner-joined with
    the catalog, so only names present in both come back.

    Parameters:
    - feature_name: The feature name(s) to check.
    - schema: The schema name in which the feature catalog table resides.
    - table_name (optional): The name of the feature catalog table.
      Default 'FS_FEATURE_CATALOG'.

    Returns:
    A list of feature names that already exist in the catalog.
    """
    # Stage the candidate names in a pandas DataFrame.
    candidates = pd.DataFrame({'FEATURE_NAME': feature_name})

    # Fixed name of the staging table (replaced on every call).
    tmp_name = 'tdfs__fgjnojnsmdoignmosnig'

    # Push the candidates into the staging table in Teradata.
    tdml.copy_to_sql(candidates, schema_name=schema, table_name=tmp_name, if_exists='replace',
                     types={'FEATURE_NAME': tdml.VARCHAR(length=255, charset='LATIN')})

    # Inner join the staged names with the catalog: only names present in both
    # survive the join.
    join_query = f"""
    SEL A.FEATURE_NAME
    FROM {schema}.{tmp_name} A
    INNER JOIN {schema}.{table_name} B
    ON A.FEATURE_NAME = B.FEATURE_NAME
    """
    matched = tdml.DataFrame.from_query(join_query).to_pandas()

    return list(matched.FEATURE_NAME.values)
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
def Gettdtypes(tddf, features_columns, schema, table_name='FS_FEATURE_CATALOG'):
|
|
607
|
-
"""
|
|
608
|
-
This function retrieves the data types of the columns in the provided DataFrame (tddf) and checks their existence in the feature catalog table.
|
|
609
|
-
It also assigns new feature IDs for those that do not already exist in the table.
|
|
610
|
-
|
|
611
|
-
Parameters:
|
|
612
|
-
- tddf: The input DataFrame.
|
|
613
|
-
- features_columns: A list of feature column names.
|
|
614
|
-
- schema: The schema name in which the feature catalog table resides.
|
|
615
|
-
- table_name (optional): The name of the feature catalog table. Default is 'FS_FEATURE_CATALOG'.
|
|
616
|
-
|
|
617
|
-
Returns:
|
|
618
|
-
A dictionary where keys are column names and values are dictionaries containing type and id of the feature.
|
|
619
|
-
|
|
620
|
-
"""
|
|
621
|
-
# Get the data types of the columns in the DataFrame.
|
|
622
|
-
types = dict(tddf.to_pandas(num_rows=10).dtypes)
|
|
623
|
-
|
|
624
|
-
# Get the names of the features that already exist in the feature catalog table.
|
|
625
|
-
existing_features = GetAlreadyExistingFeatureNames(tddf.columns, schema, table_name=table_name)
|
|
626
|
-
|
|
627
|
-
# Get the maximum feature ID from the feature catalog table.
|
|
628
|
-
feature_id = GetTheLargestFeatureID(schema, table_name=table_name)
|
|
629
|
-
|
|
630
|
-
# Increment the maximum feature ID to create a new feature ID.
|
|
631
|
-
feature_id = feature_id + 1
|
|
632
|
-
|
|
633
|
-
# Initialize a dictionary to store the result.
|
|
634
|
-
res = {}
|
|
635
|
-
|
|
636
|
-
# Iterate over the data types of the columns in the DataFrame.
|
|
637
|
-
for k, v in types.items():
|
|
638
|
-
# If the column name does not exist in the feature catalog table and is in the list of feature column names...
|
|
639
|
-
if k not in existing_features and k in features_columns:
|
|
640
|
-
# If the data type of the column is integer...
|
|
641
|
-
if 'int' in str(v):
|
|
642
|
-
# Add an entry to the result dictionary for the column name with its data type and new feature ID.
|
|
643
|
-
res[k] = {'type': 'BIGINT', 'id': feature_id}
|
|
644
|
-
# If the data type of the column is float...
|
|
645
|
-
elif 'float' in str(v):
|
|
646
|
-
# Add an entry to the result dictionary for the column name with its data type and new feature ID.
|
|
647
|
-
res[k] = {'type': 'FLOAT', 'id': feature_id}
|
|
648
|
-
# If the data type of the column is neither integer nor float...
|
|
649
|
-
else:
|
|
650
|
-
# Print a message that the data type is not yet managed.
|
|
651
|
-
print(f'{k} has a type that is not yet managed')
|
|
652
|
-
|
|
653
|
-
# Increment the feature ID for the next iteration.
|
|
654
|
-
feature_id += 1
|
|
655
|
-
|
|
656
|
-
# Return the result dictionary.
|
|
657
|
-
return res
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
def upload_feature(df, entity_id, feature_names, schema_name, feature_catalog_name='FS_FEATURE_CATALOG',
                   feature_versions='dev.0.0'):
    """
    Upload features from a teradataml DataFrame to the feature store:
    register them in the catalog, ingest their values, and build a dataset view.

    Parameters:
    - df: The input teradataml DataFrame.
    - entity_id: The ID of the entity that the features belong to.
    - feature_names: A list of feature names.
    - schema_name: The name of the schema where the feature store resides.
    - feature_catalog_name (optional): The name of the feature catalog table. Default is 'FS_FEATURE_CATALOG'.
    - feature_versions (optional): The versions of the features. Either a single string applied
      to all features, or a list aligned with feature_names. Default is 'dev.0.0'.

    Returns:
    A DataFrame representing the dataset view created in the feature store.
    """
    # Map each feature name to its version (shared string or per-feature list).
    if isinstance(feature_versions, list):
        selected_features = dict(zip(feature_names, feature_versions))
    else:
        selected_features = {k: feature_versions for k in feature_names}

    # BUG FIX: the original referenced an undefined `feature_store` module prefix;
    # these helpers live in this module and are called directly.
    # BUG FIX: forward feature_catalog_name so the type lookup and the
    # registration below consistently use the same catalog table.
    feature_names_types = Gettdtypes(
        df,
        features_columns=feature_names,
        schema=schema_name,
        table_name=feature_catalog_name
    )

    # Register the features in the feature catalog.
    register_features(
        entity_id,
        feature_names_types,
        schema=schema_name,
        feature_catalog_name=feature_catalog_name
    )

    # Reshape the input data into the layout expected by the feature store.
    prepared_features = prepare_feature_ingestion(
        df,
        entity_id,
        feature_names,
        feature_versions=selected_features
    )

    # BUG FIX: the original used an undefined `Param['database']` here;
    # the target schema is the schema_name argument.
    store_feature(
        entity_id,
        prepared_features,
        schema=schema_name,
        feature_catalog_name=feature_catalog_name
    )

    # Build a dataset view over the selected feature versions.
    dataset = build_dataset(
        entity_id,
        selected_features,
        schema=schema_name,
        view_name=None
    )

    return dataset