tdfs4ds 0.2.4.41__py3-none-any.whl → 0.2.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs/feature_store.py DELETED
@@ -1,723 +0,0 @@
1
- import teradataml as tdml
2
- import pandas as pd
3
-
4
-
5
def feature_store_catalog_creation(schema, if_exists='replace', table_name='FS_FEATURE_CATALOG', comment='this table is a feature catalog'):
    """
    Create the feature store catalog table in a Teradata database.

    The catalog stores feature metadata: feature name, the table/view/database
    hosting its values, the entity name, and a VALIDTIME period so catalog
    history can be queried temporally.

    Parameters:
    - schema: schema in which the catalog table is created.
    - if_exists (optional): behavior when the table already exists. 'replace'
      (the default) drops and recreates it.
    - table_name (optional): name of the catalog table. Default 'FS_FEATURE_CATALOG'.
    - comment (optional): comment attached to the created table.

    Returns:
    The name of the created or replaced catalog table.
    """

    # DDL for the catalog table; ValidPeriod makes it a temporal table.
    query = f"""
    CREATE MULTISET TABLE {schema}.{table_name},
            FALLBACK,
            NO BEFORE JOURNAL,
            NO AFTER JOURNAL,
            CHECKSUM = DEFAULT,
            DEFAULT MERGEBLOCKRATIO,
            MAP = TD_MAP1
            (
                FEATURE_ID BIGINT,
                FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
                FEATURE_TABLE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
                FEATURE_DATABASE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
                FEATURE_VIEW VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
                ENTITY_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
                ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
                ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
                PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
            )
            PRIMARY INDEX (FEATURE_ID);
    """

    # Secondary index to speed up lookups by feature name.
    query2 = f"CREATE INDEX (FEATURE_NAME) ON {schema}.{table_name};"

    # Attach the table comment.
    query3 = f"COMMENT ON TABLE {schema}.{table_name} IS '{comment}'"

    try:
        # Attempt to create the table.
        tdml.get_context().execute(query)
        if tdml.display.print_sqlmr_query:
            print(query)
        print(f'TABLE {schema}.{table_name} has been created')
        tdml.get_context().execute(query3)
    except Exception as e:
        # If the table already exists and if_exists is 'replace', drop and recreate.
        print(str(e).split('\n')[0])
        if str(e).split('\n')[0].endswith('already exists.') and (if_exists == 'replace'):
            tdml.get_context().execute(f'DROP TABLE {schema}.{table_name}')
            print(f'TABLE {schema}.{table_name} has been dropped')
            try:
                tdml.get_context().execute(query)
                print(f'TABLE {schema}.{table_name} has been re-created')
                if tdml.display.print_sqlmr_query:
                    print(query)
                tdml.get_context().execute(query3)
            except Exception as e:
                print(str(e).split('\n')[0])

    try:
        # Create the secondary index (separate try: the table may exist already
        # with the index in place).
        tdml.get_context().execute(query2)
        if tdml.display.print_sqlmr_query:
            # Bug fix: the original echoed the CREATE TABLE statement here
            # instead of the index DDL actually being executed.
            print(query2)
        print(f'SECONDARY INDEX ON TABLE {schema}.{table_name} has been created')
    except Exception as e:
        print(str(e).split('\n')[0])

    return table_name
83
-
84
-
85
def get_feature_store_table_name(entity_id, feature_type):
    """
    Derive the feature-store table and view names for an entity/feature type.

    Both names are underscore-joined: a 'FS' prefix, a 'T' (table) or 'V'
    (view) marker, the entity key column names, and the feature type.

    Parameters:
    - entity_id: dict whose keys are the entity key column names.
    - feature_type: the feature type suffix.

    Returns:
    A (table_name, view_name) tuple.
    """
    # The shared part of both names: entity key columns plus the feature type.
    suffix = list(entity_id.keys()) + [feature_type]

    table_name = '_'.join(['FS', 'T', *suffix])
    view_name = '_'.join(['FS', 'V', *suffix])

    return table_name, view_name
108
-
109
def feature_store_table_creation(entity_id, feature_type, schema, if_exists='replace', feature_catalog_name='FS_FEATURE_CATALOG'):
    """
    Create a feature store table and its companion view for an entity/feature type.

    The table stores one row per (entity, feature, version) with a VALIDTIME
    period; the view joins it to the feature catalog to expose feature names.

    Parameters:
    - entity_id: dict mapping entity key column names to their Teradata types.
    - feature_type: the feature type (drives the table/view naming).
    - schema: schema in which the table and view are created.
    - if_exists (optional): 'replace' drops and recreates an existing table.
    - feature_catalog_name (optional): name of the feature catalog table.

    Returns:
    The name of the created or replaced feature store table.
    """

    table_name, view_name = get_feature_store_table_name(entity_id, feature_type)

    # Column definitions and projections derived from the entity key columns.
    ENTITY_ID = ', \n'.join([k + ' ' + v for k, v in entity_id.items()])
    ENTITY_ID_ = ', \n'.join(['B.' + k for k in entity_id.keys()])
    ENTITY_ID__ = ','.join(entity_id.keys())

    # DDL for the feature store table.
    query = f"""
    CREATE MULTISET TABLE {schema}.{table_name},
            FALLBACK,
            NO BEFORE JOURNAL,
            NO AFTER JOURNAL,
            CHECKSUM = DEFAULT,
            DEFAULT MERGEBLOCKRATIO,
            MAP = TD_MAP1
            (
                {ENTITY_ID},
                FEATURE_ID BIGINT,
                FEATURE_VALUE FLOAT,
                FEATURE_VERSION VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
                ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
                ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
                PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
            )
            PRIMARY INDEX ({ENTITY_ID__},FEATURE_ID,FEATURE_VERSION);
    """

    # Secondary index to speed up lookups by feature ID.
    query2 = f"CREATE INDEX (FEATURE_ID) ON {schema}.{table_name};"

    # View resolving feature names through the catalog, at current valid time.
    query_view = f"""
    REPLACE VIEW {schema}.{view_name} AS
    CURRENT VALIDTIME
    SELECT
        A.FEATURE_NAME,
        {ENTITY_ID_},
        B.FEATURE_VALUE,
        B.FEATURE_VERSION
    FROM {schema}.{feature_catalog_name} A
    , {schema}.{table_name} B
    WHERE A.FEATURE_ID = B.FEATURE_ID
    """

    try:
        # Attempt to create the table and its secondary index.
        tdml.get_context().execute(query)
        if tdml.display.print_sqlmr_query:
            print(query)
        print(f'TABLE {schema}.{table_name} has been created')
        tdml.get_context().execute(query2)
    except Exception as e:
        # If the table already exists and if_exists is 'replace', drop and recreate.
        print(str(e).split('\n')[0])
        if str(e).split('\n')[0].endswith('already exists.') and (if_exists == 'replace'):
            tdml.get_context().execute(f'DROP TABLE {schema}.{table_name}')
            print(f'TABLE {schema}.{table_name} has been dropped')
            try:
                tdml.get_context().execute(query)
                print(f'TABLE {schema}.{table_name} has been re-created')
                if tdml.display.print_sqlmr_query:
                    print(query)
                # Bug fix: the original skipped the secondary index on the
                # recreate path, leaving replaced tables without it.
                tdml.get_context().execute(query2)
            except Exception as e:
                print(str(e).split('\n')[0])

    try:
        # Create (or replace) the companion view.
        tdml.get_context().execute(query_view)
        if tdml.display.print_sqlmr_query:
            # Bug fix: the original echoed the CREATE TABLE statement here
            # instead of the view DDL actually being executed.
            print(query_view)
        print(f'VIEW {schema}.{view_name} has been created')
    except Exception as e:
        print(str(e).split('\n')[0])

    return table_name
204
-
205
def register_features(entity_id, feature_names_types, schema, feature_catalog_name='FS_FEATURE_CATALOG'):
    """
    Register features in the feature catalog table.

    Stages the feature metadata in a temporary table, then updates catalog
    rows whose FEATURE_ID already exists and inserts rows for new ones.

    Parameters:
    - entity_id: dict whose keys are the entity key column names.
    - feature_names_types: dict mapping feature name -> {'type': ..., 'id': ...}.
    - schema: schema hosting the feature catalog table.
    - feature_catalog_name (optional): name of the feature catalog table.

    Returns:
    A pandas DataFrame of the registered features and their metadata, or
    None when there is nothing to register.
    """

    if len(feature_names_types) == 0:
        print('no new feature to register')
        return

    # Comma-separated entity key column names, stored as ENTITY_NAME.
    ENTITY_ID__ = ','.join(entity_id.keys())

    # One row per feature. Building the rows explicitly handles the
    # single-feature case too (the original special-cased it around
    # DataFrame.transpose).
    df = pd.DataFrame(
        [{'FEATURE_NAME': name, 'TYPE': meta['type'], 'FEATURE_ID': meta['id']}
         for name, meta in feature_names_types.items()],
        columns=['FEATURE_NAME', 'TYPE', 'FEATURE_ID'],
    )

    # Derive the hosting table and view names from the entity and feature type.
    # (Named column access replaces the original positional row[1], which
    # relied on deprecated pandas label/position fallback.)
    df['FEATURE_TABLE'] = df.apply(lambda row: get_feature_store_table_name(entity_id, row['TYPE'])[0], axis=1)
    df['FEATURE_VIEW'] = df.apply(lambda row: get_feature_store_table_name(entity_id, row['TYPE'])[1], axis=1)

    df['ENTITY_NAME'] = ENTITY_ID__
    df['FEATURE_DATABASE'] = schema

    # Stage the rows in a temporary table.
    # NOTE(review): the fixed staging name 'temp' is shared by all callers in
    # the schema — concurrent registrations would clobber each other; confirm
    # whether a session-scoped name is needed.
    tdml.copy_to_sql(df, table_name='temp', schema_name=schema, if_exists='replace', primary_index='FEATURE_ID', types={'FEATURE_ID': tdml.BIGINT})

    # Update catalog entries whose FEATURE_ID already exists.
    query_update = f"""
    CURRENT VALIDTIME
    UPDATE {schema}.{feature_catalog_name}
    FROM (
        CURRENT VALIDTIME
        SELECT
            NEW_FEATURES.FEATURE_ID
        ,   NEW_FEATURES.FEATURE_NAME
        ,   NEW_FEATURES.FEATURE_TABLE
        ,   NEW_FEATURES.FEATURE_DATABASE
        ,   NEW_FEATURES.FEATURE_VIEW
        ,   NEW_FEATURES.ENTITY_NAME
        FROM {schema}.temp NEW_FEATURES
        LEFT JOIN {schema}.{feature_catalog_name} EXISTING_FEATURES
        ON NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
        WHERE EXISTING_FEATURES.FEATURE_NAME IS NOT NULL
    ) UPDATED_FEATURES
    SET
        FEATURE_NAME     = UPDATED_FEATURES.FEATURE_NAME,
        FEATURE_TABLE    = UPDATED_FEATURES.FEATURE_TABLE,
        FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
        FEATURE_VIEW     = UPDATED_FEATURES.FEATURE_VIEW,
        ENTITY_NAME      = UPDATED_FEATURES.ENTITY_NAME
    WHERE {feature_catalog_name}.FEATURE_ID = UPDATED_FEATURES.FEATURE_ID;
    """

    # Insert catalog entries for brand-new FEATURE_IDs.
    query_insert = f"""
    CURRENT VALIDTIME
    INSERT INTO {schema}.{feature_catalog_name} (FEATURE_ID, FEATURE_NAME, FEATURE_TABLE, FEATURE_DATABASE, FEATURE_VIEW, ENTITY_NAME)
    SELECT
        NEW_FEATURES.FEATURE_ID
    ,   NEW_FEATURES.FEATURE_NAME
    ,   NEW_FEATURES.FEATURE_TABLE
    ,   NEW_FEATURES.FEATURE_DATABASE
    ,   NEW_FEATURES.FEATURE_VIEW
    ,   NEW_FEATURES.ENTITY_NAME
    FROM {schema}.temp NEW_FEATURES
    LEFT JOIN {schema}.{feature_catalog_name} EXISTING_FEATURES
    ON NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
    WHERE EXISTING_FEATURES.FEATURE_NAME IS NULL;
    """

    tdml.get_context().execute(query_insert)
    tdml.get_context().execute(query_update)

    return df
299
-
300
def prepare_feature_ingestion(df, entity_id, feature_names, feature_version_default='dev.0.0', feature_versions=None, **kwargs):
    """
    Prepare feature data for ingestion into the feature store.

    Unpivots the listed feature columns of *df* into rows of
    (entity keys, FEATURE_NAME, FEATURE_VALUE, FEATURE_VERSION).

    Parameters:
    - df: input tdml.DataFrame holding the feature columns.
    - entity_id: dict whose keys are the entity key column names.
    - feature_names: list of feature column names to unpivot.
    - feature_version_default (optional): version used when a feature has no
      entry in feature_versions. Default 'dev.0.0'.
    - feature_versions (optional): dict of feature name -> version overrides.
    - **kwargs: ignored; accepted for interface compatibility.

    Returns:
    A transformed tdml.DataFrame ready for ingestion.
    """

    # UNPIVOT clause: one "(col) as 'col'" entry per feature column.
    unpivot_columns = ", \n".join(f"({name}) as '{name}'" for name in feature_names)

    # Projection: entity keys plus the unpivoted name/value pair.
    output_columns = ', \n'.join(list(entity_id.keys()) + ['FEATURE_NAME', 'FEATURE_VALUE'])

    # Per-feature versions, defaulting then applying explicit overrides.
    versions = {name: feature_version_default for name in feature_names}
    if feature_versions is not None:
        versions.update(feature_versions)

    # CASE expression mapping each feature name to its version.
    version_query = '\n'.join(
        ["CASE"]
        + [f"WHEN FEATURE_NAME = '{name}' THEN '{version}' " for name, version in versions.items()]
        + ["END AS FEATURE_VERSION"]
    )

    # Assemble the UNPIVOT query.
    query_unpivot = f"""
    SELECT
    {output_columns},
    {version_query}
    FROM {df._table_name} UNPIVOT ((FEATURE_VALUE)  FOR  FEATURE_NAME
    IN ({unpivot_columns})) Tmp;
    """
    if tdml.display.print_sqlmr_query:
        print(query_unpivot)

    return tdml.DataFrame.from_query(query_unpivot)
346
-
347
def store_feature(entity_id, prepared_features, schema, feature_catalog_name='FS_FEATURE_CATALOG', date_in_the_past=None, **kwargs):
    """
    Store prepared feature rows in their feature tables.

    Joins the prepared rows to the catalog to locate each feature's target
    table, then updates existing (entity, feature, version) rows and inserts
    new ones, table by table.

    Parameters:
    - entity_id: dict whose keys are the entity key column names.
    - prepared_features: tdml.DataFrame produced by prepare_feature_ingestion.
    - schema: schema hosting the feature catalog table.
    - feature_catalog_name (optional): name of the feature catalog table.
    - date_in_the_past (optional): when given, values are stored
      VALIDTIME AS OF that date instead of CURRENT VALIDTIME.
    - **kwargs: ignored; accepted for interface compatibility.

    Returns:
    None
    """

    feature_catalog = tdml.DataFrame(tdml.in_schema(schema, feature_catalog_name))

    if date_in_the_past is None:
        validtime_statement = 'CURRENT VALIDTIME'
    else:
        validtime_statement = f"VALIDTIME AS OF DATE '{date_in_the_past}'"

    # Attach catalog metadata (ID and target table) to each prepared row.
    query = f"""
    {validtime_statement}
    SELECT
        A.*
    ,   B.FEATURE_ID
    ,   B.FEATURE_TABLE
    ,   B.FEATURE_DATABASE
    FROM {prepared_features._table_name} A,
    {schema}.{feature_catalog_name} B
    WHERE A.FEATURE_NAME = B.FEATURE_NAME
    """

    df = tdml.DataFrame.from_query(query)

    # One row per distinct target (table, database) pair.
    target_tables = df[['FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID']].groupby(['FEATURE_TABLE', 'FEATURE_DATABASE']).count().to_pandas()
    print(target_tables)

    ENTITY_ID = ', \n'.join(entity_id.keys())
    # Each entity column fully qualified. Bug fix: the original inlined the
    # bare comma-joined list after a single alias (e.g. "NEW_FEATURES.k1, k2"
    # and "tbl.k1, k2 = UPD.k1, k2"), which is invalid SQL whenever the entity
    # key spans more than one column.
    ENTITY_ID_SELECT = ', \n'.join([f'NEW_FEATURES.{k}' for k in entity_id.keys()])
    ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k in entity_id.keys()])
    ENTITY_ID_WHERE_INS = ' OR '.join([f'EXISTING_FEATURES.{k} IS NOT NULL' for k in entity_id.keys()])
    ENTITY_ID_WHERE_UP = ' OR '.join([f'EXISTING_FEATURES.{k} IS NULL' for k in entity_id.keys()])

    # Apply updates and inserts per target table.
    for i, row in target_tables.iterrows():
        feature_table = row.iloc[0]     # FEATURE_TABLE
        feature_database = row.iloc[1]  # FEATURE_DATABASE

        # Per-column equality between target table and the update source.
        entity_match = ' AND '.join([f'{feature_table}.{k} = UPDATED_FEATURES.{k}' for k in entity_id.keys()])

        # Update feature values for existing (entity, feature, version) rows.
        query_update = f"""
        {validtime_statement}
        UPDATE {feature_database}.{feature_table}
        FROM (
            CURRENT VALIDTIME
            SELECT
                {ENTITY_ID_SELECT},
                NEW_FEATURES.FEATURE_ID,
                NEW_FEATURES.FEATURE_VALUE,
                NEW_FEATURES.FEATURE_VERSION
            FROM {df._table_name} NEW_FEATURES
            LEFT JOIN {feature_database}.{feature_table} EXISTING_FEATURES
            ON {ENTITY_ID_ON}
            AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
            AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
            WHERE ({ENTITY_ID_WHERE_INS})
            AND NEW_FEATURES.FEATURE_DATABASE = '{feature_database}'
            AND NEW_FEATURES.FEATURE_TABLE = '{feature_table}'
        ) UPDATED_FEATURES
        SET
            FEATURE_VALUE = UPDATED_FEATURES.FEATURE_VALUE
        WHERE {entity_match}
        AND {feature_table}.FEATURE_ID = UPDATED_FEATURES.FEATURE_ID
        AND {feature_table}.FEATURE_VERSION = UPDATED_FEATURES.FEATURE_VERSION;
        """

        # Insert feature values for new (entity, feature, version) combinations.
        query_insert = f"""
        {validtime_statement}
        INSERT INTO {feature_database}.{feature_table} ({ENTITY_ID}, FEATURE_ID, FEATURE_VALUE, FEATURE_VERSION)
        SELECT
            {ENTITY_ID_SELECT},
            NEW_FEATURES.FEATURE_ID,
            NEW_FEATURES.FEATURE_VALUE,
            NEW_FEATURES.FEATURE_VERSION
        FROM {df._table_name} NEW_FEATURES
        LEFT JOIN {feature_database}.{feature_table} EXISTING_FEATURES
        ON {ENTITY_ID_ON}
        AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
        AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
        WHERE ({ENTITY_ID_WHERE_UP})
        AND NEW_FEATURES.FEATURE_DATABASE = '{feature_database}'
        AND NEW_FEATURES.FEATURE_TABLE = '{feature_table}'
        """

        print(f'insert feature values of new {ENTITY_ID} combinations in {feature_database}.{feature_table}')
        if tdml.display.print_sqlmr_query:
            print(query_insert)
        tdml.get_context().execute(query_insert)
        print(f'update feature values of existing {ENTITY_ID} combinations in {feature_database}.{feature_table}')
        if tdml.display.print_sqlmr_query:
            print(query_update)
        # NOTE(review): the update's inner SELECT uses CURRENT VALIDTIME even
        # when date_in_the_past is set — confirm whether it should follow
        # validtime_statement instead.
        tdml.get_context().execute(query_update)

    return
460
-
461
def build_dataset(entity_id, selected_features, schema, view_name, feature_catalog_name='FS_FEATURE_CATALOG', comment='dataset', date_in_the_past=None, **kwargs):
    """
    Build a dataset view pivoting the selected features per entity.

    Locates each selected feature's table through the catalog, unions the
    matching rows, and pivots them so that each feature becomes a column.

    Parameters:
    - entity_id: dict whose keys are the entity key column names.
    - selected_features: dict of feature name -> feature version.
    - schema: schema in which the dataset view is created.
    - view_name: name of the dataset view, or None to return the query result
      without creating a view.
    - feature_catalog_name (optional): name of the feature catalog table.
    - comment (optional): comment attached to the created view.
    - date_in_the_past (optional): when given, feature values are read
      VALIDTIME AS OF that date instead of CURRENT VALIDTIME.
    - **kwargs: ignored; accepted for interface compatibility.

    Returns:
    A tdml.DataFrame over the created view, or over the pivot query when
    view_name is None.
    """

    feature_catalog = tdml.DataFrame.from_query(f'CURRENT VALIDTIME SELECT * FROM {schema}.{feature_catalog_name}')

    # Idiom fix: identity comparison with None (was '== None').
    if date_in_the_past is None:
        validtime_statement = 'CURRENT VALIDTIME'
    else:
        validtime_statement = f"VALIDTIME AS OF DATE '{date_in_the_past}'"

    # Locate the selected features for this entity in the catalog.
    ENTITY_NAMES = ','.join([k for k in entity_id.keys()])
    feature_location = feature_catalog[(feature_catalog.FEATURE_NAME.isin(list(selected_features.keys()))) & (feature_catalog.ENTITY_NAME == ENTITY_NAMES)].to_pandas()
    feature_location['FEATURE_VERSION'] = feature_location['FEATURE_NAME'].map(selected_features)

    # Union the matching rows across all hosting tables.
    query = []
    for g, df in feature_location.groupby(['FEATURE_DATABASE', 'FEATURE_TABLE']):
        condition = ' \n OR '.join([f"(FEATURE_ID = {row['FEATURE_ID']} AND FEATURE_VERSION = '{row['FEATURE_VERSION']}')" for i, row in df.iterrows()])
        query_ = f"""
        SELECT * FROM {g[0]}.{g[1]}
        WHERE {condition}
        """
        query.append(query_)
    query = 'UNION ALL '.join(query)

    ENTITY_ID = ', \n'.join([k for k in entity_id.keys()])
    ENTITY_ID_ = ', \n'.join(['B.' + k for k in entity_id.keys()])

    # Join the unioned feature rows back to the catalog for the names.
    query_dataset = f"""
    {validtime_statement}
    SELECT
        A.FEATURE_NAME,
        {ENTITY_ID_},
        B.FEATURE_VALUE
    FROM {schema}.{feature_catalog_name} A
    , ({query}) B
    WHERE A.FEATURE_ID = B.FEATURE_ID
    """

    # Pivot output: one column per selected feature, NULL when no row matched.
    output_name = ',\n'.join([f"'{k}' as {k}" for k in selected_features.keys()])
    output_name_ = ',\n'.join([f'CASE WHEN {k}_cnt=1 THEN {k} END AS {k}' for k in selected_features.keys()])

    query_create_view = f'REPLACE VIEW {schema}.{view_name} AS'
    query_pivot = f"""
    SELECT
    {ENTITY_ID}
    , {output_name_}
    FROM ({query_dataset}) AA PIVOT (
        AVG(FEATURE_VALUE),
        COUNT(FEATURE_VALUE) as cnt
        FOR FEATURE_NAME IN (
            {output_name}
        )
    )Tmp;
    """
    if tdml.display.print_sqlmr_query:
        print(query_create_view + '\n' + query_pivot)

    # Idiom fix: identity comparison with None (was '!= None').
    if view_name is not None:
        tdml.get_context().execute(query_create_view + '\n' + query_pivot)
        tdml.get_context().execute(f"COMMENT ON VIEW {schema}.{view_name} IS '{comment}'")
        print(f'the dataset view {schema}.{view_name} has been created')

        return tdml.DataFrame(tdml.in_schema(schema, view_name))
    else:
        return tdml.DataFrame.from_query(query_pivot)
548
def GetTheLargestFeatureID(schema, table_name='FS_FEATURE_CATALOG'):
    """
    Return the maximum FEATURE_ID stored in the feature catalog table.

    Parameters:
    - schema: schema hosting the feature catalog table.
    - table_name (optional): name of the feature catalog table.

    Returns:
    The maximum feature ID, or 0 when the catalog is empty.
    """
    # MAX over an empty table yields SQL NULL, which fetches as None.
    feature_id = tdml.get_context().execute(
        f'SEL MAX(FEATURE_ID) AS MAX_FEATURE_ID FROM {schema}.{table_name}'
    ).fetchall()[0][0]

    # Idiom fix: identity comparison with None (was '== None').
    return 0 if feature_id is None else feature_id
569
-
570
-
571
def GetAlreadyExistingFeatureNames(feature_name, schema, table_name='FS_FEATURE_CATALOG'):
    """
    Return the subset of *feature_name* already present in the feature catalog.

    Parameters:
    - feature_name: iterable of candidate feature names.
    - schema: schema hosting the feature catalog table.
    - table_name (optional): name of the feature catalog table.

    Returns:
    A list of the names that already exist in the catalog.
    """
    # Stage the candidate names in a scratch table so the check is a join.
    candidates = pd.DataFrame({'FEATURE_NAME': feature_name})
    tmp_name = 'tdfs__fgjnojnsmdoignmosnig'
    tdml.copy_to_sql(candidates, schema_name=schema, table_name=tmp_name, if_exists='replace',
                     types={'FEATURE_NAME': tdml.VARCHAR(length=255, charset='LATIN')})

    # Intersect the staged names with the catalog.
    matched = tdml.DataFrame.from_query(f"""
    SEL A.FEATURE_NAME
    FROM {schema}.{tmp_name} A
    INNER JOIN {schema}.{table_name} B
    ON A.FEATURE_NAME = B.FEATURE_NAME
    """).to_pandas()

    return list(matched.FEATURE_NAME.values)
604
-
605
-
606
def Gettdtypes(tddf, features_columns, schema, table_name='FS_FEATURE_CATALOG'):
    """
    Map candidate feature columns of *tddf* to Teradata types and new IDs.

    Columns already registered in the feature catalog are skipped; the rest
    receive consecutive feature IDs starting just past the current maximum.

    Parameters:
    - tddf: input tdml.DataFrame.
    - features_columns: list of column names eligible as features.
    - schema: schema hosting the feature catalog table.
    - table_name (optional): name of the feature catalog table.

    Returns:
    A dict of column name -> {'type': Teradata type, 'id': new feature ID}.
    """
    # Infer pandas dtypes from a small sample of the table.
    column_types = dict(tddf.to_pandas(num_rows=10).dtypes)

    # Names already registered in the catalog.
    existing_features = GetAlreadyExistingFeatureNames(tddf.columns, schema, table_name=table_name)

    # First free feature ID.
    next_id = GetTheLargestFeatureID(schema, table_name=table_name) + 1

    res = {}
    for name, dtype in column_types.items():
        # Skip columns that are already registered or not selected as features.
        if name in existing_features or name not in features_columns:
            continue
        dtype_str = str(dtype)
        if 'int' in dtype_str:
            res[name] = {'type': 'BIGINT', 'id': next_id}
        elif 'float' in dtype_str:
            res[name] = {'type': 'FLOAT', 'id': next_id}
        else:
            # Unsupported dtype: report it; its ID slot is consumed anyway,
            # preserving the original numbering behaviour.
            print(f'{name} has a type that is not yet managed')
        next_id += 1

    return res
658
-
659
-
660
def upload_feature(df, entity_id, feature_names, schema_name, feature_catalog_name='FS_FEATURE_CATALOG',
                   feature_versions='dev.0.0'):
    """
    Upload features from a Teradata DataFrame to the feature store.

    Registers the features in the catalog, ingests their values, and returns
    a dataset built from the uploaded features.

    Parameters:
    - df: input tdml.DataFrame.
    - entity_id: dict whose keys are the entity key column names.
    - feature_names: list of feature column names to upload.
    - schema_name: schema hosting the feature store.
    - feature_catalog_name (optional): name of the feature catalog table.
    - feature_versions (optional): a single version string applied to every
      feature, or a list aligned with feature_names. Default 'dev.0.0'.

    Returns:
    A tdml.DataFrame representing the built dataset.
    """
    # Map each feature name to its version (isinstance replaces the
    # non-idiomatic 'type(x) == list' check).
    if isinstance(feature_versions, list):
        selected_features = dict(zip(feature_names, feature_versions))
    else:
        selected_features = {name: feature_versions for name in feature_names}

    # Bug fix: the original called every helper through an undefined
    # 'feature_store.' module prefix — this module IS feature_store, so the
    # helpers are called directly.
    feature_names_types = Gettdtypes(
        df,
        features_columns=feature_names,
        schema=schema_name
    )

    # Register the features in the feature catalog.
    register_features(
        entity_id,
        feature_names_types,
        schema=schema_name,
        feature_catalog_name=feature_catalog_name
    )

    # Prepare the features for ingestion.
    prepared_features = prepare_feature_ingestion(
        df,
        entity_id,
        feature_names,
        feature_versions=selected_features
    )

    # Store the prepared features. Bug fix: the original passed the undefined
    # global Param['database']; the target schema is schema_name.
    store_feature(
        entity_id,
        prepared_features,
        schema=schema_name,
        feature_catalog_name=feature_catalog_name
    )

    # Build and return the dataset (no view is materialized).
    dataset = build_dataset(
        entity_id,
        selected_features,
        schema=schema_name,
        view_name=None
    )

    return dataset