tdfs4ds 0.2.4.41__py3-none-any.whl → 0.2.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/feature_store.py DELETED
@@ -1,1529 +0,0 @@
- import teradataml as tdml
- import pandas as pd
- from tdfs4ds.utils import execute_query, display_table, get_column_types, get_column_types_simple
- from teradataml.context.context import _get_database_username
- import inspect
- import warnings
- from tdfs4ds.process_store import register_process_view, run
-
- warnings.filterwarnings("ignore")
-
- data_domain = None
- schema = None
- feature_catalog_name = 'FS_FEATURE_CATALOG'
- end_period = 'UNTIL_CHANGED'  # '9999-01-01 00:00:00'
- date_in_the_past = None
- feature_version_default = 'dev.0.0'
- display_logs = True
-
-
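The module is configured through these globals rather than per-call parameters. A minimal setup sketch (editorial illustration; the database and domain names are placeholder assumptions):

    import tdfs4ds.feature_store as fs

    fs.schema = 'MY_DB'            # database hosting the feature store objects
    fs.data_domain = 'CHURN_DEMO'  # logical namespace for entities and features
    fs.date_in_the_past = None     # None -> CURRENT VALIDTIME everywhere
    fs.display_logs = False        # silence the print-based logging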
- def feature_store_catalog_creation(if_exists='replace', comment='this table is a feature catalog'):
-     """
-     This function creates the feature store catalog table in the Teradata database.
-     The catalog table stores information about features such as their names, associated tables, databases, validity periods, etc.
-     The target schema and table name are taken from the module-level `schema` and `feature_catalog_name` globals.
-
-     Parameters:
-     - if_exists (optional): Specifies the behavior if the catalog table already exists. The default is 'replace', which means the existing table will be dropped and re-created.
-     - comment (optional): The comment attached to the catalog table.
-
-     Returns:
-     The name of the created or replaced catalog table.
-     """
-
-     # SQL query to create the catalog table
-     query = f"""
-     CREATE MULTISET TABLE {schema}.{feature_catalog_name},
-             FALLBACK,
-             NO BEFORE JOURNAL,
-             NO AFTER JOURNAL,
-             CHECKSUM = DEFAULT,
-             DEFAULT MERGEBLOCKRATIO,
-             MAP = TD_MAP1
-             (
-                 FEATURE_ID BIGINT,
-                 FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
-                 FEATURE_TABLE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
-                 FEATURE_DATABASE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
-                 FEATURE_VIEW VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
-                 ENTITY_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
-                 DATA_DOMAIN VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
-                 ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
-                 ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
-                 PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
-             )
-             PRIMARY INDEX (FEATURE_ID);
-     """
-
-     # SQL query to create a secondary index on the feature name
-     query2 = f"CREATE INDEX (FEATURE_NAME) ON {schema}.{feature_catalog_name};"
-
-     # SQL query to comment the table
-     query3 = f"COMMENT ON TABLE {schema}.{feature_catalog_name} IS '{comment}'"
-
-     try:
-         # Attempt to execute the create table query
-         execute_query(query)
-         if tdml.display.print_sqlmr_query:
-             print(query)
-         if display_logs: print(f'TABLE {schema}.{feature_catalog_name} has been created')
-         execute_query(query3)
-     except Exception as e:
-         # If the table already exists and if_exists is set to 'replace', drop the table and recreate it
-         if display_logs: print(str(e).split('\n')[0])
-         if str(e).split('\n')[0].endswith('already exists.') and (if_exists == 'replace'):
-             execute_query(f'DROP TABLE {schema}.{feature_catalog_name}')
-             if display_logs: print(f'TABLE {schema}.{feature_catalog_name} has been dropped')
-             try:
-                 # Attempt to recreate the table after dropping it
-                 execute_query(query)
-                 if display_logs: print(f'TABLE {schema}.{feature_catalog_name} has been re-created')
-                 if tdml.display.print_sqlmr_query:
-                     print(query)
-                 execute_query(query3)
-             except Exception as e:
-                 print(str(e).split('\n')[0])
-
-     try:
-         # Attempt to create the secondary index
-         execute_query(query2)
-         if tdml.display.print_sqlmr_query:
-             print(query2)
-         if display_logs: print(f'SECONDARY INDEX ON TABLE {schema}.{feature_catalog_name} has been created')
-     except Exception as e:
-         print(str(e).split('\n')[0])
-
-     return feature_catalog_name
-
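A typical bootstrap sequence, sketched under the assumption of an active teradataml connection (credentials and names are placeholders):

    import teradataml as tdml
    import tdfs4ds.feature_store as fs

    tdml.create_context(host='...', username='...', password='...')
    fs.schema = 'MY_DB'
    fs.data_domain = 'CHURN_DEMO'

    # Creates MY_DB.FS_FEATURE_CATALOG, dropping any existing catalog first.
    fs.feature_store_catalog_creation(if_exists='replace')
    print(fs.list_features())  # empty catalog at this point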
- def list_features():
-     """
-     Return a teradataml DataFrame with the current (CURRENT VALIDTIME) content of the feature catalog.
-     """
-     query = f"CURRENT VALIDTIME SEL * FROM {schema}.{feature_catalog_name}"
-
-     return tdml.DataFrame.from_query(query)
-
- def get_feature_store_table_name(entity_id, feature_type):
-     """
-     This function generates the table and view names for a feature store table based on the provided entity ID and feature type.
-     The module-level `data_domain` is embedded in both names.
-
-     Parameters:
-     - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to construct the table and view names. A list of key names, or a single name, is also accepted.
-     - feature_type: The type of the feature.
-
-     Returns:
-     A tuple containing the generated table name and view name.
-     """
-
-     if type(entity_id) == list:
-         list_entity_id = entity_id
-     elif type(entity_id) == dict:
-         list_entity_id = list(entity_id.keys())
-     else:
-         list_entity_id = [entity_id]
-
-     # Construct the table name by concatenating 'FS', 'T', the data domain, the entity ID keys, and the feature type
-     table_name = ['FS', 'T'] + [data_domain] + list_entity_id + [feature_type]
-     table_name = '_'.join(table_name)
-
-     # Construct the view name by concatenating 'FS', 'V', the data domain, the entity ID keys, and the feature type
-     view_name = ['FS', 'V'] + [data_domain] + list_entity_id + [feature_type]
-     view_name = '_'.join(view_name)
-
-     return table_name, view_name
-
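For illustration, with data_domain set to 'CHURN_DEMO' (an assumed value) the naming scheme resolves as:

    >>> get_feature_store_table_name({'CUSTOMER_ID': 'BIGINT'}, 'FLOAT')
    ('FS_T_CHURN_DEMO_CUSTOMER_ID_FLOAT', 'FS_V_CHURN_DEMO_CUSTOMER_ID_FLOAT')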
- def feature_store_table_creation(entity_id, feature_type, if_exists='fail'):
-     """
-     This function creates a feature store table and a corresponding view in the Teradata database schema based on the provided entity ID, feature type, and feature catalog.
-     The target schema and the catalog name are taken from the module-level `schema` and `feature_catalog_name` globals.
-
-     Parameters:
-     - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to construct the table and view names.
-     - feature_type: The type of the feature ('FLOAT', 'BIGINT' or 'VARCHAR').
-     - if_exists (optional): Specifies the behavior if the table already exists. The default is 'fail'; with 'replace', the existing table is dropped and re-created.
-
-     Returns:
-     The name of the created (or already existing) feature store table.
-     """
-
-     table_name, view_name = get_feature_store_table_name(entity_id, feature_type)
-     if tdml.db_list_tables(schema_name=schema, object_name=table_name + '%').shape[0] > 0:
-         if display_logs: print(f'table {table_name} in the {schema} database already exists. No need to create it.')
-         return table_name
-     else:
-         if display_logs: print(f'table {table_name} in the {schema} database does not exist. Need to create it.')
-
-     query_feature_value = {
-         'FLOAT': 'FEATURE_VALUE FLOAT',
-         'BIGINT': 'FEATURE_VALUE BIGINT',
-         'VARCHAR': 'FEATURE_VALUE VARCHAR(2048) CHARACTER SET LATIN'
-     }
-
-     # Construct the column definitions for the table based on the entity ID
-     ENTITY_ID = ', \n'.join([k + ' ' + v for k, v in entity_id.items()])
-     ENTITY_ID_ = ', \n'.join(['B.' + k for k, v in entity_id.items()])
-     ENTITY_ID__ = ','.join([k for k, v in entity_id.items()])
-
-     # SQL query to create the feature store table
-     query = f"""
-     CREATE MULTISET TABLE {schema}.{table_name},
-             FALLBACK,
-             NO BEFORE JOURNAL,
-             NO AFTER JOURNAL,
-             CHECKSUM = DEFAULT,
-             DEFAULT MERGEBLOCKRATIO,
-             MAP = TD_MAP1
-             (
-                 {ENTITY_ID},
-                 FEATURE_ID BIGINT,
-                 {query_feature_value[feature_type]},
-                 FEATURE_VERSION VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
-                 ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
-                 ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
-                 PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
-             )
-             PRIMARY INDEX ({ENTITY_ID__},FEATURE_ID,FEATURE_VERSION);
-     """
-
-     # SQL query to create a secondary index on the feature ID
-     query2 = f"CREATE INDEX (FEATURE_ID) ON {schema}.{table_name};"
-
-     # SQL query to create the view
-     query_view = f"""
-     REPLACE VIEW {schema}.{view_name} AS
-     CURRENT VALIDTIME
-     SELECT
-         A.FEATURE_NAME,
-         {ENTITY_ID_},
-         B.FEATURE_VALUE,
-         B.FEATURE_VERSION
-     FROM {schema}.{feature_catalog_name} A
-     , {schema}.{table_name} B
-     WHERE A.FEATURE_ID = B.FEATURE_ID
-     """
-
-     try:
-         # Attempt to execute the create table query
-         execute_query(query)
-         if tdml.display.print_sqlmr_query:
-             print(query)
-         if display_logs: print(f'TABLE {schema}.{table_name} has been created')
-         execute_query(query2)
-     except Exception as e:
-         # If the table already exists and if_exists is set to 'replace', drop the table and recreate it
-         print(str(e).split('\n')[0])
-         if str(e).split('\n')[0].endswith('already exists.') and (if_exists == 'replace'):
-             execute_query(f'DROP TABLE {schema}.{table_name}')
-             if display_logs: print(f'TABLE {schema}.{table_name} has been dropped')
-             try:
-                 # Attempt to recreate the table after dropping it
-                 execute_query(query)
-                 if display_logs: print(f'TABLE {schema}.{table_name} has been re-created')
-                 if tdml.display.print_sqlmr_query:
-                     print(query)
-             except Exception as e:
-                 print(str(e).split('\n')[0])
-
-     try:
-         # Attempt to create the view
-         execute_query(query_view)
-         if tdml.display.print_sqlmr_query:
-             print(query_view)
-         if display_logs: print(f'VIEW {schema}.{view_name} has been created')
-     except Exception as e:
-         print(str(e).split('\n')[0])
-
-     return table_name
-
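A single call therefore yields one typed table plus its convenience view (sketch; entity and type from the running example):

    entity = {'CUSTOMER_ID': 'BIGINT'}
    # Creates FS_T_CHURN_DEMO_CUSTOMER_ID_FLOAT and FS_V_CHURN_DEMO_CUSTOMER_ID_FLOAT,
    # or leaves an existing table untouched; returns the table name.
    tbl = feature_store_table_creation(entity, feature_type='FLOAT')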
- def register_features(entity_id, feature_names_types):
-     """
-     This function registers features in the feature catalog table of the Teradata database. It creates or updates entries in the catalog based on the provided entity ID and the feature names and types.
-     The catalog location is taken from the module-level `schema` and `feature_catalog_name` globals.
-
-     Parameters:
-     - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to identify the entity.
-     - feature_names_types: A dictionary containing feature names and their corresponding types and IDs, e.g. {'my_feature': {'type': 'FLOAT', 'id': 42}}.
-
-     Returns:
-     A DataFrame containing the registered features and their metadata.
-     """
-
-     if date_in_the_past is None:
-         validtime_statement = 'CURRENT VALIDTIME'
-     else:
-         validtime_statement = f"VALIDTIME PERIOD '({date_in_the_past},{end_period})'"
-
-     if len(list(feature_names_types.keys())) == 0:
-         if display_logs: print('no new feature to register')
-         return
-
-     # Create a comma-separated string of entity IDs
-     ENTITY_ID__ = ','.join([k for k, v in entity_id.items()])
-
-     # Create a DataFrame from the feature_names_types dictionary
-     if len(feature_names_types.keys()) > 1:
-         df = pd.DataFrame(feature_names_types).transpose().reset_index()
-         df.columns = ['FEATURE_NAME', 'TYPE', 'FEATURE_ID']
-     else:
-         df = pd.DataFrame(columns=['FEATURE_NAME', 'TYPE', 'FEATURE_ID'])
-         k = list(feature_names_types.keys())[0]
-         df['FEATURE_NAME'] = [k]
-         df['TYPE'] = [feature_names_types[k]['type']]
-         df['FEATURE_ID'] = [feature_names_types[k]['id']]
-
-     # Generate the feature table and view names based on the entity ID and feature type
-     df['FEATURE_TABLE'] = df.apply(lambda row: get_feature_store_table_name(entity_id, row.iloc[1])[0], axis=1)
-     df['FEATURE_VIEW'] = df.apply(lambda row: get_feature_store_table_name(entity_id, row.iloc[1])[1], axis=1)
-
-     # Add additional columns to the DataFrame
-     df['ENTITY_NAME'] = ENTITY_ID__
-     df['FEATURE_DATABASE'] = schema
-     df['DATA_DOMAIN'] = data_domain
-
-     # Copy the DataFrame to a temporary table in Teradata
-     tdml.copy_to_sql(df, table_name='temp', schema_name=schema, if_exists='replace', primary_index='FEATURE_ID', types={'FEATURE_ID': tdml.BIGINT})
-
-     # SQL query to update existing entries in the feature catalog
-     query_update = f"""
-     {validtime_statement}
-     UPDATE {schema}.{feature_catalog_name}
-     FROM (
-         CURRENT VALIDTIME
-         SELECT
-             NEW_FEATURES.FEATURE_ID
-         ,   NEW_FEATURES.FEATURE_NAME
-         ,   NEW_FEATURES.FEATURE_TABLE
-         ,   NEW_FEATURES.FEATURE_DATABASE
-         ,   NEW_FEATURES.FEATURE_VIEW
-         ,   NEW_FEATURES.ENTITY_NAME
-         ,   NEW_FEATURES.DATA_DOMAIN
-         FROM {schema}.temp NEW_FEATURES
-         LEFT JOIN {schema}.{feature_catalog_name} EXISTING_FEATURES
-         ON NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
-         AND NEW_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
-         WHERE EXISTING_FEATURES.FEATURE_NAME IS NOT NULL
-     ) UPDATED_FEATURES
-     SET
-         FEATURE_NAME = UPDATED_FEATURES.FEATURE_NAME,
-         FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
-         FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
-         FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW,
-         ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME
-     WHERE {feature_catalog_name}.FEATURE_ID = UPDATED_FEATURES.FEATURE_ID
-     AND {feature_catalog_name}.DATA_DOMAIN = UPDATED_FEATURES.DATA_DOMAIN;
-     """
-
-     # SQL query to insert new entries into the feature catalog
-     if validtime_statement == 'CURRENT VALIDTIME':
-         query_insert = f"""
-         {validtime_statement}
-         INSERT INTO {schema}.{feature_catalog_name} (FEATURE_ID, FEATURE_NAME, FEATURE_TABLE, FEATURE_DATABASE, FEATURE_VIEW, ENTITY_NAME, DATA_DOMAIN)
-         SELECT
-             NEW_FEATURES.FEATURE_ID
-         ,   NEW_FEATURES.FEATURE_NAME
-         ,   NEW_FEATURES.FEATURE_TABLE
-         ,   NEW_FEATURES.FEATURE_DATABASE
-         ,   NEW_FEATURES.FEATURE_VIEW
-         ,   NEW_FEATURES.ENTITY_NAME
-         ,   NEW_FEATURES.DATA_DOMAIN
-         FROM {schema}.temp NEW_FEATURES
-         LEFT JOIN {schema}.{feature_catalog_name} EXISTING_FEATURES
-         ON NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
-         AND NEW_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
-         WHERE EXISTING_FEATURES.FEATURE_NAME IS NULL;
-         """
-     elif date_in_the_past is not None:
-         if end_period == 'UNTIL_CHANGED':
-             end_period_ = '9999-01-01 00:00:00'
-         else:
-             end_period_ = end_period
-         query_insert = f"""
-         INSERT INTO {schema}.{feature_catalog_name} (FEATURE_ID, FEATURE_NAME, FEATURE_TABLE, FEATURE_DATABASE, FEATURE_VIEW, ENTITY_NAME, DATA_DOMAIN, ValidStart, ValidEnd)
-         SELECT
-             NEW_FEATURES.FEATURE_ID
-         ,   NEW_FEATURES.FEATURE_NAME
-         ,   NEW_FEATURES.FEATURE_TABLE
-         ,   NEW_FEATURES.FEATURE_DATABASE
-         ,   NEW_FEATURES.FEATURE_VIEW
-         ,   NEW_FEATURES.ENTITY_NAME
-         ,   NEW_FEATURES.DATA_DOMAIN
-         ,   TIMESTAMP '{date_in_the_past}'
-         ,   TIMESTAMP '{end_period_}'
-         FROM {schema}.temp NEW_FEATURES
-         LEFT JOIN {schema}.{feature_catalog_name} EXISTING_FEATURES
-         ON NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
-         AND NEW_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
-         WHERE EXISTING_FEATURES.FEATURE_NAME IS NULL;
-         """
-
-     # Execute the insert and update queries
-     execute_query(query_insert)
-     execute_query(query_update)
-
-     return df
-
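The expected shape of feature_names_types (illustrative values; the IDs normally come from Gettdtypes):

    register_features(
        {'CUSTOMER_ID': 'BIGINT'},
        {
            'tx_count':   {'type': 'BIGINT', 'id': 1},
            'avg_amount': {'type': 'FLOAT',  'id': 2},
        },
    )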
- def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=None, **kwargs):
-     """
-     This function prepares feature data for ingestion into the feature store. It transforms the input DataFrame by unpivoting the specified feature columns and adds additional columns for entity IDs, feature names, feature values, and feature versions.
-
-     Parameters:
-     - df: The input DataFrame containing the feature data.
-     - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to identify the entity.
-     - feature_names: A list of feature names to unpivot from the DataFrame.
-     - feature_versions (optional): A dictionary specifying feature versions for specific feature names. The keys are feature names, and the values are feature versions. Features without an entry fall back to the module-level `feature_version_default` ('dev.0.0').
-     - **kwargs: Additional keyword arguments.
-
-     Returns:
-     A tuple of a transformed tdml.DataFrame containing the prepared feature data and the name of the volatile table backing it.
-     """
-
-     # Ensure the internal table name of the DataFrame is set, as done in prepare_feature_ingestion_tdstone2.
-     df._DataFrame__execute_node_and_set_table_name(df._nodeid, df._metaexpr)
-
-     # Create the UNPIVOT clause for the specified feature columns
-     unpivot_columns = ", \n".join(["(" + x + ") as '" + x + "'" for x in feature_names])
-
-     if type(entity_id) == list:
-         list_entity_id = entity_id
-     elif type(entity_id) == dict:
-         list_entity_id = list(entity_id.keys())
-     else:
-         list_entity_id = [entity_id]
-
-     # Create the output column list including entity IDs, feature names, and feature values
-     output_columns = ', \n'.join(list_entity_id + ['FEATURE_NAME', 'FEATURE_VALUE'])
-     primary_index = ','.join(list_entity_id)
-
-     # Create a dictionary to store feature versions, using the default version if not specified
-     versions = {f: feature_version_default for f in feature_names}
-     if feature_versions is not None:
-         for k, v in feature_versions.items():
-             versions[k] = v
-
-     # Create the CASE statement to assign feature versions based on feature names
-     version_query = ["CASE"] + [f"WHEN FEATURE_NAME = '{k}' THEN '{v}' " for k, v in versions.items()] + ["END AS FEATURE_VERSION"]
-     version_query = '\n'.join(version_query)
-
-     # Create a volatile table name based on the original table's name, ensuring it is unique.
-     volatile_table_name = df._table_name.split('.')[1].replace('"', '')
-     volatile_table_name = f'temp_{volatile_table_name}'
-
-     # Query materializing the input with every feature cast to VARCHAR, so all types share one UNPIVOT
-     nested_query = f"""
-     CREATE VOLATILE TABLE {volatile_table_name} AS
-     (
-     SELECT
-     {','.join(list_entity_id)},
-     {','.join([f'CAST({x} AS VARCHAR(2048)) AS {x}' for x in feature_names])}
-     FROM {df._table_name}
-     ) WITH DATA
-     PRIMARY INDEX ({primary_index})
-     ON COMMIT PRESERVE ROWS
-     """
-
-     # Execute the SQL query to create the volatile table.
-     tdml.execute_sql(nested_query)
-
-     # Construct the SQL query that unpivots the volatile table into the (entity, name, value, version) layout.
-     query = f"""
-     SELECT
-     {output_columns},
-     {version_query}
-     FROM {tdml.in_schema(_get_database_username(), volatile_table_name)}
-     UNPIVOT ((FEATURE_VALUE ) FOR FEATURE_NAME
-     IN ({unpivot_columns})) Tmp
-     """
-
-     # Optionally print the query if the display flag is set.
-     if tdml.display.print_sqlmr_query:
-         print(query)
-
-     # Return the DataFrame representation of the unpivoted data and the volatile table's name.
-     return tdml.DataFrame.from_query(query), volatile_table_name
-
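A sketch of the unpivoted layout this produces (column values are illustrative):

    prepared, tmp = prepare_feature_ingestion(
        df, {'CUSTOMER_ID': 'BIGINT'}, ['tx_count', 'avg_amount'],
        feature_versions={'tx_count': 'prod.1.0'},
    )
    # CUSTOMER_ID | FEATURE_NAME | FEATURE_VALUE | FEATURE_VERSION
    #        1001 | tx_count     | '42'          | prod.1.0
    #        1001 | avg_amount   | '37.5'        | dev.0.0
    tdml.execute_sql(f'DROP TABLE {tmp}')  # caller cleans up the volatile table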
- def store_feature(entity_id, prepared_features, **kwargs):
-     """
-     This function stores feature data in the corresponding feature tables in the Teradata database. It updates existing feature values and inserts new feature values based on the entity ID and prepared features.
-     The feature tables are resolved through the feature catalog referenced by the module-level `schema` and `feature_catalog_name` globals.
-
-     Parameters:
-     - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to identify the entity.
-     - prepared_features: A tdml.DataFrame containing the prepared feature data, as returned by prepare_feature_ingestion.
-     - **kwargs: Additional keyword arguments.
-
-     Returns:
-     None
-     """
-
-     feature_catalog = tdml.DataFrame(tdml.in_schema(schema, feature_catalog_name))
-
-     if date_in_the_past is None:
-         validtime_statement = 'CURRENT VALIDTIME'
-         validtime_statement2 = validtime_statement
-     else:
-         validtime_statement = f"VALIDTIME PERIOD '({date_in_the_past},{end_period})'"
-         validtime_statement2 = f"VALIDTIME AS OF TIMESTAMP '{date_in_the_past}'"
-
-     # SQL query to select feature data and corresponding feature metadata from the prepared features and feature catalog
-     query = f"""
-     {validtime_statement2}
-     SELECT
-         A.*
-     ,   B.FEATURE_ID
-     ,   B.FEATURE_TABLE
-     ,   B.FEATURE_DATABASE
-     FROM {prepared_features._table_name} A,
-     {schema}.{feature_catalog_name} B
-     WHERE A.FEATURE_NAME = B.FEATURE_NAME
-     AND B.DATA_DOMAIN = '{data_domain}'
-     """
-
-     df = tdml.DataFrame.from_query(query)
-
-     # Group the target tables by feature table and feature database and count the number of occurrences
-     target_tables = df[['FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID']].groupby(['FEATURE_TABLE', 'FEATURE_DATABASE']).count().to_pandas()
-     if display_logs:
-         display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'count_FEATURE_ID']])
-
-     ENTITY_ID = ', \n'.join([k for k, v in entity_id.items()])
-     ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k, v in entity_id.items()])
-     ENTITY_ID_WHERE_INS = ' OR '.join([f'EXISTING_FEATURES.{k} IS NOT NULL' for k, v in entity_id.items()])
-     ENTITY_ID_WHERE_UP = ' OR '.join([f'EXISTING_FEATURES.{k} IS NULL' for k, v in entity_id.items()])
-
-     ENTITY_ID_SELECT = ', \n'.join(['NEW_FEATURES.' + k for k, v in entity_id.items()])
-     # Iterate over target tables and perform update and insert operations
-     for i, row in target_tables.iterrows():
-
-         ENTITY_ID_WHERE_ = ' AND '.join([f'{row.iloc[0]}.{k} = UPDATED_FEATURES.{k}' for k, v in entity_id.items()])
-         # SQL query to update existing feature values
-         query_update = f"""
-         {validtime_statement}
-         UPDATE {row.iloc[1]}.{row.iloc[0]}
-         FROM (
-             {validtime_statement2}
-             SELECT
-                 {ENTITY_ID_SELECT},
-                 NEW_FEATURES.FEATURE_ID,
-                 NEW_FEATURES.FEATURE_VALUE,
-                 NEW_FEATURES.FEATURE_VERSION
-             FROM {df._table_name} NEW_FEATURES
-             LEFT JOIN {row.iloc[1]}.{row.iloc[0]} EXISTING_FEATURES
-             ON {ENTITY_ID_ON}
-             AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
-             AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
-             WHERE ({ENTITY_ID_WHERE_INS})
-             AND NEW_FEATURES.FEATURE_DATABASE = '{row.iloc[1]}'
-             AND NEW_FEATURES.FEATURE_TABLE = '{row.iloc[0]}'
-         ) UPDATED_FEATURES
-         SET
-             FEATURE_VALUE = UPDATED_FEATURES.FEATURE_VALUE
-         WHERE {ENTITY_ID_WHERE_}
-         AND {row.iloc[0]}.FEATURE_ID = UPDATED_FEATURES.FEATURE_ID
-         AND {row.iloc[0]}.FEATURE_VERSION = UPDATED_FEATURES.FEATURE_VERSION;
-         """
-
-         # SQL query to insert new feature values
-         if validtime_statement == 'CURRENT VALIDTIME':
-             query_insert = f"""
-             {validtime_statement}
-             INSERT INTO {row.iloc[1]}.{row.iloc[0]} ({ENTITY_ID}, FEATURE_ID, FEATURE_VALUE, FEATURE_VERSION)
-             SELECT
-                 {ENTITY_ID_SELECT},
-                 NEW_FEATURES.FEATURE_ID,
-                 NEW_FEATURES.FEATURE_VALUE,
-                 NEW_FEATURES.FEATURE_VERSION
-             FROM {df._table_name} NEW_FEATURES
-             LEFT JOIN {row.iloc[1]}.{row.iloc[0]} EXISTING_FEATURES
-             ON {ENTITY_ID_ON}
-             AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
-             AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
-             WHERE ({ENTITY_ID_WHERE_UP})
-             AND NEW_FEATURES.FEATURE_DATABASE = '{row.iloc[1]}'
-             AND NEW_FEATURES.FEATURE_TABLE = '{row.iloc[0]}'
-             """
-         elif date_in_the_past is not None:
-             if end_period == 'UNTIL_CHANGED':
-                 end_period_ = '9999-01-01 00:00:00'
-             else:
-                 end_period_ = end_period
-             query_insert = f"""
-             INSERT INTO {row.iloc[1]}.{row.iloc[0]} ({ENTITY_ID}, FEATURE_ID, FEATURE_VALUE, FEATURE_VERSION, ValidStart, ValidEnd)
-             SELECT
-                 {ENTITY_ID_SELECT},
-                 NEW_FEATURES.FEATURE_ID,
-                 NEW_FEATURES.FEATURE_VALUE,
-                 NEW_FEATURES.FEATURE_VERSION,
-                 TIMESTAMP '{date_in_the_past}',
-                 TIMESTAMP '{end_period_}'
-             FROM {df._table_name} NEW_FEATURES
-             LEFT JOIN {row.iloc[1]}.{row.iloc[0]} EXISTING_FEATURES
-             ON {ENTITY_ID_ON}
-             AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
-             AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
-             WHERE ({ENTITY_ID_WHERE_UP})
-             AND NEW_FEATURES.FEATURE_DATABASE = '{row.iloc[1]}'
-             AND NEW_FEATURES.FEATURE_TABLE = '{row.iloc[0]}'
-             """
-         entity_id_str = ', \n'.join([k for k, v in entity_id.items()])
-         if display_logs: print(f'insert feature values of new {entity_id_str} combinations in {row.iloc[1]}.{row.iloc[0]}')
-         if tdml.display.print_sqlmr_query:
-             print(query_insert)
-         execute_query(query_insert)
-         if display_logs: print(f'update feature values of existing {entity_id_str} combinations in {row.iloc[1]}.{row.iloc[0]}')
-         if tdml.display.print_sqlmr_query:
-             print(query_update)
-         execute_query(query_update)
-
-     return
-
-
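The ingestion primitives compose as follows, a condensed version of what _upload_features does further down (names from the running example):

    entity = {'CUSTOMER_ID': 'BIGINT'}
    feats = Gettdtypes(df, ['tx_count', 'avg_amount'], entity)
    register_features(entity, feats)
    prepared, tmp = prepare_feature_ingestion(df, entity, ['tx_count', 'avg_amount'])
    store_feature(entity, prepared)
    tdml.execute_sql(f'DROP TABLE {tmp}')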
- def build_dataset(entity_id, selected_features, view_name,
-                   comment='dataset', no_temporal=False, time_manager=None, query_only=False):
-     """
-     This function builds a dataset view in the Teradata database. It pivots and joins data from the feature catalog and feature tables based on the specified parameters.
-
-     Parameters:
-     - entity_id (dict or list or other): A dictionary, list, or other format representing the entity ID. The keys of the dictionary are used to identify the entity. Lists and other formats are converted to a list of keys.
-     - selected_features (dict): A dictionary specifying the selected features and their corresponding feature versions.
-     - view_name (str): The name of the dataset view to be created. If None, no view is created and the dataset is returned as a query result.
-     - comment (str, optional): A comment to associate with the dataset view. Defaults to 'dataset'.
-     - no_temporal (bool, optional): Flag to determine if temporal aspects should be ignored. Defaults to False.
-     - time_manager (object, optional): An object to manage time aspects. Defaults to None.
-     - query_only (bool, optional): If True, return only the generated query text without executing it. Defaults to False.
-
-     Returns:
-     tdml.DataFrame: A DataFrame representing the dataset view (or the query text if query_only is True).
-     """
-
-     # Retrieve feature data from the feature catalog table
-     feature_catalog = tdml.DataFrame.from_query(f'CURRENT VALIDTIME SELECT * FROM {schema}.{feature_catalog_name}')
-
-     # Determine the valid time statement based on the presence of a specific date in the past
-     if date_in_the_past is None:
-         validtime_statement = 'CURRENT VALIDTIME'
-     else:
-         validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{date_in_the_past}'"
-
-     # Drop the valid time statement entirely when temporal aspects are ignored
-     if no_temporal:
-         validtime_statement = ''
-
-     # Convert entity_id to a list format for processing
-     if isinstance(entity_id, list):
-         list_entity_id = entity_id
-     elif isinstance(entity_id, dict):
-         list_entity_id = list(entity_id.keys())
-     else:
-         list_entity_id = [entity_id]
-
-     # Compose the entity names and retrieve the corresponding feature locations
-     ENTITY_NAMES = ','.join([k for k in list_entity_id])
-     ENTITY_ID = ', \n'.join([k for k in list_entity_id])
-     if len(selected_features) > 1:
-         ENTITY_ID_ = ','.join([','.join(['COALESCE(' + ','.join(['AA' + str(i + 1) + '.' + k for i, c in enumerate(selected_features)]) + ') as ' + k]) for k in list_entity_id])
-     else:
-         ENTITY_ID_ = ','.join([','.join(['' + ','.join(['AA' + str(i + 1) + '.' + k for i, c in enumerate(selected_features)]) + ' as ' + k]) for k in list_entity_id])
-
-     feature_location = feature_catalog[(feature_catalog.FEATURE_NAME.isin(list(selected_features.keys()))) & \
-                                        (feature_catalog.ENTITY_NAME == ENTITY_NAMES) & \
-                                        (feature_catalog.DATA_DOMAIN == data_domain) \
-                                        ].to_pandas()
-
-     # Manage the case sensitivity of feature names
-     feature_location['FEATURE_NAME_UPPER'] = [x.upper() for x in feature_location['FEATURE_NAME']]
-     feature_location['FEATURE_VERSION'] = feature_location['FEATURE_NAME_UPPER'].map({k.upper(): v for k, v in selected_features.items()})
-
-     # Build the query to retrieve the selected features from the feature tables
-     query = []
-     counter = 1
-     feature_names = []
-     for g, df in feature_location.groupby(['FEATURE_DATABASE', 'FEATURE_TABLE']):
-         for i, row in df.iterrows():
-             condition = ' \n ' + f"(FEATURE_ID = {row['FEATURE_ID']} AND FEATURE_VERSION = '{row['FEATURE_VERSION']}')"
-             if time_manager is not None:
-                 if 'date' in time_manager.data_type.lower():
-                     print(f'Time Manager {time_manager.schema_name}.{time_manager.table_name} has a {time_manager.data_type} data type')
-                     query_ = f"""
-                     SELECT A{counter}.* FROM (
-                         SELECT * FROM {g[0]}.{g[1]}
-                         WHERE {condition} AND PERIOD(CAST(ValidStart AS DATE), CAST(ValidEnd AS DATE)) CONTAINS (SEL BUSINESS_DATE FROM {time_manager.schema_name}.{time_manager.table_name})
-                     ) A{counter}
-                     """
-                 else:
-                     print(f'Time Manager {time_manager.schema_name}.{time_manager.table_name} has a {time_manager.data_type} data type')
-                     query_ = f"""
-                     SELECT A{counter}.* FROM (
-                         SELECT * FROM {g[0]}.{g[1]}
-                         WHERE {condition} AND PERIOD(ValidStart, ValidEnd) CONTAINS (SEL BUSINESS_DATE FROM {time_manager.schema_name}.{time_manager.table_name})
-                     ) A{counter}
-                     """
-             else:
-                 print('no time manager used.')
-                 query_ = f"""
-                 SELECT A{counter}.* FROM (
-                     {validtime_statement} SELECT * FROM {g[0]}.{g[1]}
-                     WHERE {condition}
-                 ) A{counter}
-                 """
-             query.append(query_)
-             feature_names.append(row['FEATURE_NAME'])
-             counter += 1
-
-     query_select = [f"SELECT {ENTITY_ID_}"]
-     query_select = query_select + ['AA' + str(i + 1) + '.FEATURE_VALUE AS ' + c for i, c in enumerate(feature_names)]
-     if no_temporal:
-         query_select = query_select + ['AA' + str(i + 1) + '.ValidStart AS ValidStart_' + c + ',AA' + str(i + 1) + '.ValidEnd AS ValidEnd_' + c for i, c in enumerate(feature_names)]
-     query_select = ', \n'.join(query_select)
-
-     query_from = [' FROM (' + query[0] + ') AA1 ']
-     query_from = query_from + [' FULL OUTER JOIN (' + q + ') AA' + str(i + 1) + ' \n ON ' + ' \n AND '.join([f'AA1.{c}=AA{i + 1}.{c}' for c in list_entity_id]) for i, q in enumerate(query) if i > 0]
-     query_from = '\n'.join(query_from)
-
-     query_dataset = query_select + '\n' + query_from
-
-     # Build the query to create the dataset view by pivoting the feature data
-     query_create_view = f'REPLACE VIEW {schema}.{view_name} AS'
-     query_pivot = f"""
-     {query_dataset}
-     """
-
-     if tdml.display.print_sqlmr_query:
-         print(query_create_view + '\n' + query_pivot)
-     if query_only:
-         return query_pivot
-     else:
-         if view_name is not None:
-             execute_query(query_create_view + '\n' + query_pivot)
-             execute_query(f"COMMENT ON VIEW {schema}.{view_name} IS '{comment}'")
-             if display_logs: print(f'the dataset view {schema}.{view_name} has been created')
-
-             return tdml.DataFrame(tdml.in_schema(schema, view_name))
-         else:
-             return tdml.DataFrame.from_query(query_pivot)
-
-
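Retrieval-side sketch, reusing the running example (view and version names are assumptions):

    dataset = build_dataset(
        entity_id={'CUSTOMER_ID': 'BIGINT'},
        selected_features={'tx_count': 'prod.1.0', 'avg_amount': 'dev.0.0'},
        view_name='CHURN_TRAINING_SET',
    )
    # -> one row per CUSTOMER_ID, one column per selected feature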
- def GetTheLargestFeatureID():
-     """
-     This function retrieves the maximum feature ID from the feature catalog table in the Teradata database.
-     The catalog location is taken from the module-level `schema` and `feature_catalog_name` globals.
-
-     Returns:
-     The maximum feature ID. If no feature IDs are found (i.e., the table is empty), the function returns 0.
-     """
-     # Execute a SQL query to get the maximum feature ID from the feature catalog table.
-     feature_id = execute_query(f'SEL MAX(FEATURE_ID) AS MAX_FEATURE_ID FROM {schema}.{feature_catalog_name}').fetchall()[0][0]
-
-     # If the result of the query is None (which means the table is empty), return 0.
-     if feature_id is None:
-         return 0
-     # Otherwise, return the maximum feature ID.
-     else:
-         return feature_id
-
-
- def GetAlreadyExistingFeatureNames(feature_name, entity_id):
-     """
-     This function retrieves the list of already existing features in the feature catalog table in the Teradata database.
-
-     Parameters:
-     - feature_name: The name of the feature to check, or a list of names.
-     - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to identify the entity.
-
-     Returns:
-     A list of existing features.
-     """
-     # Create a temporary DataFrame with the feature name.
-     df = pd.DataFrame({'FEATURE_NAME': feature_name, 'DATA_DOMAIN': data_domain, 'ENTITY_NAME': ','.join([k for k, v in entity_id.items()])})
-
-     # Define a temporary table name.
-     tmp_name = 'tdfs__fgjnojnsmdoignmosnig'
-
-     # Copy the temporary DataFrame to a temporary table in the Teradata database.
-     tdml.copy_to_sql(df, schema_name=schema, table_name=tmp_name, if_exists='replace',
-                      types={'FEATURE_NAME': tdml.VARCHAR(length=255, charset='LATIN')})
-
-     # Execute a SQL query to get the feature names that exist in both the temporary table and the feature catalog table.
-     existing_features = list(tdml.DataFrame.from_query(f"""
-     SEL A.FEATURE_NAME
-     FROM {schema}.{tmp_name} A
-     INNER JOIN {schema}.{feature_catalog_name} B
-     ON A.FEATURE_NAME = B.FEATURE_NAME
-     AND A.ENTITY_NAME = B.ENTITY_NAME
-     AND A.DATA_DOMAIN = B.DATA_DOMAIN
-     """).to_pandas().FEATURE_NAME.values)
-
-     # Return the list of existing features.
-     return existing_features
-
-
- def Gettdtypes(tddf, features_columns, entity_id):
-     """
-     This function retrieves the data types of the columns in the provided DataFrame (tddf) and checks their existence in the feature catalog table.
-     It also assigns new feature IDs to those that do not already exist in the catalog.
-
-     Parameters:
-     - tddf: The input DataFrame.
-     - features_columns: A list of feature column names.
-     - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to identify the entity.
-
-     Returns:
-     A dictionary where keys are column names and values are dictionaries containing the type and id of the feature.
-     """
-     # Get the data types of the columns in the DataFrame.
-     types = get_column_types_simple(tddf, tddf.columns)  # dict(tddf.to_pandas(num_rows=10).dtypes)
-
-     # Get the names of the features that already exist in the feature catalog table.
-     existing_features = GetAlreadyExistingFeatureNames(tddf.columns, entity_id)
-
-     # Get the maximum feature ID from the feature catalog table and increment it for the first new feature.
-     feature_id = GetTheLargestFeatureID()
-     feature_id = feature_id + 1
-
-     # Initialize a dictionary to store the result.
-     res = {}
-
-     # Iterate over the data types of the columns in the DataFrame.
-     for k, v in types.items():
-         # Only consider requested feature columns that are not already in the catalog.
-         if k.upper() not in [n.upper() for n in existing_features] and k.upper() in [n.upper() for n in features_columns]:
-             # Integer columns are stored as BIGINT features.
-             if 'int' in str(v):
-                 res[k] = {'type': 'BIGINT', 'id': feature_id}
-             # Float columns are stored as FLOAT features.
-             elif 'float' in str(v):
-                 res[k] = {'type': 'FLOAT', 'id': feature_id}
-             # Everything else falls back to VARCHAR.
-             else:
-                 res[k] = {'type': 'VARCHAR', 'id': feature_id}
-
-             # Increment the feature ID for the next new feature.
-             feature_id += 1
-
-     # Return the result dictionary.
-     return res
-
-
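Illustrative return value (the IDs depend on the current catalog content):

    >>> Gettdtypes(df, ['tx_count', 'avg_amount'], {'CUSTOMER_ID': 'BIGINT'})
    {'tx_count': {'type': 'BIGINT', 'id': 12}, 'avg_amount': {'type': 'FLOAT', 'id': 13}}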
- def _upload_features(df, entity_id, feature_names,
-                      feature_versions=feature_version_default):
-     """
-     This function uploads features from a Teradata DataFrame to the feature store.
-
-     Parameters:
-     - df: The input Teradata DataFrame.
-     - entity_id: The ID of the entity that the features belong to.
-     - feature_names: A list of feature names.
-     - feature_versions (optional): The versions of the features. Can be a string or a list. If it's a string, it's used as the version for all features. If it's a list, it should have the same length as feature_names. Default is 'dev.0.0'.
-
-     Returns:
-     A DataFrame representing the dataset view created in the feature store.
-     """
-
-     register_entity(entity_id)
-
-     # If feature_versions is a list, create a dictionary mapping each feature name to its corresponding version.
-     # If feature_versions is a string, create a dictionary mapping each feature name to this string.
-     if type(feature_versions) == list:
-         selected_features = {k: v for k, v in zip(feature_names, feature_versions)}
-     else:
-         selected_features = {k: feature_versions for k in feature_names}
-
-     # Get the Teradata types of the features in df.
-     feature_names_types = Gettdtypes(
-         df,
-         features_columns=feature_names,
-         entity_id=entity_id
-     )
-
-     # Register the features in the feature catalog.
-     register_features(
-         entity_id,
-         feature_names_types
-     )
-
-     # Prepare the features for ingestion.
-     prepared_features, volatile_table_name = prepare_feature_ingestion(
-         df,
-         entity_id,
-         feature_names,
-         feature_versions=selected_features
-     )
-
-     # Store the prepared features in the feature store.
-     store_feature(
-         entity_id,
-         prepared_features
-     )
-
-     # Clean up by dropping the temporary volatile table.
-     tdml.execute_sql(f'DROP TABLE {volatile_table_name}')
-
-     # Build a dataset view in the feature store.
-     dataset = build_dataset(
-         entity_id,
-         selected_features,
-         view_name=None
-     )
-
-     # Return the dataset view.
-     return dataset
-
- def register_entity(entity_id):
-     """
-     Create the three typed feature store tables (FLOAT, BIGINT, VARCHAR) for the given entity, if they do not already exist.
-
-     Parameters:
-     - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to construct the table names.
-
-     Returns:
-     A tuple with the names of the FLOAT, BIGINT and VARCHAR feature store tables.
-     """
-     feature_store_table_name_float = feature_store_table_creation(entity_id, feature_type='FLOAT')
-     feature_store_table_name_integer = feature_store_table_creation(entity_id, feature_type='BIGINT')
-     feature_store_table_name_varchar = feature_store_table_creation(entity_id, feature_type='VARCHAR')
-
-     return feature_store_table_name_float, feature_store_table_name_integer, feature_store_table_name_varchar
-
-
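Since feature_store_table_creation returns early for existing tables, registering an entity is safe to repeat before every ingestion (sketch; names follow the assumed domain):

    >>> register_entity({'CUSTOMER_ID': 'BIGINT'})
    ('FS_T_CHURN_DEMO_CUSTOMER_ID_FLOAT',
     'FS_T_CHURN_DEMO_CUSTOMER_ID_BIGINT',
     'FS_T_CHURN_DEMO_CUSTOMER_ID_VARCHAR')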
- def get_available_features(entity_id, display_details=False):
-     """
-     Return the list of feature names available in the current data domain for the given entity.
-
-     Parameters:
-     - entity_id: A dictionary, list or single name identifying the entity.
-     - display_details (bool, optional): If True, print the current content of the feature catalog. Defaults to False.
-
-     Returns:
-     A list of feature names.
-     """
-     if date_in_the_past is None:
-         validtime_statement = 'CURRENT VALIDTIME'
-     else:
-         validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{date_in_the_past}'"
-
-     if type(entity_id) == dict:
-         ENTITY_ID__ = ','.join([k.lower() for k, v in entity_id.items()])
-     elif type(entity_id) == list:
-         ENTITY_ID__ = ','.join([k.lower() for k in entity_id])
-     else:
-         ENTITY_ID__ = entity_id.lower()
-
-     query = f"""
-     {validtime_statement}
-     SELECT
-         FEATURE_NAME
-     FROM {schema}.{feature_catalog_name}
-     WHERE LOWER(ENTITY_NAME) = '{ENTITY_ID__}'
-     AND DATA_DOMAIN = '{data_domain}'
-     """
-
-     if display_details:
-         print(tdml.DataFrame.from_query(f'{validtime_statement} SELECT * FROM {schema}.{feature_catalog_name}'))
-
-     return list(tdml.DataFrame.from_query(query).to_pandas().FEATURE_NAME.values)
-
-
- def tdstone2_entity_id(existing_model):
-     """
-     Generate a dictionary mapping entity IDs to their respective data types in a given model.
-
-     This function inspects the model's mapper ('mapper_scoring' for scoring models, 'mapper' for feature
-     engineering models) and creates a dictionary where each key is an entity ID and its corresponding value
-     is the data type of that entity ID, as defined in the 'types' attribute of the relevant mapper.
-
-     Args:
-         existing_model (object): The model object that contains the mapper with the necessary information.
-                                  It is expected to have 'id_row' (or 'id_partition') and 'types' attributes.
-
-     Returns:
-         dict: A dictionary where keys are entity IDs and values are their respective data types.
-
-     Note:
-         - If the ID attribute is a single value (not a list), it is converted into a list with that single value.
-         - The function assumes the mapper and its attributes ('id_row'/'id_partition' and 'types') are properly defined in the model.
-
-     Example:
-         entity_id = tdstone2_entity_id(model)
-         # entity_id might look like {'ID': 'BIGINT'}
-     """
-
-     # Initialize an empty dictionary to store entity IDs and their data types.
-     entity_id = {}
-
-     # Retrieve the list of IDs from the relevant mapper of the model.
-     if 'score' in [x[0] for x in inspect.getmembers(type(existing_model))]:
-         ids = existing_model.mapper_scoring.id_row
-         model_type = 'model scoring'
-     elif existing_model.feature_engineering_type == 'feature engineering reducer':
-         ids = existing_model.mapper.id_partition
-         model_type = 'feature engineering'
-     else:
-         ids = existing_model.mapper.id_row
-         model_type = 'feature engineering'
-
-     # Ensure 'ids' is a list. If not, convert it into a list.
-     if type(ids) != list:
-         ids = [ids]
-
-     # Iterate over each ID in 'ids' and map it to its corresponding data type in the dictionary.
-     if model_type == 'model scoring':
-         for k in ids:
-             entity_id[k] = existing_model.mapper_scoring.types[k]
-     else:
-         for k in ids:
-             entity_id[k] = existing_model.mapper.types[k]
-
-     # Return the dictionary containing mappings of entity IDs to data types.
-     return entity_id
-
-
- def tdstone2_Gettdtypes(existing_model, entity_id, display_logs=False):
-     """
-     Generate a dictionary mapping feature names to their data types and unique feature IDs for a given model.
-
-     This function processes a model to create a dictionary where each key is a feature name and its value
-     is a dictionary containing the feature's data type and a unique ID. The function filters out features
-     that already exist in the feature catalog and only includes new features with 'BIGINT' or 'FLOAT' data types.
-
-     Args:
-         existing_model (object): The model object containing necessary schema and scoring information.
-         entity_id (dict): A dictionary representing the entity ID, used to look up already registered features.
-         display_logs (bool): Flag to indicate whether to display logs. Defaults to False.
-
-     Returns:
-         dict: A dictionary with feature names as keys, and each value is a dictionary containing 'type' and 'id'.
-
-     Note:
-         - The function assumes that 'tdstone.schema_name' and 'mapper_scoring.scores_repository' are properly defined.
-         - The function auto-generates unique IDs for new features.
-
-     Example:
-         result = tdstone2_Gettdtypes(model, entity_id)
-         # result might look like {'count_AMOUNT': {'type': 'BIGINT', 'id': 1}, 'mean_AMOUNT': {'type': 'FLOAT', 'id': 3}, ...}
-     """
-
-     # Initialize an empty dictionary to store feature names and their types.
-     types = {}
-
-     # Create a DataFrame based on the model's predictions or computed features.
-     if 'score' in [x[0] for x in inspect.getmembers(type(existing_model))]:
-         df = existing_model.get_model_predictions()
-     else:
-         df = existing_model.get_computed_features()
-
-     # Group and count the DataFrame by feature name and type, converting it to a pandas DataFrame.
-     df_ = df[['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_VALUE']].groupby(['FEATURE_NAME', 'FEATURE_TYPE']).count()[
-         ['FEATURE_NAME', 'FEATURE_TYPE']].to_pandas()
-
-     # Iterate through the DataFrame to filter and assign types.
-     for i, row in df_.iterrows():
-         if 'float' in row['FEATURE_TYPE'] or 'int' in row['FEATURE_TYPE']:
-             types[row['FEATURE_NAME']] = row['FEATURE_TYPE']
-
-     # Retrieve existing feature names to filter out already cataloged features.
-     existing_features = GetAlreadyExistingFeatureNames(types.keys(), entity_id)
-
-     # Get the current maximum feature ID to ensure uniqueness for new features.
-     feature_id = GetTheLargestFeatureID() + 1
-
-     # Initialize a dictionary to store the result.
-     res = {}
-
-     # Process each feature type and assign a corresponding data type and unique ID.
-     for k, v in types.items():
-         if k not in existing_features:
-             if 'int' in str(v):
-                 res[k] = {'type': 'BIGINT', 'id': feature_id}
-             elif 'float' in str(v):
-                 res[k] = {'type': 'FLOAT', 'id': feature_id}
-             else:
-                 if display_logs:
-                     print(f'{k} has a type that is not yet managed')
-                 continue  # Skip this iteration for unmanaged types.
-             feature_id += 1
-
-     # Return the dictionary containing feature names, types, and IDs.
-     return res
-
-
- def prepare_feature_ingestion_tdstone2(df, entity_id):
-     """
-     Prepare feature data for ingestion into the feature store by transforming a DataFrame.
-     This function unpivots specified feature columns in the input DataFrame and adds additional columns
-     for entity IDs, feature names, feature values, and feature versions. It creates a volatile table
-     in the database to store the transformed data.
-
-     Parameters:
-     - df (tdml.DataFrame): The input DataFrame containing the feature data. This DataFrame should have a structure
-       compatible with the requirements of the tdstone2 feature store.
-     - entity_id (dict): A dictionary mapping column names to their respective entity ID types, used for identifying entities.
-
-     Returns:
-     - tdml.DataFrame: A transformed DataFrame containing the prepared feature data in a suitable format for feature store ingestion.
-     - str: The name of the volatile table created for storing the transformed data.
-
-     Note:
-     - The function assumes the input DataFrame 'df' has a valid table name and is compatible with tdml operations.
-     - The function automatically handles the creation and management of a volatile table for the transformed data.
-     - 'ID_PROCESS' is used as the feature version identifier.
-
-     Example usage:
-         transformed_df, table_name = prepare_feature_ingestion_tdstone2(input_df, entity_id_dict)
-     """
-
-     # Ensure the internal table name of the DataFrame is set, necessary for further processing.
-     df._DataFrame__execute_node_and_set_table_name(df._nodeid, df._metaexpr)
-
-     if type(entity_id) == list:
-         list_entity_id = entity_id
-     elif type(entity_id) == dict:
-         list_entity_id = list(entity_id.keys())
-     else:
-         list_entity_id = [entity_id]
-
-     # Combine entity ID columns with feature name and value columns to form the output column list.
-     output_columns = ', \n'.join(list_entity_id + ['FEATURE_NAME', 'FEATURE_VALUE'])
-     primary_index = ','.join(list_entity_id)
-
-     # Define a query segment to assign feature versions.
-     version_query = "ID_PROCESS AS FEATURE_VERSION"
-
-     # Create a volatile table name based on the original table's name, ensuring it is unique.
-     volatile_table_name = df._table_name.split('.')[1].replace('"', '')
-     volatile_table_name = f'temp_{volatile_table_name}'
-
-     # Construct the SQL query to create the volatile table with the transformed data.
-     query = f"""
-     CREATE VOLATILE TABLE {volatile_table_name} AS
-     (
-     SELECT
-     {output_columns},
-     {version_query}
-     FROM {df._table_name}
-     ) WITH DATA
-     PRIMARY INDEX ({primary_index})
-     ON COMMIT PRESERVE ROWS
-     """
-     # Execute the SQL query to create the volatile table.
-     tdml.execute_sql(query)
-
-     # Optionally print the query if the display flag is set.
-     if tdml.display.print_sqlmr_query:
-         print(query)
-
-     # Return the DataFrame representation of the volatile table and its name.
-     return tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)), volatile_table_name
-
-
- def upload_tdstone2_scores(model):
-     """
-     Uploads features from a model's predictions to the Teradata feature store. This function handles the entire
-     workflow from extracting feature names and types, registering them in the feature catalog, preparing features for ingestion,
-     storing them in the feature store, and finally creating a dataset view in the feature store.
-
-     Parameters:
-     - model: The model object whose predictions contain features to be uploaded. This model should have methods
-       to extract predictions and feature information.
-
-     Returns:
-     - DataFrame: A DataFrame representing the dataset view created in the feature store, which includes
-       features from the model's predictions.
-
-     Note:
-     - The function assumes that the model provides a method `get_model_predictions` which returns a Teradata DataFrame.
-     - Entity ID for the model is extracted and registered in the data domain.
-     - The function cleans up by dropping the volatile table created during the process.
-     - The feature names and their types are extracted from the model's predictions and are registered in the feature catalog.
-     """
-
-     # Extract the entity ID from the existing model.
-     entity_id = tdstone2_entity_id(model)
-
-     # Register the entity ID in the data domain.
-     register_entity(entity_id)
-
-     # Get the Teradata types of the features from the model's predictions.
-     feature_names_types = tdstone2_Gettdtypes(model, entity_id)
-
-     # Register these features in the feature catalog.
-     register_features(entity_id, feature_names_types)
-
-     # Prepare the features for ingestion into the feature store.
-     if 'score' in [x[0] for x in inspect.getmembers(type(model))]:
-         prepared_features, volatile_table_name = prepare_feature_ingestion_tdstone2(
-             model.get_model_predictions(),
-             entity_id
-         )
-     else:
-         prepared_features, volatile_table_name = prepare_feature_ingestion_tdstone2(
-             model.get_computed_features(),
-             entity_id
-         )
-
-     # Store the prepared features in the feature store.
-     store_feature(entity_id, prepared_features)
-
-     # Clean up by dropping the temporary volatile table.
-     tdml.execute_sql(f'DROP TABLE {volatile_table_name}')
-
-     # Get the list of selected features for building the dataset view.
-     if 'score' in [x[0] for x in inspect.getmembers(type(model))]:
-         selected_features = model.get_model_predictions().groupby(['FEATURE_NAME', 'ID_PROCESS']).count().to_pandas()[
-             ['FEATURE_NAME', 'ID_PROCESS']].set_index('FEATURE_NAME').to_dict()['ID_PROCESS']
-     else:
-         selected_features = model.get_computed_features().groupby(['FEATURE_NAME', 'ID_PROCESS']).count().to_pandas()[
-             ['FEATURE_NAME', 'ID_PROCESS']].set_index('FEATURE_NAME').to_dict()['ID_PROCESS']
-
-     # Build and return the dataset view in the feature store.
-     dataset = build_dataset(entity_id, selected_features, view_name=None)
-     return dataset
-
-
- def get_list_entity(domain=None):
-     """
-     Retrieve a list of unique entity names from a specified data domain.
-
-     This function executes a database query to extract distinct entity names from
-     the feature catalog, filtered by the provided data domain. If no domain is
-     specified, it defaults to the module-level data domain.
-
-     Parameters:
-     domain (str, optional): The data domain to filter the entity names.
-                             Defaults to None, in which case the module-level `data_domain` is used.
-
-     Returns:
-     DataFrame: A teradataml DataFrame containing the unique entity names.
-     """
-
-     # Use the default data domain if none is specified
-     if domain is None:
-         domain = data_domain
-
-     # Constructing the SQL query to fetch distinct entity names from the specified domain
-     query = f"CURRENT VALIDTIME SEL DISTINCT ENTITY_NAME FROM {schema}.{feature_catalog_name} where DATA_DOMAIN = '{domain}'"
-
-     # Executing the query and returning the result as a DataFrame
-     return tdml.DataFrame.from_query(query)
-
-
- def get_list_features(entity_name, domain=None):
-     """
-     Retrieve a list of feature names associated with a specific entity or entities
-     from a given data domain.
-
-     This function constructs and executes a database query to extract feature names
-     for the specified entity or entities from the feature catalog, filtered by the
-     provided data domain. If no domain is specified, it defaults to the module-level
-     data domain.
-
-     Parameters:
-     entity_name (str or list): The name of the entity or a list of entity names
-                                to fetch features for.
-     domain (str, optional): The data domain to filter the feature names.
-                             Defaults to None, in which case the module-level `data_domain` is used.
-
-     Returns:
-     DataFrame: A teradataml DataFrame containing the feature names associated with the given entity or entities.
-     """
-
-     # Default to the module-level data domain if none is provided
-     if domain is None:
-         domain = data_domain
-
-     # Convert the entity_name to a string if it is a list
-     if type(entity_name) == list:
-         entity_name = ','.join(entity_name)
-
-     # Constructing the SQL query to fetch feature names for the specified entity or entities
-     query = f"CURRENT VALIDTIME SEL FEATURE_NAME FROM {schema}.{feature_catalog_name} where entity_name = '{entity_name}' AND DATA_DOMAIN = '{domain}'"
-
-     # Executing the query and returning the result as a DataFrame
-     return tdml.DataFrame.from_query(query)
-
-
- def get_feature_versions(entity_name, features, domain=None, latest_version_only=True, version_lag=0):
1284
- """
1285
- Retrieve feature versions for specified features associated with certain entities
1286
- from a given data domain. This function allows fetching either all versions or
1287
- just the latest versions of the features.
1288
-
1289
- Parameters:
1290
- entity_name (str or list): The name of the entity or a list of entity names
1291
- for which feature versions are to be fetched.
1292
- features (list): A list of features for which versions are required.
1293
- domain (str, optional): The data domain to filter the feature versions.
1294
- Defaults to None, where a predefined domain is used.
1295
- latest_version_only (bool, optional): Flag to fetch only the latest version
1296
- of each feature. Defaults to True.
1297
- version_lag (int, optional): The number of versions to lag behind the latest.
1298
- Only effective if latest_version_only is True. Defaults to 0.
1299
-
1300
- Returns:
1301
- dict: A dictionary with feature names as keys and their corresponding versions as values.
1302
- """
1303
-
1304
- # Default to a predefined data domain if none is provided
1305
- if domain is None:
1306
- domain = data_domain
1307
-
1308
- # Convert the entity_name to a string if it is a list
1309
- if isinstance(entity_name, list):
1310
- entity_name = ','.join(entity_name)
1311
-
1312
- # Preparing the feature names for inclusion in the SQL query
1313
- features = ["'" + f + "'" for f in features]
1314
-
1315
- # Constructing the SQL query to fetch basic feature data for the specified entities and features
1316
- query = f"""CURRENT VALIDTIME
1317
- SEL FEATURE_ID, FEATURE_NAME, FEATURE_TABLE, FEATURE_DATABASE
1318
- FROM {schema}.{feature_catalog_name} WHERE ENTITY_NAME = '{entity_name}' AND DATA_DOMAIN = '{domain}'
1319
- AND FEATURE_NAME in ({','.join(features)})"""
1320
-
1321
- # Executing the first query and converting the results to a pandas DataFrame
1322
- df = tdml.DataFrame.from_query(query).to_pandas()
1323
-
1324
- # Building the second query to fetch feature versions
1325
- query = []
1326
- for i, row in df.iterrows():
1327
- query_ = f"""
1328
- SEL DISTINCT A{i}.FEATURE_NAME, A{i}.FEATURE_VERSION
1329
- FROM (
1330
- CURRENT VALIDTIME
1331
- SELECT CAST('{row['FEATURE_NAME']}' AS VARCHAR(255)) AS FEATURE_NAME, FEATURE_VERSION FROM {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}
1332
- WHERE FEATURE_ID = {row['FEATURE_ID']})
1333
- A{i}
1334
- """
1335
- query.append(query_)
1336
-
1337
- # Combining the individual queries with UNION ALL
1338
- query = '\n UNION ALL \n'.join(query)
1339
-
1340
- # Modifying the query to fetch only the latest versions, if specified
1341
- if latest_version_only:
1342
- query = 'SELECT * FROM (' + query + ') A \n' + f'QUALIFY ROW_NUMBER() OVER(PARTITION BY FEATURE_NAME ORDER BY FEATURE_VERSION DESC) = 1+{version_lag}'
1343
-
1344
- # Executing the final query and converting the results to a pandas DataFrame
1345
- df = tdml.DataFrame.from_query(query).to_pandas()
1346
-
1347
- # Returning the results as a dictionary with feature names as keys and their versions as values
1348
- return {row['FEATURE_NAME']:row['FEATURE_VERSION'] for i,row in df.iterrows()}
1349
-
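A hypothetical sketch of get_feature_versions. The entity and feature names are illustrative; the version strings follow the module's 'dev.x.y' default convention but the returned values depend on what has actually been ingested:

from tdfs4ds import feature_store as fs

# Latest version of each feature (the default behaviour).
versions = fs.get_feature_versions(
    entity_name='customer_id',
    features=['tx_count', 'avg_basket'],
)
# e.g. {'tx_count': 'dev.0.3', 'avg_basket': 'dev.0.1'}

# version_lag=1 skips the latest and returns the previous version of each feature.
previous = fs.get_feature_versions('customer_id', ['tx_count'], version_lag=1)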
1350
-
1351
- def upload_features(df, entity_id, feature_names, metadata={}):
1352
- """
1353
- Uploads features from a dataframe to a specified entity, registering the process and returning the resulting dataset.
1354
-
1355
- Args:
1356
- df (DataFrame): The dataframe containing the features to be uploaded.
1357
- entity_id (dict or compatible type): The entity identifier. If not a dictionary, it will be converted using `get_column_types`.
1358
- feature_names (list): The list of feature names to be uploaded.
1359
- metadata (dict, optional): Additional metadata to associate with the upload. Defaults to an empty dictionary.
1360
-
1361
- Returns:
1362
- DataFrame: The dataset resulting from the upload process.
1363
- """
1364
-
1365
- # Convert entity_id to a dictionary if it's not already one
1366
- if not isinstance(entity_id, dict):
1367
- entity_id = get_column_types(df, entity_id)
1368
- print('entity_id has been converted to a dictionary:', entity_id)
1369
-
1370
- # Register the process and retrieve the SQL query to insert the features, and the process ID
1371
- query_insert, process_id = register_process_view.__wrapped__(
1372
- view_name=df,
1373
- entity_id=entity_id,
1374
- feature_names=feature_names,
1375
- metadata=metadata,
1376
- with_process_id=True
1377
- )
1378
-
1379
- # Execute the SQL query to insert the features into the database
1380
- execute_query(query_insert)
1381
-
1382
- # Run the registered process and return the resulting dataset
1383
- dataset = run(process_id=process_id, return_dataset=True)
1384
-
1385
- return dataset
1386
-
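A hypothetical end-to-end sketch of upload_features. The source table, entity column, feature names and metadata are all illustrative, and an active teradataml context is assumed:

import teradataml as tdml
from tdfs4ds import feature_store as fs

df = tdml.DataFrame('my_database.customer_aggregates')   # assumed source table

dataset = fs.upload_features(
    df,
    entity_id=['customer_id'],                 # a list is converted to a dict internally
    feature_names=['tx_count', 'avg_basket'],
    metadata={'author': 'data_team'},
)

Passing entity_id as a list relies on get_column_types to look up the column types in df; passing a ready-made {column: type} dictionary skips that conversion.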
1387
-
1388
-
1389
- def _build_time_series(entity_id, selected_feature, query_only=False):
1390
- """
1391
- Constructs a time series dataset for a given entity and feature.
1392
- Optionally returns only the query used for dataset construction.
1393
-
1394
- This is a wrapper around the `build_dataset` function, tailored specifically for time series data: it disables the temporal component (no_temporal=True) and passes no time manager.
1395
-
1396
- Args:
1397
- entity_id (dict): The identifier for the entity for which the dataset is being built.
1398
- selected_feature (dict): The feature(s) to be included in the dataset, as a feature-name-to-version mapping.
1399
- query_only (bool, optional): If True, returns only the SQL query used for building the dataset, not the dataset itself. Defaults to False.
1400
-
1401
- Returns:
1402
- DataFrame or str: The constructed time series dataset as a DataFrame, or the SQL query as a string if query_only is True.
1403
- """
1404
-
1405
- # Call the build_dataset function with specific parameters set for time series dataset construction
1406
- return build_dataset(
1407
- entity_id=entity_id, # The identifier for the entity
1408
- selected_features=selected_feature, # The feature(s) to be included in the dataset
1409
- no_temporal=True, # Indicates that the dataset should not have a temporal component
1410
- query_only=query_only, # Determines whether to return just the query or the constructed dataset
1411
- time_manager=None, # No time management for the dataset construction
1412
- view_name=None # No specific view name provided
1413
- )
1414
-
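An internal-use sketch: _build_time_series is what build_dataset_time_series calls once per feature to obtain a nested validity-period query. The entity specification and feature version below are assumptions:

query = _build_time_series(
    entity_id={'customer_id': 'BIGINT'},     # illustrative entity spec
    selected_feature={'tx_count': 'dev.0.1'},
    query_only=True,                         # return the SQL text, not a DataFrame
)
print(query)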
1415
-
1416
- def build_dataset_time_series(df, time_column, entity_id, selected_features, query_only=False, time_manager=None):
1417
- """
1418
- Constructs a time series dataset based on the specified features and entity_id from the provided dataframe.
1419
-
1420
- Args:
1421
- df (DataFrame): The source dataframe.
1422
- time_column (str): The name of the column in df that represents time.
1423
- entity_id (dict): A dictionary representing the entity identifier.
1424
- selected_features (dict): A dictionary mapping feature names to the feature versions to retrieve.
1425
- query_only (bool, optional): If True, only the SQL query for the dataset is returned. Defaults to False.
1426
- time_manager (TimeManager, optional): An instance of TimeManager to manage time-related operations. Defaults to None.
1427
-
1428
- Returns:
1429
- DataFrame or str: The constructed time series dataset as a DataFrame, or the SQL query as a string if query_only is True.
1430
- """
1431
-
1432
- # Convert column names to lowercase for case-insensitive matching
1433
- col = [c.lower() for c in df.columns]
1434
-
1435
- # Check if the entity_id keys are present in the dataframe columns
1436
- for e in entity_id:
1437
- if e.lower() not in col:
1438
- print(f'{e} is not present in your dataframe')
1439
- print('Here are the columns of your dataframe:')
1440
- print(str(col))
1441
- return # Exit if any entity_id key is not found
1442
-
1443
- # Check if the time_column is present in the dataframe columns
1444
- if time_column.lower() not in col:
1445
- print(f' {time_column} is not present in your dataframe')
1446
- print('Here are the columns of your dataframe:')
1447
- print(str(col))
1448
- return # Exit if the time_column is not found
1449
-
1450
- # Extract and check the data type of the time_column
1451
- d_ = {x[0]: x[1] for x in df._td_column_names_and_types}
1452
- time_column_data_type = d_[time_column]
1453
- print('time column data type:', time_column_data_type)
1454
- if 'date' not in time_column_data_type.lower() and 'time' not in time_column_data_type.lower():
1455
- print('the time column of your dataframe is neither a date nor a timestamp')
1456
- return # Exit if the time_column data type is not date or timestamp
1457
-
1458
- # Initialize the select query
1459
- select_query = 'SELECT \n' + ', \n'.join(['A.' + c for c in col]) + '\n'
1460
-
1461
- # If a time_manager is provided, extract its details
1462
- if time_manager is not None:
1463
- tm_datatype = time_manager.data_type.lower()
1464
- tm_schema = time_manager.schema_name
1465
- tm_table = time_manager.table_name
1466
-
1467
- sub_queries_list = []
1468
- # For each selected feature, build its part of the query
1469
- for i, (k, v) in enumerate(selected_features.items()):
1470
- select_query += ', BB' + str(i + 1) + '.' + k + '\n'
1471
-
1472
- nested_query = _build_time_series(entity_id, {k: v}, query_only=True)
1473
-
1474
- sub_queries = 'SELECT \n' + '\n ,'.join(entity_id) + '\n ,' + k + '\n'
1475
-
1476
- # Build the sub_queries based on the presence of a time_manager and the data types of time_column and time_manager
1477
- if time_manager is None:
1478
- # there is no time manager: cast based on the data type of the time column
1479
- if 'date' in time_column_data_type.lower():
1480
- # the data type of the time column is DATE
1481
- sub_queries += f', CAST(ValidStart_{k} AS DATE) AS ValidStart \n'
1482
- sub_queries += f', CAST(ValidEnd_{k} AS DATE) AS ValidEnd \n'
1483
- else:
1484
- # the data type of the time column is TIMESTAMP
1485
- sub_queries += f', CAST(ValidStart_{k} AS TIMESTAMP(0)) AS ValidStart \n'
1486
- sub_queries += f', CAST(ValidEnd_{k} AS TIMESTAMP(0)) AS ValidEnd \n'
1487
- else:
1488
- # there is a time manager
1489
- if 'date' in time_column_data_type.lower():
1490
- # the data type of the time column is DATE
1491
- if 'date' in tm_datatype:
1492
- # the data type of the time manager is DATE
1493
- sub_queries += f', CAST(ValidStart_{k} AS DATE) AS ValidStart \n'
1494
- sub_queries += f', CASE WHEN CAST(ValidEnd_{k} AS DATE) > BUS_DATE.BUSINESS_DATE THEN BUS_DATE.BUSINESS_DATE ELSE CAST(ValidEnd_{k} AS DATE) END AS ValidEnd \n'
1495
- else:
1496
- # the data type of the time manager is timestamp
1497
- sub_queries += f', CAST(ValidStart_{k} AS DATE) AS ValidStart \n'
1498
- sub_queries += f', CASE WHEN CAST(ValidEnd_{k} AS DATE) > BUS_DATE.BUSINESS_DATE THEN BUS_DATE.BUSINESS_DATE ELSE CAST(ValidEnd_{k} AS DATE) END AS ValidEnd \n'
1499
- else:
1500
- # the data type of the time column is TIMESTAMP
1501
- if 'date' in tm_datatype:
1502
- sub_queries += f', CAST(ValidStart_{k} AS TIMESTAMP(0)) AS ValidStart \n'
1503
- sub_queries += f', CASE WHEN CAST(ValidEnd_{k} AS TIMESTAMP(0)) > CAST(BUS_DATE.BUSINESS_DATE AS TIMESTAMP(0)) THEN BUS_DATE.BUSINESS_DATE ELSE CAST(ValidEnd_{k} AS TIMESTAMP(0)) END AS ValidEnd \n'
1504
- else:
1505
- sub_queries += f', CAST(ValidStart_{k} AS TIMESTAMP(0)) AS ValidStart \n'
1506
- sub_queries += f', CASE WHEN CAST(ValidEnd_{k} AS TIMESTAMP(0)) > CAST(BUS_DATE.BUSINESS_DATE AS TIMESTAMP(0)) THEN BUS_DATE.BUSINESS_DATE ELSE CAST(ValidEnd_{k} AS TIMESTAMP(0)) END AS ValidEnd \n'
1507
-
1508
- sub_queries += f'FROM ({nested_query}) tmp{i + 1} \n'
1509
- if time_manager is not None:
1510
- sub_queries += f',{tm_schema}.{tm_table} BUS_DATE \n'
1511
-
1512
- sub_queries += 'WHERE ValidStart < ValidEnd \n'
1513
-
1514
- sub_queries = 'LEFT JOIN ( \n' + sub_queries + ') BB' + str(i + 1) + '\n ON '
1515
-
1516
- sub_queries += '\n AND '.join(['A.' + c + '=BB' + str(i + 1) + '.' + c for c in entity_id])
1517
-
1518
- sub_queries += f'\n AND PERIOD(BB{i + 1}.ValidStart, BB{i + 1}.ValidEnd) CONTAINS A.{time_column} \n'
1519
-
1520
- sub_queries_list.append(sub_queries)
1521
-
1522
- # Combine all parts of the query
1523
- query = select_query + f'FROM ({df.show_query()}) A \n' + '\n --------------- \n'.join(sub_queries_list)
1524
-
1525
- # If only the query is requested, return it; otherwise, execute the query and return the resulting DataFrame
1526
- if query_only:
1527
- return query
1528
- else:
1529
- return tdml.DataFrame.from_query(query)
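A hypothetical sketch of build_dataset_time_series, enriching an event table with point-in-time feature values. The table, column and feature names are illustrative, and an active teradataml context is assumed:

import teradataml as tdml
from tdfs4ds import feature_store as fs

events = tdml.DataFrame('my_database.transactions')   # assumed: has customer_id and tx_ts columns

enriched = fs.build_dataset_time_series(
    df=events,
    time_column='tx_ts',                        # must be a DATE or TIMESTAMP column
    entity_id={'customer_id': 'BIGINT'},
    selected_features={'tx_count': 'dev.0.1'},  # feature name -> version
)

Each selected feature is LEFT JOINed on the entity keys with a PERIOD(ValidStart, ValidEnd) CONTAINS condition on the time column, so every event row picks up the feature value that was valid at its own timestamp.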