tdfs4ds-0.2.4.41-py3-none-any.whl → tdfs4ds-0.2.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +214 -38
- tdfs4ds/feature_store/feature_data_processing.py +7 -5
- tdfs4ds/genai/__init__.py +27 -0
- tdfs4ds/genai/documentation.py +1878 -0
- tdfs4ds/process_store/process_store_catalog_management.py +77 -24
- tdfs4ds/utils/filter_management.py +40 -13
- tdfs4ds/utils/time_management.py +28 -11
- {tdfs4ds-0.2.4.41.dist-info → tdfs4ds-0.2.5.1.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.41.dist-info → tdfs4ds-0.2.5.1.dist-info}/RECORD +11 -17
- tdfs/__init__.py +0 -1
- tdfs/data/curves.csv +0 -5086
- tdfs/datasets.py +0 -27
- tdfs/feature_store.py +0 -723
- tdfs4ds/feature_engineering.py +0 -152
- tdfs4ds/feature_store.py +0 -1529
- tdfs4ds/process_store.py +0 -387
- tdfs4ds/utils.py +0 -579
- {tdfs4ds-0.2.4.41.dist-info → tdfs4ds-0.2.5.1.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.41.dist-info → tdfs4ds-0.2.5.1.dist-info}/top_level.txt +0 -0
tdfs4ds/feature_store.py
DELETED
@@ -1,1529 +0,0 @@
```python
import teradataml as tdml
import pandas as pd
from tdfs4ds.utils import execute_query, display_table, get_column_types, get_column_types_simple
from teradataml.context.context import _get_database_username
import inspect
import warnings
from tdfs4ds.process_store import register_process_view, run

warnings.filterwarnings("ignore")

data_domain = None
schema = None
feature_catalog_name = 'FS_FEATURE_CATALOG'
end_period = 'UNTIL_CHANGED'  # '9999-01-01 00:00:00'
date_in_the_past = None
feature_version_default = 'dev.0.0'
display_logs = True


def feature_store_catalog_creation(if_exists='replace', comment='this table is a feature catalog'):
    """
    This function creates a feature store catalog table in the Teradata database.
    The catalog table stores information about features such as their names, associated tables, databases, validity periods, etc.

    Parameters:
    - if_exists (optional): Specifies the behavior if the catalog table already exists. The default is 'replace', which means the existing table will be replaced.
    - comment (optional): The comment attached to the catalog table.

    The target schema and the catalog table name ('FS_FEATURE_CATALOG' by default) are taken from the module-level variables.

    Returns:
    The name of the created or replaced catalog table.
    """

    # SQL query to create the catalog table
    query = f"""
    CREATE MULTISET TABLE {schema}.{feature_catalog_name},
        FALLBACK,
        NO BEFORE JOURNAL,
        NO AFTER JOURNAL,
        CHECKSUM = DEFAULT,
        DEFAULT MERGEBLOCKRATIO,
        MAP = TD_MAP1
        (
            FEATURE_ID BIGINT,
            FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
            FEATURE_TABLE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
            FEATURE_DATABASE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
            FEATURE_VIEW VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
            ENTITY_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
            DATA_DOMAIN VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
            ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
            ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
            PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
        )
        PRIMARY INDEX (FEATURE_ID);
    """

    # SQL query to create a secondary index on the feature name
    query2 = f"CREATE INDEX (FEATURE_NAME) ON {schema}.{feature_catalog_name};"

    # SQL query to comment the table
    query3 = f"COMMENT ON TABLE {schema}.{feature_catalog_name} IS '{comment}'"

    try:
        # Attempt to execute the create table query
        execute_query(query)
        if tdml.display.print_sqlmr_query:
            print(query)
        if display_logs: print(f'TABLE {schema}.{feature_catalog_name} has been created')
        execute_query(query3)
    except Exception as e:
        # If the table already exists and if_exists is set to 'replace', drop the table and recreate it
        if display_logs: print(str(e).split('\n')[0])
        if str(e).split('\n')[0].endswith('already exists.') and (if_exists == 'replace'):
            execute_query(f'DROP TABLE {schema}.{feature_catalog_name}')
            print(f'TABLE {schema}.{feature_catalog_name} has been dropped')
            try:
                # Attempt to recreate the table after dropping it
                execute_query(query)
                if display_logs: print(f'TABLE {schema}.{feature_catalog_name} has been re-created')
                if tdml.display.print_sqlmr_query:
                    print(query)
                execute_query(query3)
            except Exception as e:
                print(str(e).split('\n')[0])

    try:
        # Attempt to create the secondary index
        execute_query(query2)
        if tdml.display.print_sqlmr_query:
            print(query)
        if display_logs: print(f'SECONDARY INDEX ON TABLE {schema}.{feature_catalog_name} has been created')
    except Exception as e:
        print(str(e).split('\n')[0])

    return feature_catalog_name
```
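For orientation, here is a minimal usage sketch of the module as it existed before removal. It assumes a live teradataml connection; the host, credentials, database name and data domain are hypothetical values, not part of the package.

```python
import teradataml as tdml
import tdfs4ds.feature_store as fs  # the module removed in 0.2.5.1

# Hypothetical connection and module-level configuration.
tdml.create_context(host='td.example.com', username='demo_user', password='***')
fs.schema = 'MY_FEATURE_DB'   # target database for all feature store objects
fs.data_domain = 'RETAIL'     # logical namespace recorded in the catalog

# Create (or replace) the FS_FEATURE_CATALOG table in MY_FEATURE_DB.
fs.feature_store_catalog_creation(if_exists='replace')
```

The deleted module continues with the listing and naming helpers: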
```python
def list_features():
    query = f"CURRENT VALIDTIME SEL * FROM {schema}.{feature_catalog_name}"

    return tdml.DataFrame.from_query(query)

def get_feature_store_table_name(entity_id, feature_type):
    """
    This function generates the table and view names for a feature store table based on the provided entity ID and feature type.

    Parameters:
    - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to construct the table and view names.
    - feature_type: The type of the feature.

    Returns:
    A tuple containing the generated table name and view name.
    """

    if type(entity_id) == list:
        list_entity_id = entity_id
    elif type(entity_id) == dict:
        list_entity_id = list(entity_id.keys())
    else:
        list_entity_id = [entity_id]

    # Construct the table name by concatenating the elements 'FS', 'T', the data domain, the keys of entity_id, and feature_type
    table_name = ['FS','T']+[data_domain]+list_entity_id+[feature_type]
    table_name = '_'.join(table_name)

    # Construct the view name by concatenating the elements 'FS', 'V', the data domain, the keys of entity_id, and feature_type
    view_name = ['FS','V']+[data_domain]+list_entity_id+[feature_type]
    view_name = '_'.join(view_name)

    return table_name, view_name

def feature_store_table_creation(entity_id, feature_type, if_exists = 'fail'):

    """
    This function creates a feature store table and a corresponding view in a Teradata database schema based on the provided entity ID, feature type, and feature catalog.

    Parameters:
    - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to construct the table and view names.
    - feature_type: The type of the feature.
    - if_exists (optional): Specifies the behavior if the table already exists. The default is 'fail'; with 'replace', the existing table is dropped and recreated.

    The schema and the feature catalog table name are taken from the module-level variables.

    Returns:
    The name of the created or replaced feature store table.
    """

    table_name, view_name = get_feature_store_table_name(entity_id, feature_type)
    if tdml.db_list_tables(schema_name=schema, object_name=table_name+'%').shape[0] > 0:
        print(f'table {table_name} in the {schema} database already exists. No need to create it.')
        return
    else:
        print(f'table {table_name} in the {schema} database does not exist. Need to create it.')

    query_feature_value = {
        'FLOAT'   : 'FEATURE_VALUE FLOAT',
        'BIGINT'  : 'FEATURE_VALUE BIGINT',
        'VARCHAR' : 'FEATURE_VALUE VARCHAR(2048) CHARACTER SET LATIN'
    }

    # Construct the column definitions for the table based on the entity ID
    ENTITY_ID = ', \n'.join([k+' '+v for k,v in entity_id.items()])
    ENTITY_ID_ = ', \n'.join(['B.'+k for k,v in entity_id.items()])
    ENTITY_ID__ = ','.join([k for k,v in entity_id.items()])

    # SQL query to create the feature store table
    query = f"""
    CREATE MULTISET TABLE {schema}.{table_name},
        FALLBACK,
        NO BEFORE JOURNAL,
        NO AFTER JOURNAL,
        CHECKSUM = DEFAULT,
        DEFAULT MERGEBLOCKRATIO,
        MAP = TD_MAP1
        (
            {ENTITY_ID},
            FEATURE_ID BIGINT,
            {query_feature_value[feature_type]},
            FEATURE_VERSION VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
            ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
            ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
            PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
        )
        PRIMARY INDEX ({ENTITY_ID__},FEATURE_ID,FEATURE_VERSION);
    """

    # SQL query to create a secondary index on the feature ID
    query2 = f"CREATE INDEX (FEATURE_ID) ON {schema}.{table_name};"

    # SQL query to create the view
    query_view = f"""
    REPLACE VIEW {schema}.{view_name} AS
    CURRENT VALIDTIME
    SELECT
        A.FEATURE_NAME,
        {ENTITY_ID_},
        B.FEATURE_VALUE,
        B.FEATURE_VERSION
    FROM {schema}.{feature_catalog_name} A
    , {schema}.{table_name} B
    WHERE A.FEATURE_ID = B.FEATURE_ID
    """

    try:
        # Attempt to execute the create table query
        execute_query(query)
        if tdml.display.print_sqlmr_query:
            print(query)
        if display_logs: print(f'TABLE {schema}.{table_name} has been created')
        execute_query(query2)
    except Exception as e:
        # If the table already exists and if_exists is set to 'replace', drop the table and recreate it
        print(str(e).split('\n')[0])
        if str(e).split('\n')[0].endswith('already exists.') and (if_exists == 'replace'):
            execute_query(f'DROP TABLE {schema}.{table_name}')
            if display_logs: print(f'TABLE {schema}.{table_name} has been dropped')
            try:
                # Attempt to recreate the table after dropping it
                execute_query(query)
                if display_logs: print(f'TABLE {schema}.{table_name} has been re-created')
                if tdml.display.print_sqlmr_query:
                    print(query)
            except Exception as e:
                print(str(e).split('\n')[0])

    try:
        # Attempt to create the view
        execute_query(query_view)
        if tdml.display.print_sqlmr_query:
            print(query)
        if display_logs: print(f'VIEW {schema}.{view_name} has been created')
    except Exception as e:
        print(str(e).split('\n')[0])

    return table_name
```
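The naming scheme is purely mechanical, so the generated object names can be checked by hand. A small illustrative continuation of the sketch above (entity and domain values remain hypothetical):

```python
# get_feature_store_table_name joins ['FS', 'T'|'V', data_domain, entity keys..., type]
# with underscores; with fs.data_domain = 'RETAIL':
entity_id = {'CUSTOMER_ID': 'BIGINT'}
table_name, view_name = fs.get_feature_store_table_name(entity_id, 'FLOAT')
print(table_name)  # FS_T_RETAIL_CUSTOMER_ID_FLOAT
print(view_name)   # FS_V_RETAIL_CUSTOMER_ID_FLOAT

# register_entity() (defined further down) simply calls feature_store_table_creation
# three times to create the FLOAT, BIGINT and VARCHAR variants for the entity.
```

The catalog registration and ingestion-preparation logic follows: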
```python
def register_features(entity_id, feature_names_types):
    """
    This function registers features in the feature catalog table of a Teradata database. It creates or updates entries in the catalog based on the provided entity ID and feature names and types.

    Parameters:
    - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to identify the entity.
    - feature_names_types: A dictionary containing feature names and their corresponding types.

    The schema and the feature catalog table name are taken from the module-level variables.

    Returns:
    A DataFrame containing the registered features and their metadata.
    """

    if date_in_the_past == None:
        validtime_statement = 'CURRENT VALIDTIME'
    else:
        validtime_statement = f"VALIDTIME PERIOD '({date_in_the_past},{end_period})'"

    if len(list(feature_names_types.keys())) == 0:
        if display_logs: print('no new feature to register')
        return

    # Create a comma-separated string of entity IDs
    ENTITY_ID__ = ','.join([k for k,v in entity_id.items()])

    # Create a DataFrame from the feature_names_types dictionary
    if len(feature_names_types.keys())>1:
        df = pd.DataFrame(feature_names_types).transpose().reset_index()
        df.columns = ['FEATURE_NAME','TYPE','FEATURE_ID']
    else:
        df = pd.DataFrame(columns=['FEATURE_NAME','TYPE','FEATURE_ID'])
        k = list(feature_names_types.keys())[0]
        df['FEATURE_NAME'] = [k]
        df['TYPE'] = [feature_names_types[k]['type']]
        df['FEATURE_ID'] = [feature_names_types[k]['id']]

    # Generate the feature table and view names based on the entity ID and feature type
    df['FEATURE_TABLE'] = df.apply(lambda row:get_feature_store_table_name(entity_id, row.iloc[1])[0], axis=1)
    df['FEATURE_VIEW'] = df.apply(lambda row:get_feature_store_table_name(entity_id, row.iloc[1])[1], axis=1)

    # Add additional columns to the DataFrame
    df['ENTITY_NAME'] = ENTITY_ID__
    df['FEATURE_DATABASE'] = schema
    df['DATA_DOMAIN'] = data_domain

    # Copy the DataFrame to a temporary table in Teradata
    tdml.copy_to_sql(df,table_name = 'temp', schema_name = schema, if_exists = 'replace', primary_index = 'FEATURE_ID', types={'FEATURE_ID':tdml.BIGINT})

    # SQL query to update existing entries in the feature catalog
    query_update = f"""
    {validtime_statement}
    UPDATE {schema}.{feature_catalog_name}
    FROM (
        CURRENT VALIDTIME
        SELECT
            NEW_FEATURES.FEATURE_ID
        ,   NEW_FEATURES.FEATURE_NAME
        ,   NEW_FEATURES.FEATURE_TABLE
        ,   NEW_FEATURES.FEATURE_DATABASE
        ,   NEW_FEATURES.FEATURE_VIEW
        ,   NEW_FEATURES.ENTITY_NAME
        ,   NEW_FEATURES.DATA_DOMAIN
        FROM {schema}.temp NEW_FEATURES
        LEFT JOIN {schema}.{feature_catalog_name} EXISTING_FEATURES
        ON NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
        AND NEW_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
        WHERE EXISTING_FEATURES.FEATURE_NAME IS NOT NULL
    ) UPDATED_FEATURES
    SET
        FEATURE_NAME = UPDATED_FEATURES.FEATURE_NAME,
        FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
        FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
        FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW,
        ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME
    WHERE {feature_catalog_name}.FEATURE_ID = UPDATED_FEATURES.FEATURE_ID
    AND {feature_catalog_name}.DATA_DOMAIN = UPDATED_FEATURES.DATA_DOMAIN;
    """

    # SQL query to insert new entries into the feature catalog
    if validtime_statement == 'CURRENT VALIDTIME':
        query_insert = f"""
        {validtime_statement}
        INSERT INTO {schema}.{feature_catalog_name} (FEATURE_ID, FEATURE_NAME, FEATURE_TABLE, FEATURE_DATABASE, FEATURE_VIEW, ENTITY_NAME,DATA_DOMAIN)
        SELECT
            NEW_FEATURES.FEATURE_ID
        ,   NEW_FEATURES.FEATURE_NAME
        ,   NEW_FEATURES.FEATURE_TABLE
        ,   NEW_FEATURES.FEATURE_DATABASE
        ,   NEW_FEATURES.FEATURE_VIEW
        ,   NEW_FEATURES.ENTITY_NAME
        ,   NEW_FEATURES.DATA_DOMAIN
        FROM {schema}.temp NEW_FEATURES
        LEFT JOIN {schema}.{feature_catalog_name} EXISTING_FEATURES
        ON NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
        AND NEW_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
        WHERE EXISTING_FEATURES.FEATURE_NAME IS NULL;
        """
    elif date_in_the_past is not None:
        if end_period == 'UNTIL_CHANGED':
            end_period_ = '9999-01-01 00:00:00'
        else:
            end_period_ = end_period
        query_insert = f"""
        INSERT INTO {schema}.{feature_catalog_name} (FEATURE_ID, FEATURE_NAME, FEATURE_TABLE, FEATURE_DATABASE, FEATURE_VIEW, ENTITY_NAME,DATA_DOMAIN,ValidStart,ValidEnd)
        SELECT
            NEW_FEATURES.FEATURE_ID
        ,   NEW_FEATURES.FEATURE_NAME
        ,   NEW_FEATURES.FEATURE_TABLE
        ,   NEW_FEATURES.FEATURE_DATABASE
        ,   NEW_FEATURES.FEATURE_VIEW
        ,   NEW_FEATURES.ENTITY_NAME
        ,   NEW_FEATURES.DATA_DOMAIN
        ,   TIMESTAMP '{date_in_the_past}'
        ,   TIMESTAMP '{end_period_}'
        FROM {schema}.temp NEW_FEATURES
        LEFT JOIN {schema}.{feature_catalog_name} EXISTING_FEATURES
        ON NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
        AND NEW_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
        WHERE EXISTING_FEATURES.FEATURE_NAME IS NULL;
        """

    # Execute the insert and update queries
    execute_query(query_insert)
    execute_query(query_update)

    return df

def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions = None, **kwargs):
    """
    This function prepares feature data for ingestion into the feature store. It transforms the input DataFrame by unpivoting the specified feature columns and adds additional columns for entity IDs, feature names, feature values, and feature versions.

    Parameters:
    - df: The input DataFrame containing the feature data.
    - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to identify the entity.
    - feature_names: A list of feature names to unpivot from the DataFrame.
    - feature_versions (optional): A dictionary specifying feature versions for specific feature names. The keys are feature names, and the values are feature versions. Features without an entry get the module-level default version ('dev.0.0').
    - **kwargs: Additional keyword arguments.

    Returns:
    A transformed tdml.DataFrame containing the prepared feature data.
    """

    # Create the UNPIVOT clause for the specified feature columns
    unpivot_columns = ", \n".join(["("+x+") as '"+x+"'" for x in feature_names])

    if type(entity_id) == list:
        list_entity_id = entity_id
    elif type(entity_id) == dict:
        list_entity_id = list(entity_id.keys())
    else:
        list_entity_id = [entity_id]

    # Create the output column list including entity IDs, feature names, and feature values
    output_columns = ', \n'.join(list_entity_id+ ['FEATURE_NAME','FEATURE_VALUE'])
    primary_index = ','.join(list_entity_id)

    # Create a dictionary to store feature versions, using the default version if not specified
    versions = {f:feature_version_default for f in feature_names}
    if feature_versions is not None:
        for k,v in feature_versions.items():
            versions[k] = v

    # Create the CASE statement to assign feature versions based on feature names
    version_query = ["CASE"]+[f"WHEN FEATURE_NAME = '{k}' THEN '{v}' " for k,v in versions.items()]+["END AS FEATURE_VERSION"]
    version_query = '\n'.join(version_query)

    # Create a volatile table name based on the original table's name, ensuring it is unique.
    volatile_table_name = df._table_name.split('.')[1].replace('"', '')
    volatile_table_name = f'temp_{volatile_table_name}'

    if type(entity_id) == list:
        list_entity_id = entity_id
    elif type(entity_id) == dict:
        list_entity_id = list(entity_id.keys())
    else:
        list_entity_id = [entity_id]

    # Query casting everything to VARCHAR
    nested_query = f"""
    CREATE VOLATILE TABLE {volatile_table_name} AS
    (
        SELECT
        {','.join(list_entity_id)},
        {','.join([f'CAST({x} AS VARCHAR(2048)) AS {x}' for x in feature_names])}
        FROM {df._table_name}
    ) WITH DATA
    PRIMARY INDEX ({primary_index})
    ON COMMIT PRESERVE ROWS
    """

    # Execute the SQL query to create the volatile table.
    tdml.execute_sql(nested_query)

    # Construct the SQL query that unpivots the volatile table into the narrow format.
    query = f"""
    SELECT
    {output_columns},
    {version_query}
    FROM {tdml.in_schema(_get_database_username(), volatile_table_name)}
    UNPIVOT ((FEATURE_VALUE ) FOR FEATURE_NAME
    IN ({unpivot_columns})) Tmp
    """

    # Optionally print the query if the display flag is set.
    if tdml.display.print_sqlmr_query:
        print(query)

    # Return the DataFrame representation of the volatile table and its name.
    return tdml.DataFrame.from_query(query), volatile_table_name
```
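The effect of this preparation step is a wide-to-narrow reshaping: one input row per entity becomes one output row per entity/feature pair. A hypothetical sketch (`tddf`, the column names and the values are illustrative):

```python
# Input (wide) teradataml DataFrame tddf:
#   CUSTOMER_ID | tx_count | avg_amount
#   1001        | 7        | 42.5
#
# Output of prepare_feature_ingestion (narrow, values cast to VARCHAR):
#   CUSTOMER_ID | FEATURE_NAME | FEATURE_VALUE | FEATURE_VERSION
#   1001        | tx_count     | 7             | dev.0.0
#   1001        | avg_amount   | 42.5          | prod.1.0
prepared, tmp_table = fs.prepare_feature_ingestion(
    tddf,
    entity_id={'CUSTOMER_ID': 'BIGINT'},
    feature_names=['tx_count', 'avg_amount'],
    feature_versions={'avg_amount': 'prod.1.0'},  # tx_count falls back to 'dev.0.0'
)
```

The narrow rows are then written to the typed feature tables by store_feature: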
```python
def store_feature(entity_id, prepared_features, **kwargs):
    """
    This function stores feature data in the corresponding feature tables in a Teradata database. It updates existing feature values and inserts new feature values based on the entity ID and prepared features.

    Parameters:
    - entity_id: A dictionary representing the entity ID. The keys of the dictionary are used to identify the entity.
    - prepared_features: A tdml.DataFrame containing the prepared feature data.
    - **kwargs: Additional keyword arguments.

    The schema and the feature catalog table name are taken from the module-level variables.

    Returns:
    None
    """

    feature_catalog = tdml.DataFrame(tdml.in_schema(schema, feature_catalog_name))

    if date_in_the_past == None:
        validtime_statement = 'CURRENT VALIDTIME'
        validtime_statement2 = validtime_statement
    else:
        validtime_statement = f"VALIDTIME PERIOD '({date_in_the_past},{end_period})'"
        validtime_statement2 = f"VALIDTIME AS OF TIMESTAMP '{date_in_the_past}'"

    # SQL query to select feature data and corresponding feature metadata from the prepared features and feature catalog
    query = f"""
    {validtime_statement2}
    SELECT
        A.*
    ,   B.FEATURE_ID
    ,   B.FEATURE_TABLE
    ,   B.FEATURE_DATABASE
    FROM {prepared_features._table_name} A,
    {schema}.{feature_catalog_name} B
    WHERE A.FEATURE_NAME = B.FEATURE_NAME
    AND B.DATA_DOMAIN = '{data_domain}'
    """

    df = tdml.DataFrame.from_query(query)

    # Group the target tables by feature table and feature database and count the number of occurrences
    target_tables = df[['FEATURE_TABLE','FEATURE_DATABASE','FEATURE_ID']].groupby(['FEATURE_TABLE','FEATURE_DATABASE']).count().to_pandas()
    if display_logs:
        display_table(target_tables[['FEATURE_DATABASE','FEATURE_TABLE','count_FEATURE_ID']])

    ENTITY_ID = ', \n'.join([k for k,v in entity_id.items()])
    ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k,v in entity_id.items()])
    ENTITY_ID_WHERE_INS = ' OR '.join([f'EXISTING_FEATURES.{k} IS NOT NULL' for k,v in entity_id.items()])
    ENTITY_ID_WHERE_UP = ' OR '.join([f'EXISTING_FEATURES.{k} IS NULL' for k,v in entity_id.items()])

    ENTITY_ID_SELECT = ', \n'.join(['NEW_FEATURES.'+k for k, v in entity_id.items()])
    # Iterate over target tables and perform update and insert operations
    for i,row in target_tables.iterrows():

        ENTITY_ID_WHERE_ = ' AND '.join([f'{row.iloc[0]}.{k} = UPDATED_FEATURES.{k}' for k,v in entity_id.items()])
        # SQL query to update existing feature values
        query_update = f"""
        {validtime_statement}
        UPDATE {row.iloc[1]}.{row.iloc[0]}
        FROM (
            {validtime_statement2}
            SELECT
                {ENTITY_ID_SELECT},
                NEW_FEATURES.FEATURE_ID,
                NEW_FEATURES.FEATURE_VALUE,
                NEW_FEATURES.FEATURE_VERSION
            FROM {df._table_name} NEW_FEATURES
            LEFT JOIN {row.iloc[1]}.{row.iloc[0]} EXISTING_FEATURES
            ON {ENTITY_ID_ON}
            AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
            AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
            WHERE ({ENTITY_ID_WHERE_INS})
            AND NEW_FEATURES.FEATURE_DATABASE = '{row.iloc[1]}'
            AND NEW_FEATURES.FEATURE_TABLE = '{row.iloc[0]}'
        ) UPDATED_FEATURES
        SET
            FEATURE_VALUE = UPDATED_FEATURES.FEATURE_VALUE
        WHERE {ENTITY_ID_WHERE_}
        AND {row.iloc[0]}.FEATURE_ID = UPDATED_FEATURES.FEATURE_ID
        AND {row.iloc[0]}.FEATURE_VERSION = UPDATED_FEATURES.FEATURE_VERSION;
        """

        # SQL query to insert new feature values
        if validtime_statement == 'CURRENT VALIDTIME':
            query_insert = f"""
            {validtime_statement}
            INSERT INTO {row.iloc[1]}.{row.iloc[0]} ({ENTITY_ID}, FEATURE_ID, FEATURE_VALUE, FEATURE_VERSION)
            SELECT
                {ENTITY_ID_SELECT},
                NEW_FEATURES.FEATURE_ID,
                NEW_FEATURES.FEATURE_VALUE,
                NEW_FEATURES.FEATURE_VERSION
            FROM {df._table_name} NEW_FEATURES
            LEFT JOIN {row.iloc[1]}.{row.iloc[0]} EXISTING_FEATURES
            ON {ENTITY_ID_ON}
            AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
            AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
            WHERE ({ENTITY_ID_WHERE_UP})
            AND NEW_FEATURES.FEATURE_DATABASE = '{row.iloc[1]}'
            AND NEW_FEATURES.FEATURE_TABLE = '{row.iloc[0]}'
            """
        elif date_in_the_past is not None:
            if end_period == 'UNTIL_CHANGED':
                end_period_ = '9999-01-01 00:00:00'
            else:
                end_period_ = end_period
            query_insert = f"""
            INSERT INTO {row.iloc[1]}.{row.iloc[0]} ({ENTITY_ID}, FEATURE_ID, FEATURE_VALUE, FEATURE_VERSION, ValidStart, ValidEnd)
            SELECT
                {ENTITY_ID_SELECT},
                NEW_FEATURES.FEATURE_ID,
                NEW_FEATURES.FEATURE_VALUE,
                NEW_FEATURES.FEATURE_VERSION,
                TIMESTAMP '{date_in_the_past}',
                TIMESTAMP '{end_period_}'
            FROM {df._table_name} NEW_FEATURES
            LEFT JOIN {row.iloc[1]}.{row.iloc[0]} EXISTING_FEATURES
            ON {ENTITY_ID_ON}
            AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
            AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
            WHERE ({ENTITY_ID_WHERE_UP})
            AND NEW_FEATURES.FEATURE_DATABASE = '{row.iloc[1]}'
            AND NEW_FEATURES.FEATURE_TABLE = '{row.iloc[0]}'
            """
        entity_id_str = ', \n'.join([k for k, v in entity_id.items()])
        if display_logs: print(f'insert feature values of new {entity_id_str} combinations in {row.iloc[1]}.{row.iloc[0]}')
        if tdml.display.print_sqlmr_query:
            print(query_insert)
        execute_query(query_insert)
        if display_logs: print(f'update feature values of existing {entity_id_str} combinations in {row.iloc[1]}.{row.iloc[0]}')
        if tdml.display.print_sqlmr_query:
            print(query_update)
        execute_query(query_update)

    return
```
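The VALIDTIME handling in store_feature is driven entirely by the module-level date_in_the_past and end_period settings. A sketch of the statements that result, derived from the branches above (the date is an arbitrary example):

```python
# Default: fs.date_in_the_past is None -> writes happen at the current valid time.
#   validtime_statement == validtime_statement2 == 'CURRENT VALIDTIME'

# Backdated load: pick a past effective date with open-ended validity.
fs.date_in_the_past = '2023-06-01 00:00:00'
fs.end_period = 'UNTIL_CHANGED'   # inserts map this to TIMESTAMP '9999-01-01 00:00:00'
#   validtime_statement  == "VALIDTIME PERIOD '(2023-06-01 00:00:00,UNTIL_CHANGED)'"
#   validtime_statement2 == "VALIDTIME AS OF TIMESTAMP '2023-06-01 00:00:00'"
```

build_dataset, next, reassembles the narrow feature rows into a wide analytical view: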
```python
def build_dataset(entity_id, selected_features, view_name,
                  comment='dataset', no_temporal=False, time_manager=None, query_only=False):
    """
    This function builds a dataset view in a Teradata database. It is designed to pivot and format data from the feature catalog and feature tables based on the specified parameters.

    Parameters:
    - entity_id (dict or list or other): A dictionary, list, or other format representing the entity ID. The keys of the dictionary are used to identify the entity. Lists and other formats are converted to a list of keys.
    - selected_features (dict): A dictionary specifying the selected features and their corresponding feature versions.
    - view_name (str): The name of the dataset view to be created.
    - comment (str, optional): A comment to associate with the dataset view. Defaults to 'dataset'.
    - no_temporal (bool, optional): Flag to determine if temporal aspects should be ignored. Defaults to False.
    - time_manager (object, optional): An object to manage time aspects. Defaults to None.
    - query_only (bool, optional): If True, return only the generated query without executing it.

    Returns:
    tdml.DataFrame: A DataFrame representing the dataset view.
    """

    # Retrieve feature data from the feature catalog table
    feature_catalog = tdml.DataFrame.from_query(f'CURRENT VALIDTIME SELECT * FROM {schema}.{feature_catalog_name}')

    # Determine the valid time statement based on the presence of a specific date in the past
    if date_in_the_past is None:
        validtime_statement = 'CURRENT VALIDTIME'
    else:
        validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{date_in_the_past}'"

    # Adjust valid time statement based on the presence of time_manager and no_temporal flag
    if no_temporal:
        validtime_statement = ''

    # Convert entity_id to a list format for processing
    if isinstance(entity_id, list):
        list_entity_id = entity_id
    elif isinstance(entity_id, dict):
        list_entity_id = list(entity_id.keys())
    else:
        list_entity_id = [entity_id]

    # Compose the entity names and retrieve the corresponding feature locations
    ENTITY_NAMES = ','.join([k for k in list_entity_id])
    ENTITY_ID = ', \n'.join([k for k in list_entity_id])
    if len(selected_features) > 1:
        ENTITY_ID_ = ','.join([','.join(['COALESCE('+','.join(['AA'+str(i+1)+'.'+k for i,c in enumerate(selected_features)])+') as '+k]) for k in list_entity_id])
    else:
        ENTITY_ID_ = ','.join([','.join(['' + ','.join(['AA' + str(i + 1) + '.' + k for i, c in enumerate(selected_features)]) + ' as ' + k]) for k in list_entity_id])

    feature_location = feature_catalog[(feature_catalog.FEATURE_NAME.isin(list(selected_features.keys()))) & \
                                       (feature_catalog.ENTITY_NAME == ENTITY_NAMES) & \
                                       (feature_catalog.DATA_DOMAIN == data_domain) \
                                      ].to_pandas()

    # Manage the case sensitivity
    feature_location['FEATURE_NAME_UPPER'] = [x.upper() for x in feature_location['FEATURE_NAME']]
    feature_location['FEATURE_VERSION'] = feature_location['FEATURE_NAME_UPPER'].map({k.upper():v for k,v in selected_features.items()})

    # Build the query to retrieve the selected features from the feature tables
    query = []
    counter = 1
    feature_names = []
    for g,df in feature_location.groupby(['FEATURE_DATABASE','FEATURE_TABLE']):
        for i,row in df.iterrows():
            condition = ' \n '+f"(FEATURE_ID = {row['FEATURE_ID']} AND FEATURE_VERSION = '{row['FEATURE_VERSION']}')"
            if time_manager is not None:
                if 'date' in time_manager.data_type.lower():
                    print(f'Time Manager {time_manager.schema_name}.{time_manager.table_name} has a {time_manager.data_type} data type')
                    query_ = f"""
                    SELECT A{counter}.* FROM (
                        SELECT * FROM {g[0]}.{g[1]}
                        WHERE {condition} AND PERIOD(CAST(ValidStart AS DATE), CAST(ValidEnd AS DATE)) CONTAINS (SEL BUSINESS_DATE FROM {time_manager.schema_name}.{time_manager.table_name})
                    ) A{counter}
                    """
                else:
                    print(
                        f'Time Manager {time_manager.schema_name}.{time_manager.table_name} has a {time_manager.data_type} data type')
                    query_ = f"""
                    SELECT A{counter}.* FROM (
                        SELECT * FROM {g[0]}.{g[1]}
                        WHERE {condition} AND PERIOD(ValidStart, ValidEnd) CONTAINS (SEL BUSINESS_DATE FROM {time_manager.schema_name}.{time_manager.table_name})
                    ) A{counter}
                    """
            else:
                print(
                    f'no time manager used.')
                query_ = f"""
                SELECT A{counter}.* FROM (
                    {validtime_statement} SELECT * FROM {g[0]}.{g[1]}
                    WHERE {condition}
                ) A{counter}
                """
            query.append(query_)
            feature_names.append(row['FEATURE_NAME'])
            counter+=1

    query_select = [f"SELECT {ENTITY_ID_}"]
    query_select = query_select + ['AA'+str(i+1)+'.FEATURE_VALUE AS '+c for i,c in enumerate(feature_names)]
    if no_temporal:
        query_select = query_select + ['AA'+str(i+1)+'.ValidStart AS ValidStart_'+ c + ',AA'+str(i+1)+'.ValidEnd AS ValidEnd_'+ c for i,c in enumerate(feature_names)]
    query_select = ', \n'.join(query_select)

    query_from = [' FROM ('+query[0]+') AA1 ']
    query_from = query_from + [' FULL OUTER JOIN ('+q+') AA'+str(i+1)+' \n ON '+' \n AND '.join([f'AA1.{c}=AA{i+1}.{c}' for c in list_entity_id]) for i,q in enumerate(query) if i>0]
    query_from = '\n'.join(query_from)

    query_dataset = query_select + '\n' + query_from

    # Build the query to create the dataset view by pivoting the feature data
    query_create_view = f'REPLACE VIEW {schema}.{view_name} AS'
    query_pivot = f"""
    {query_dataset}
    """

    if tdml.display.print_sqlmr_query:
        print(query_create_view+'\n'+query_pivot)
    if query_only:
        return query_pivot
    else:
        if view_name != None:
            execute_query(query_create_view+'\n'+query_pivot)
            execute_query(f"COMMENT ON VIEW {schema}.{view_name} IS '{comment}'")
            if display_logs: print(f'the dataset view {schema}.{view_name} has been created')

            return tdml.DataFrame(tdml.in_schema(schema, view_name))
        else:
            return tdml.DataFrame.from_query(query_pivot)
```
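A hypothetical call, continuing the sketch and assuming the two features above were registered and stored:

```python
dataset = fs.build_dataset(
    entity_id={'CUSTOMER_ID': 'BIGINT'},
    selected_features={'tx_count': 'dev.0.0', 'avg_amount': 'prod.1.0'},
    view_name='V_CUSTOMER_DATASET',  # pass view_name=None for an ad-hoc DataFrame
)
# -> one row per CUSTOMER_ID and one column per selected feature, produced by
#    FULL OUTER JOINing the per-feature subqueries on the entity key columns.
```

The ID bookkeeping helpers and the end-to-end upload wrapper follow: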
```python
def GetTheLargestFeatureID():
    """
    This function retrieves the maximum feature ID from the feature catalog table in the Teradata database.
    The schema and the catalog table name are taken from the module-level variables.

    Returns:
    The maximum feature ID. If no feature IDs are found (i.e., the table is empty), the function returns 0.
    """
    # Execute a SQL query to get the maximum feature ID from the feature catalog table.
    feature_id = execute_query(f'SEL MAX(FEATURE_ID) AS MAX_FEATURE_ID FROM {schema}.{feature_catalog_name}').fetchall()[0][0]

    # If the result of the query is None (which means the table is empty), return 0.
    if feature_id == None:
        return 0
    # If the result of the query is not None, return the maximum feature ID.
    else:
        return feature_id

def GetAlreadyExistingFeatureNames(feature_name, entity_id):
    """
    This function retrieves the list of already existing features in the feature catalog table in the Teradata database.

    Parameters:
    - feature_name: The name(s) of the feature(s) to check.
    - entity_id: A dictionary representing the entity ID; its keys identify the entity.

    Returns:
    A list of existing features.
    """
    # Create a temporary DataFrame with the feature name.
    df = pd.DataFrame({'FEATURE_NAME': feature_name, 'DATA_DOMAIN': data_domain, 'ENTITY_NAME': ','.join([k for k,v in entity_id.items()])})

    # Define a temporary table name.
    tmp_name = 'tdfs__fgjnojnsmdoignmosnig'

    # Copy the temporary DataFrame to a temporary table in the Teradata database.
    tdml.copy_to_sql(df, schema_name=schema, table_name=tmp_name, if_exists='replace',
                     types={'FEATURE_NAME': tdml.VARCHAR(length=255, charset='LATIN')})

    # Execute a SQL query to get the feature names that exist in both the temporary table and the feature catalog table.
    existing_features = list(tdml.DataFrame.from_query(f"""
        SEL A.FEATURE_NAME
        FROM {schema}.{tmp_name} A
        INNER JOIN {schema}.{feature_catalog_name} B
        ON A.FEATURE_NAME = B.FEATURE_NAME
        AND A.ENTITY_NAME = B.ENTITY_NAME
        AND A.DATA_DOMAIN = B.DATA_DOMAIN
        """).to_pandas().FEATURE_NAME.values)

    # Return the list of existing features.
    return existing_features

def Gettdtypes(tddf, features_columns, entity_id):
    """
    This function retrieves the data types of the columns in the provided DataFrame (tddf) and checks their existence in the feature catalog table.
    It also assigns new feature IDs for those that do not already exist in the table.

    Parameters:
    - tddf: The input DataFrame.
    - features_columns: A list of feature column names.
    - entity_id: A dictionary representing the entity ID; its keys identify the entity.

    Returns:
    A dictionary where keys are column names and values are dictionaries containing type and id of the feature.
    """
    # Get the data types of the columns in the DataFrame.
    types = get_column_types_simple(tddf, tddf.columns)  # dict(tddf.to_pandas(num_rows=10).dtypes)

    # Get the names of the features that already exist in the feature catalog table.
    existing_features = GetAlreadyExistingFeatureNames(tddf.columns, entity_id)

    # Get the maximum feature ID from the feature catalog table.
    feature_id = GetTheLargestFeatureID()

    # Increment the maximum feature ID to create a new feature ID.
    feature_id = feature_id + 1

    # Initialize a dictionary to store the result.
    res = {}

    # Iterate over the data types of the columns in the DataFrame.
    for k, v in types.items():
        # If the column name does not exist in the feature catalog table and is in the list of feature column names...
        if k.upper() not in [n.upper() for n in existing_features] and k.upper() in [n.upper() for n in features_columns]:
            # If the data type of the column is integer...
            if 'int' in str(v):
                # Add an entry to the result dictionary for the column name with its data type and new feature ID.
                res[k] = {'type': 'BIGINT', 'id': feature_id}
            # If the data type of the column is float...
            elif 'float' in str(v):
                # Add an entry to the result dictionary for the column name with its data type and new feature ID.
                res[k] = {'type': 'FLOAT', 'id': feature_id}
            # If the data type of the column is neither integer nor float...
            else:
                res[k] = {'type': 'VARCHAR', 'id': feature_id}
                # Print a message that the data type is not yet managed.
                #if display_logs: print(f'{k} has a type that is not yet managed')

            # Increment the feature ID for the next iteration.
            feature_id += 1

    # Return the result dictionary.
    return res

def _upload_features(df, entity_id, feature_names,
                     feature_versions=feature_version_default):
    """
    This function uploads features from a Teradata DataFrame to the feature store.

    Parameters:
    - df: The input Teradata DataFrame.
    - entity_id: The ID of the entity that the features belong to.
    - feature_names: A list of feature names.
    - feature_versions (optional): The versions of the features. Can be a string or a list. If it's a string, it's used as the version for all features. If it's a list, it should have the same length as feature_names. Default is 'dev.0.0'.

    The schema and the feature catalog table name are taken from the module-level variables.

    Returns:
    A DataFrame representing the dataset view created in the feature store.
    """

    register_entity(entity_id)

    # If feature_versions is a list, create a dictionary mapping each feature name to its corresponding version.
    # If feature_versions is a string, create a dictionary mapping each feature name to this string.
    if type(feature_versions) == list:
        selected_features = {k: v for k, v in zip(feature_names, feature_versions)}
    else:
        selected_features = {k: feature_versions for k in feature_names}

    # Get the Teradata types of the features in df.
    feature_names_types = Gettdtypes(
        df,
        features_columns=feature_names,
        entity_id=entity_id
    )

    # Register the features in the feature catalog.
    register_features(
        entity_id,
        feature_names_types
    )

    # Prepare the features for ingestion.
    prepared_features, volatile_table_name = prepare_feature_ingestion(
        df,
        entity_id,
        feature_names,
        feature_versions=selected_features
    )

    # Store the prepared features in the feature store.
    store_feature(
        entity_id,
        prepared_features
    )

    # Clean up by dropping the temporary volatile table.
    tdml.execute_sql(f'DROP TABLE {volatile_table_name}')

    # Build a dataset view in the feature store.
    dataset = build_dataset(
        entity_id,
        selected_features,
        view_name=None
    )

    # Return the dataset view.
    return dataset
```
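Taken together, _upload_features is the single-call path through the whole flow. A hypothetical end-to-end sketch (the source table and column names are illustrative):

```python
tddf = tdml.DataFrame(tdml.in_schema('MY_SOURCE_DB', 'CUSTOMER_METRICS'))

dataset = fs._upload_features(
    tddf,
    entity_id={'CUSTOMER_ID': 'BIGINT'},
    feature_names=['tx_count', 'avg_amount'],
    feature_versions='prod.1.0',   # one version string applied to all features
)
# Internally: register_entity -> Gettdtypes -> register_features
#             -> prepare_feature_ingestion -> store_feature -> build_dataset
```

The remainder of the deleted file covers entity registration, feature discovery, and the tdstone2 integration: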
|
|
922
|
-
|
|
923
|
-
def register_entity(entity_id):
|
|
924
|
-
feature_store_table_name_float = feature_store_table_creation(entity_id, feature_type='FLOAT')
|
|
925
|
-
feature_store_table_name_integer = feature_store_table_creation(entity_id, feature_type='BIGINT')
|
|
926
|
-
feature_store_table_name_varchar = feature_store_table_creation(entity_id, feature_type='VARCHAR')
|
|
927
|
-
|
|
928
|
-
return feature_store_table_name_float,feature_store_table_name_integer,feature_store_table_name_varchar
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
def get_available_features(entity_id, display_details=False):
|
|
932
|
-
if date_in_the_past == None:
|
|
933
|
-
validtime_statement = 'CURRENT VALIDTIME'
|
|
934
|
-
else:
|
|
935
|
-
validtime_statement = f"VALIDTIME AS OF '{date_in_the_past}'"
|
|
936
|
-
|
|
937
|
-
if type(entity_id) == dict:
|
|
938
|
-
ENTITY_ID__ = ','.join([k.lower() for k, v in entity_id.items()])
|
|
939
|
-
elif type(entity_id) == list:
|
|
940
|
-
ENTITY_ID__ = ','.join([k.lower() for k in entity_id])
|
|
941
|
-
else:
|
|
942
|
-
ENTITY_ID__ = entity_id.lower()
|
|
943
|
-
|
|
944
|
-
query = f"""
|
|
945
|
-
{validtime_statement}
|
|
946
|
-
SELECT
|
|
947
|
-
FEATURE_NAME
|
|
948
|
-
FROM {schema}.{feature_catalog_name}
|
|
949
|
-
WHERE LOWER(ENTITY_NAME) = '{ENTITY_ID__}'
|
|
950
|
-
AND DATA_DOMAIN = '{data_domain}'
|
|
951
|
-
"""
|
|
952
|
-
|
|
953
|
-
if display_details:
|
|
954
|
-
print(tdml.DataFrame.from_query(f'{validtime_statement} SELECT * FROM {schema}.{feature_catalog_name}'))
|
|
955
|
-
|
|
956
|
-
return list(tdml.DataFrame.from_query(query).to_pandas().FEATURE_NAME.values)
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
def tdstone2_entity_id(existing_model):
|
|
960
|
-
"""
|
|
961
|
-
Generate a dictionary mapping entity IDs to their respective data types in a given model.
|
|
962
|
-
|
|
963
|
-
This function iterates over the 'id_row' attribute of the 'mapper_scoring' object in the provided model.
|
|
964
|
-
It then creates a dictionary where each key is an entity ID and its corresponding value is the data type of that entity ID,
|
|
965
|
-
as defined in the 'types' attribute of the 'mapper_scoring' object.
|
|
966
|
-
|
|
967
|
-
Args:
|
|
968
|
-
existing_model (object): The model object that contains the 'mapper_scoring' attribute with necessary information.
|
|
969
|
-
It is expected to have 'id_row' and 'types' attributes.
|
|
970
|
-
|
|
971
|
-
Returns:
|
|
972
|
-
dict: A dictionary where keys are entity IDs and values are their respective data types.
|
|
973
|
-
|
|
974
|
-
Raises:
|
|
975
|
-
TypeError: If the 'id_row' attribute in the model is not a list or a single value.
|
|
976
|
-
|
|
977
|
-
Note:
|
|
978
|
-
- If 'id_row' is a single value (not a list), it is converted into a list with that single value.
|
|
979
|
-
- The function assumes 'mapper_scoring' and its attributes ('id_row' and 'types') are properly defined in the model.
|
|
980
|
-
|
|
981
|
-
Example:
|
|
982
|
-
entity_id = tdstone2_entity_id(model)
|
|
983
|
-
# entity_id might look like {'ID': 'BIGINT'}
|
|
984
|
-
"""
|
|
985
|
-
|
|
986
|
-
# Initialize an empty dictionary to store entity IDs and their data types.
|
|
987
|
-
entity_id = {}
|
|
988
|
-
|
|
989
|
-
# Retrieve the list of IDs from the 'id_row' attribute of 'mapper_scoring' in the model.
|
|
990
|
-
if 'score' in [x[0] for x in inspect.getmembers(type(existing_model))]:
|
|
991
|
-
ids = existing_model.mapper_scoring.id_row
|
|
992
|
-
model_type = 'model scoring'
|
|
993
|
-
elif existing_model.feature_engineering_type == 'feature engineering reducer':
|
|
994
|
-
ids = existing_model.mapper.id_partition
|
|
995
|
-
model_type = 'feature engineering'
|
|
996
|
-
else:
|
|
997
|
-
ids = existing_model.mapper.id_row
|
|
998
|
-
model_type = 'feature engineering'
|
|
999
|
-
|
|
1000
|
-
# Ensure 'ids' is a list. If not, convert it into a list.
|
|
1001
|
-
if type(ids) != list:
|
|
1002
|
-
ids = [ids]
|
|
1003
|
-
|
|
1004
|
-
# Iterate over each ID in 'ids' and map it to its corresponding data type in the dictionary.
|
|
1005
|
-
if model_type == 'model scoring':
|
|
1006
|
-
for k in ids:
|
|
1007
|
-
entity_id[k] = existing_model.mapper_scoring.types[k]
|
|
1008
|
-
else:
|
|
1009
|
-
for k in ids:
|
|
1010
|
-
entity_id[k] = existing_model.mapper.types[k]
|
|
1011
|
-
|
|
1012
|
-
# Return the dictionary containing mappings of entity IDs to data types.
|
|
1013
|
-
return entity_id
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
def tdstone2_Gettdtypes(existing_model, entity_id, display_logs=False):
|
|
1017
|
-
"""
|
|
1018
|
-
Generate a dictionary mapping feature names to their data types and unique feature IDs for a given model.
|
|
1019
|
-
|
|
1020
|
-
This function processes a model to create a dictionary where each key is a feature name and its value
|
|
1021
|
-
is a dictionary containing the feature's data type and a unique ID. The function filters out features
|
|
1022
|
-
that already exist in a feature catalog and only includes new features with 'BIGINT' or 'FLOAT' data types.
|
|
1023
|
-
|
|
1024
|
-
Args:
|
|
1025
|
-
existing_model (object): The model object containing necessary schema and scoring information.
|
|
1026
|
-
display_logs (bool): Flag to indicate whether to display logs. Defaults to False.
|
|
1027
|
-
|
|
1028
|
-
Returns:
|
|
1029
|
-
dict: A dictionary with feature names as keys, and each value is a dictionary containing 'type' and 'id'.
|
|
1030
|
-
|
|
1031
|
-
Raises:
|
|
1032
|
-
ValueError: If the data types encountered are neither integer nor float.
|
|
1033
|
-
|
|
1034
|
-
Note:
|
|
1035
|
-
- The function assumes that 'tdstone.schema_name' and 'mapper_scoring.scores_repository' are properly defined.
|
|
1036
|
-
- The function auto-generates unique IDs for new features.
|
|
1037
|
-
|
|
1038
|
-
Example:
|
|
1039
|
-
result = tdstone2_Gettdtypes(model)
|
|
1040
|
-
# result might look like {'count_AMOUNT': {'type': 'BIGINT', 'id': 1}, 'mean_AMOUNT': {'type': 'FLOAT', 'id': 3}, ...}
|
|
1041
|
-
"""
|
|
1042
|
-
|
|
1043
|
-
# Initialize an empty dictionary to store feature names and their types.
|
|
1044
|
-
types = {}
|
|
1045
|
-
|
|
1046
|
-
# Create a DataFrame based on the model's schema and scores repository.
|
|
1047
|
-
if 'score' in [x[0] for x in inspect.getmembers(type(existing_model))]:
|
|
1048
|
-
df = existing_model.get_model_predictions()
|
|
1049
|
-
else:
|
|
1050
|
-
#if existing_model.feature_engineering_type == 'feature engineering reducer':
|
|
1051
|
-
df = existing_model.get_computed_features()
|
|
1052
|
-
|
|
1053
|
-
# Group and count the DataFrame by feature name and type, converting it to a pandas DataFrame.
|
|
1054
|
-
df_ = df[['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_VALUE']].groupby(['FEATURE_NAME', 'FEATURE_TYPE']).count()[
|
|
1055
|
-
['FEATURE_NAME', 'FEATURE_TYPE']].to_pandas()
|
|
1056
|
-
|
|
1057
|
-
# Iterate through the DataFrame to filter and assign types.
|
|
1058
|
-
for i, row in df_.iterrows():
|
|
1059
|
-
if 'float' in row['FEATURE_TYPE'] or 'int' in row['FEATURE_TYPE']:
|
|
1060
|
-
types[row['FEATURE_NAME']] = row['FEATURE_TYPE']
|
|
1061
|
-
|
|
1062
|
-
# Retrieve existing feature names to filter out already cataloged features.
|
|
1063
|
-
existing_features = GetAlreadyExistingFeatureNames(types.keys(),entity_id)
|
|
1064
|
-
|
|
1065
|
-
# Get the current maximum feature ID to ensure uniqueness for new features.
|
|
1066
|
-
feature_id = GetTheLargestFeatureID() + 1
|
|
1067
|
-
|
|
1068
|
-
# Initialize a dictionary to store the result.
|
|
1069
|
-
res = {}
|
|
1070
|
-
|
|
1071
|
-
# Process each feature type and assign a corresponding data type and unique ID.
|
|
1072
|
-
for k, v in types.items():
|
|
1073
|
-
if k not in existing_features and k in types.keys():
|
|
1074
|
-
if 'int' in str(v):
|
|
1075
|
-
res[k] = {'type': 'BIGINT', 'id': feature_id}
|
|
1076
|
-
elif 'float' in str(v):
|
|
1077
|
-
res[k] = {'type': 'FLOAT', 'id': feature_id}
|
|
1078
|
-
else:
|
|
1079
|
-
if display_logs:
|
|
1080
|
-
print(f'{k} has a type that is not yet managed')
|
|
1081
|
-
continue # Skip this iteration for unmanaged types.
|
|
1082
|
-
feature_id += 1
|
|
1083
|
-
|
|
1084
|
-
# Return the dictionary containing feature names, types, and IDs.
|
|
1085
|
-
return res
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
def prepare_feature_ingestion_tdstone2(df, entity_id):
    """
    Prepare feature data for ingestion into the feature store by transforming a DataFrame.

    This function unpivots specified feature columns in the input DataFrame and adds additional
    columns for entity IDs, feature names, feature values, and feature versions. It creates a
    volatile table in the database to store the transformed data.

    Parameters:
    - df (tdml.DataFrame): The input DataFrame containing the feature data. This DataFrame should
      have a structure compatible with the requirements of the tdstone2 feature store.
    - entity_id (dict): A dictionary mapping column names to their respective entity ID types,
      used for identifying entities.

    Returns:
    - tdml.DataFrame: A transformed DataFrame containing the prepared feature data in a suitable
      format for feature store ingestion.
    - str: The name of the volatile table created for storing the transformed data.

    Note:
    - The function assumes the input DataFrame 'df' has a valid table name and is compatible
      with tdml operations.
    - The function automatically handles the creation and management of a volatile table for
      the transformed data.
    - 'ID_PROCESS' is used as the feature version identifier.

    Example usage:
        transformed_df, table_name = prepare_feature_ingestion_tdstone2(input_df, entity_id_dict)
    """

    # Ensure the internal table name of the DataFrame is set, necessary for further processing.
    df._DataFrame__execute_node_and_set_table_name(df._nodeid, df._metaexpr)

    # Normalize entity_id to a list of entity column names.
    if isinstance(entity_id, list):
        list_entity_id = entity_id
    elif isinstance(entity_id, dict):
        list_entity_id = list(entity_id.keys())
    else:
        list_entity_id = [entity_id]

    # Combine entity ID columns with feature name and value columns to form the output column list.
    output_columns = ', \n'.join(list_entity_id + ['FEATURE_NAME', 'FEATURE_VALUE'])
    primary_index = ','.join(list_entity_id)

    # Define a query segment to assign feature versions.
    version_query = "ID_PROCESS AS FEATURE_VERSION"

    # Create a volatile table name based on the original table's name, ensuring it is unique.
    volatile_table_name = df._table_name.split('.')[1].replace('"', '')
    volatile_table_name = f'temp_{volatile_table_name}'

    # Construct the SQL query to create the volatile table with the transformed data.
    query = f"""
    CREATE VOLATILE TABLE {volatile_table_name} AS
    (
    SELECT
    {output_columns},
    {version_query}
    FROM {df._table_name}
    ) WITH DATA
    PRIMARY INDEX ({primary_index})
    ON COMMIT PRESERVE ROWS
    """

    # Execute the SQL query to create the volatile table.
    tdml.execute_sql(query)

    # Optionally print the query if the display flag is set.
    if tdml.display.print_sqlmr_query:
        print(query)

    # Return the DataFrame representation of the volatile table and its name.
    return tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)), volatile_table_name
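
# Hedged usage sketch (not part of the original module): assumes an active
# teradataml connection; the source table and entity mapping are hypothetical.
#
#     scores = tdml.DataFrame(tdml.in_schema('MY_DB', 'MODEL_SCORES'))
#     prepared, tmp_name = prepare_feature_ingestion_tdstone2(
#         scores, entity_id={'CUSTOMER_ID': 'BIGINT'})
#     # ... ingest 'prepared' into the feature store, then clean up:
#     tdml.execute_sql(f'DROP TABLE {tmp_name}')
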
def upload_tdstone2_scores(model):
    """
    Uploads features from a model's predictions to the Teradata feature store. This function
    handles the entire workflow: extracting feature names and types, registering them in the
    feature catalog, preparing the features for ingestion, storing them in the feature store,
    and finally creating a dataset view in the feature store.

    Parameters:
    - model: The model object whose predictions contain features to be uploaded. This model
      should have methods to extract predictions and feature information.

    Returns:
    - DataFrame: A DataFrame representing the dataset view created in the feature store, which
      includes features from the model's predictions.

    Note:
    - The function assumes that the model provides a method `get_model_predictions` which
      returns a Teradata DataFrame.
    - The entity ID for the model is extracted and registered in the data domain.
    - The function cleans up by dropping the volatile table created during the process.
    - The feature names and their types are extracted from the model's predictions and are
      registered in the feature catalog.
    """

    # Extract the entity ID from the existing model.
    entity_id = tdstone2_entity_id(model)

    # Register the entity ID in the data domain.
    register_entity(entity_id)

    # Get the Teradata types of the features from the model's predictions.
    feature_names_types = tdstone2_Gettdtypes(model, entity_id)

    # Register these features in the feature catalog.
    register_features(entity_id, feature_names_types)

    # Pick the feature source once: scoring models expose a 'score' method and provide
    # model predictions; feature-engineering models provide computed features.
    if 'score' in [x[0] for x in inspect.getmembers(type(model))]:
        features_df = model.get_model_predictions()
    else:
        features_df = model.get_computed_features()

    # Prepare the features for ingestion into the feature store.
    prepared_features, volatile_table_name = prepare_feature_ingestion_tdstone2(
        features_df,
        entity_id
    )

    # Store the prepared features in the feature store.
    store_feature(entity_id, prepared_features)

    # Clean up by dropping the temporary volatile table.
    tdml.execute_sql(f'DROP TABLE {volatile_table_name}')

    # Get the list of selected features (feature name -> process ID) for the dataset view.
    selected_features = features_df.groupby(['FEATURE_NAME', 'ID_PROCESS']).count().to_pandas()[
        ['FEATURE_NAME', 'ID_PROCESS']].set_index('FEATURE_NAME').to_dict()['ID_PROCESS']

    # Build and return the dataset view in the feature store.
    dataset = build_dataset(entity_id, selected_features, view_name=None)
    return dataset
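
# Hedged usage sketch (not part of the original module): 'my_model' stands for a
# hypothetical tdstone2 model object that has already been scored.
#
#     dataset = upload_tdstone2_scores(my_model)
#     dataset.head()  # dataset view built over the uploaded features
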
def get_list_entity(domain=None):
    """
    Retrieve a list of unique entity names from a specified data domain.

    This function executes a database query to extract distinct entity names from
    the feature catalog, filtered by the provided data domain. If no domain is
    specified, it defaults to a predefined data domain.

    Parameters:
    domain (str, optional): The data domain to filter the entity names.
        Defaults to None, in which case a predefined domain is used.

    Returns:
    DataFrame: A teradataml DataFrame containing the unique entity names.
    """

    # Use the default data domain if none is specified
    if domain is None:
        domain = data_domain

    # Construct the SQL query to fetch distinct entity names from the specified domain
    query = f"CURRENT VALIDTIME SEL DISTINCT ENTITY_NAME FROM {schema}.{feature_catalog_name} WHERE DATA_DOMAIN = '{domain}'"

    # Execute the query and return the result as a DataFrame
    return tdml.DataFrame.from_query(query)
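
# Hedged usage sketch (not part of the original module; the domain name is
# hypothetical):
#
#     entities = get_list_entity()         # uses the module-level data_domain
#     entities = get_list_entity('SALES')  # or an explicit domain
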
def get_list_features(entity_name, domain=None):
    """
    Retrieve a list of feature names associated with a specific entity or entities
    from a given data domain.

    This function constructs and executes a database query to extract feature names
    for the specified entity or entities from the feature catalog, filtered by the
    provided data domain. If no domain is specified, it defaults to a predefined
    data domain.

    Parameters:
    entity_name (str or list): The name of the entity, or a list of entity names,
        to fetch features for.
    domain (str, optional): The data domain to filter the feature names.
        Defaults to None, in which case a predefined domain is used.

    Returns:
    DataFrame: A teradataml DataFrame containing the feature names associated with
        the given entity or entities.
    """

    # Default to a predefined data domain if none is provided
    if domain is None:
        domain = data_domain

    # Convert the entity_name to a comma-separated string if it is a list
    if isinstance(entity_name, list):
        entity_name = ','.join(entity_name)

    # Construct the SQL query to fetch feature names for the specified entity or entities
    query = f"CURRENT VALIDTIME SEL FEATURE_NAME FROM {schema}.{feature_catalog_name} WHERE ENTITY_NAME = '{entity_name}' AND DATA_DOMAIN = '{domain}'"

    # Execute the query and return the result as a DataFrame
    return tdml.DataFrame.from_query(query)
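
# Hedged usage sketch (not part of the original module; entity names are
# hypothetical). Composite entities are passed as a list and matched as a single
# comma-separated ENTITY_NAME string:
#
#     features = get_list_features('CUSTOMER_ID')
#     features = get_list_features(['CUSTOMER_ID', 'STORE_ID'])
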
def get_feature_versions(entity_name, features, domain=None, latest_version_only=True, version_lag=0):
    """
    Retrieve feature versions for specified features associated with certain entities
    from a given data domain. This function allows fetching either all versions or
    just the latest versions of the features.

    Parameters:
    entity_name (str or list): The name of the entity, or a list of entity names,
        for which feature versions are to be fetched.
    features (list): A list of features for which versions are required.
    domain (str, optional): The data domain to filter the feature versions.
        Defaults to None, in which case a predefined domain is used.
    latest_version_only (bool, optional): Flag to fetch only the latest version
        of each feature. Defaults to True.
    version_lag (int, optional): The number of versions to lag behind the latest.
        Only effective if latest_version_only is True. Defaults to 0.

    Returns:
    dict: A dictionary with feature names as keys and their corresponding versions as values.
    """

    # Default to a predefined data domain if none is provided
    if domain is None:
        domain = data_domain

    # Convert the entity_name to a comma-separated string if it is a list
    if isinstance(entity_name, list):
        entity_name = ','.join(entity_name)

    # Prepare the feature names for inclusion in the SQL query
    features = ["'" + f + "'" for f in features]

    # Construct the SQL query to fetch basic feature data for the specified entities and features
    query = f"""CURRENT VALIDTIME
    SEL FEATURE_ID, FEATURE_NAME, FEATURE_TABLE, FEATURE_DATABASE
    FROM {schema}.{feature_catalog_name} WHERE ENTITY_NAME = '{entity_name}' AND DATA_DOMAIN = '{domain}'
    AND FEATURE_NAME IN ({','.join(features)})"""

    # Execute the first query and convert the results to a pandas DataFrame
    df = tdml.DataFrame.from_query(query).to_pandas()

    # Build the second query to fetch feature versions, one sub-query per feature
    query = []
    for i, row in df.iterrows():
        query_ = f"""
        SEL DISTINCT A{i}.FEATURE_NAME, A{i}.FEATURE_VERSION
        FROM (
        CURRENT VALIDTIME
        SELECT CAST('{row['FEATURE_NAME']}' AS VARCHAR(255)) AS FEATURE_NAME, FEATURE_VERSION FROM {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}
        WHERE FEATURE_ID = {row['FEATURE_ID']})
        A{i}
        """
        query.append(query_)

    # Combine the individual queries with UNION ALL
    query = '\n UNION ALL \n'.join(query)

    # Modify the query to fetch only the latest versions (offset by version_lag), if specified
    if latest_version_only:
        query = 'SELECT * FROM (' + query + ') A \n' + f'QUALIFY ROW_NUMBER() OVER(PARTITION BY FEATURE_NAME ORDER BY FEATURE_VERSION DESC) = 1+{version_lag}'

    # Execute the final query and convert the results to a pandas DataFrame
    df = tdml.DataFrame.from_query(query).to_pandas()

    # Return the results as a dictionary with feature names as keys and their versions as values
    return {row['FEATURE_NAME']: row['FEATURE_VERSION'] for i, row in df.iterrows()}
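
# Hedged usage sketch (not part of the original module; feature and entity names,
# and the returned versions, are hypothetical):
#
#     versions = get_feature_versions(
#         entity_name='CUSTOMER_ID',
#         features=['AVG_BASKET', 'NB_ORDERS'],
#         latest_version_only=True,
#         version_lag=0)
#     # e.g. {'AVG_BASKET': 'dev.0.3', 'NB_ORDERS': 'dev.0.1'}
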
def upload_features(df, entity_id, feature_names, metadata={}):
    """
    Uploads features from a dataframe to a specified entity, registering the process and
    returning the resulting dataset.

    Args:
        df (DataFrame): The dataframe containing the features to be uploaded.
        entity_id (dict or compatible type): The entity identifier. If not a dictionary,
            it will be converted using `get_column_types`.
        feature_names (list): The list of feature names to be uploaded.
        metadata (dict, optional): Additional metadata to associate with the upload.
            Defaults to an empty dictionary.

    Returns:
        DataFrame: The dataset resulting from the upload process.
    """

    # Convert entity_id to a dictionary if it's not already one
    if not isinstance(entity_id, dict):
        entity_id = get_column_types(df, entity_id)
        print('entity_id has been converted to a proper dictionary: ', entity_id)

    # Register the process, retrieving the SQL query that inserts the features and the
    # process ID
    query_insert, process_id = register_process_view.__wrapped__(
        view_name=df,
        entity_id=entity_id,
        feature_names=feature_names,
        metadata=metadata,
        with_process_id=True
    )

    # Execute the SQL query to insert the features into the database
    execute_query(query_insert)

    # Run the registered process and return the resulting dataset
    dataset = run(process_id=process_id, return_dataset=True)

    return dataset
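
# Hedged usage sketch (not part of the original module): assumes an active
# teradataml connection and a configured feature store; table, entity, and
# feature names are hypothetical.
#
#     df = tdml.DataFrame(tdml.in_schema('MY_DB', 'CUSTOMER_KPIS'))
#     dataset = upload_features(
#         df,
#         entity_id={'CUSTOMER_ID': 'BIGINT'},
#         feature_names=['AVG_BASKET', 'NB_ORDERS'],
#         metadata={'source': 'CUSTOMER_KPIS'})
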
def _build_time_series(entity_id, selected_feature, query_only=False):
    """
    Constructs a time series dataset for a given entity and feature.
    Optionally returns only the query used for dataset construction.

    This is a wrapper around the `build_dataset` function, tailored specifically for time
    series data by disabling the temporal component.

    Args:
        entity_id (dict): The identifier for the entity for which the dataset is being built.
        selected_feature (str or list): The feature(s) to be included in the dataset.
        query_only (bool, optional): If True, returns only the SQL query used for building
            the dataset, not the dataset itself. Defaults to False.

    Returns:
        DataFrame or str: The constructed time series dataset as a DataFrame, or the SQL
        query as a string if query_only is True.
    """

    # Call build_dataset with parameters set for time series dataset construction
    return build_dataset(
        entity_id=entity_id,                 # The identifier for the entity
        selected_features=selected_feature,  # The feature(s) to be included in the dataset
        no_temporal=True,                    # The dataset should not have a temporal component
        query_only=query_only,               # Return just the query or the constructed dataset
        time_manager=None,                   # No time management for the dataset construction
        view_name=None                       # No specific view name provided
    )
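
# Hedged sketch (not part of the original module): this private helper is used by
# build_dataset_time_series below; the entity mapping, feature name, and version
# are hypothetical.
#
#     sql = _build_time_series({'CUSTOMER_ID': 'BIGINT'},
#                              {'AVG_BASKET': 'dev.0.1'}, query_only=True)
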
def build_dataset_time_series(df, time_column, entity_id, selected_features, query_only=False, time_manager=None):
    """
    Constructs a time series dataset based on the specified features and entity_id from the
    provided dataframe.

    Args:
        df (DataFrame): The source dataframe.
        time_column (str): The name of the column in df that represents time.
        entity_id (dict): A dictionary representing the entity identifier.
        selected_features (dict): A dictionary with keys as feature names and values as
            conditions or specifications for those features.
        query_only (bool, optional): If True, only the SQL query for the dataset is returned.
            Defaults to False.
        time_manager (TimeManager, optional): An instance of TimeManager to manage
            time-related operations. Defaults to None.

    Returns:
        DataFrame or str: The constructed time series dataset as a DataFrame, or the SQL
        query as a string if query_only is True.
    """

    # Convert column names to lowercase for case-insensitive matching
    col = [c.lower() for c in df.columns]

    # Check that every entity_id key is present in the dataframe columns
    for e in entity_id:
        if e.lower() not in col:
            print(f'{e} is not present in your dataframe')
            print('Here are the columns of your dataframe:')
            print(str(col))
            return  # Exit if any entity_id key is not found

    # Check that the time_column is present in the dataframe columns
    if time_column.lower() not in col:
        print(f'{time_column} is not present in your dataframe')
        print('Here are the columns of your dataframe:')
        print(str(col))
        return  # Exit if the time_column is not found

    # Extract and check the data type of the time_column
    d_ = {x[0]: x[1] for x in df._td_column_names_and_types}
    time_column_data_type = d_[time_column]
    print('time column data type:', time_column_data_type)
    if 'date' not in time_column_data_type.lower() and 'time' not in time_column_data_type.lower():
        print('the time column of your data frame is neither a date nor a timestamp')
        return  # Exit if the time_column data type is neither a date nor a timestamp

    # Initialize the select query
    select_query = 'SELECT \n' + ', \n'.join(['A.' + c for c in col]) + '\n'

    # If a time_manager is provided, extract its details
    if time_manager is not None:
        tm_datatype = time_manager.data_type.lower()
        tm_schema = time_manager.schema_name
        tm_table = time_manager.table_name

    sub_queries_list = []
    # For each selected feature, build its part of the query
    for i, (k, v) in enumerate(selected_features.items()):
        select_query += ', BB' + str(i + 1) + '.' + k + '\n'

        nested_query = _build_time_series(entity_id, {k: v}, query_only=True)

        sub_queries = 'SELECT \n' + '\n ,'.join(entity_id) + '\n ,' + k + '\n'

        # Build the validity-period columns. Without a time manager, the validity bounds
        # are simply cast to the data type of the time column. With a time manager, the
        # ValidEnd bound is additionally clipped to the current business date.
        if time_manager is None:
            # There is no time manager
            if 'date' in time_column_data_type.lower():
                # The time column is a DATE
                sub_queries += f', CAST(ValidStart_{k} AS DATE) AS ValidStart \n'
                sub_queries += f', CAST(ValidEnd_{k} AS DATE) AS ValidEnd \n'
            else:
                # The time column is a TIMESTAMP
                sub_queries += f', CAST(ValidStart_{k} AS TIMESTAMP(0)) AS ValidStart \n'
                sub_queries += f', CAST(ValidEnd_{k} AS TIMESTAMP(0)) AS ValidEnd \n'
        else:
            # There is a time manager
            if 'date' in time_column_data_type.lower():
                # The time column is a DATE
                sub_queries += f', CAST(ValidStart_{k} AS DATE) AS ValidStart \n'
                sub_queries += f', CASE WHEN CAST(ValidEnd_{k} AS DATE) > BUS_DATE.BUSINESS_DATE THEN BUS_DATE.BUSINESS_DATE ELSE CAST(ValidEnd_{k} AS DATE) END AS ValidEnd \n'
            else:
                # The time column is a TIMESTAMP
                sub_queries += f', CAST(ValidStart_{k} AS TIMESTAMP(0)) AS ValidStart \n'
                sub_queries += f', CASE WHEN CAST(ValidEnd_{k} AS TIMESTAMP(0)) > CAST(BUS_DATE.BUSINESS_DATE AS TIMESTAMP(0)) THEN BUS_DATE.BUSINESS_DATE ELSE CAST(ValidEnd_{k} AS TIMESTAMP(0)) END AS ValidEnd \n'

        sub_queries += f'FROM ({nested_query}) tmp{i + 1} \n'
        if time_manager is not None:
            sub_queries += f',{tm_schema}.{tm_table} BUS_DATE \n'

        sub_queries += 'WHERE ValidStart < ValidEnd \n'

        sub_queries = 'LEFT JOIN ( \n' + sub_queries + ') BB' + str(i + 1) + '\n ON '

        sub_queries += '\n AND '.join(['A.' + c + '=BB' + str(i + 1) + '.' + c for c in entity_id])

        sub_queries += f'\n AND PERIOD(BB{i + 1}.ValidStart, BB{i + 1}.ValidEnd) CONTAINS A.{time_column} \n'

        sub_queries_list.append(sub_queries)

    # Combine all parts of the query
    query = select_query + f'FROM ({df.show_query()}) A \n' + '\n --------------- \n'.join(sub_queries_list)

    # If only the query is requested, return it; otherwise execute it and return the DataFrame
    if query_only:
        return query
    else:
        return tdml.DataFrame.from_query(query)
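
# Hedged usage sketch (not part of the original module): joins point-in-time
# feature values onto an event table; all table, entity, and feature names are
# hypothetical.
#
#     events = tdml.DataFrame(tdml.in_schema('MY_DB', 'CUSTOMER_EVENTS'))
#     dataset = build_dataset_time_series(
#         df=events,
#         time_column='EVENT_TS',
#         entity_id={'CUSTOMER_ID': 'BIGINT'},
#         selected_features={'AVG_BASKET': 'dev.0.1'},
#         query_only=False)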