tdfs4ds 0.2.4.31__py3-none-any.whl → 0.2.4.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +341 -519
- tdfs4ds/feature_store/feature_data_processing.py +236 -268
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/utils/filter_management.py +456 -137
- tdfs4ds/utils/time_management.py +547 -97
- {tdfs4ds-0.2.4.31.dist-info → tdfs4ds-0.2.4.33.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.31.dist-info → tdfs4ds-0.2.4.33.dist-info}/RECORD +10 -10
- {tdfs4ds-0.2.4.31.dist-info → tdfs4ds-0.2.4.33.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.31.dist-info → tdfs4ds-0.2.4.33.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,8 @@ from tdfs4ds.utils.info import seconds_to_dhms
 import time
 import re
 import pandas as pd
+from tdfs4ds import logger_safe, logger
+
 
 def generate_on_clause(entity_id, entity_null_substitute, left_name, right_name):
     res = []
@@ -164,11 +166,10 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
     # Record the start time
     start_time = time.time()
 
-
-
-    if type(entity_id) == list:
+    # Normalize entity_id into a list of keys
+    if isinstance(entity_id, list):
         list_entity_id = entity_id
-    elif type(entity_id) == dict:
+    elif isinstance(entity_id, dict):
         list_entity_id = list(entity_id.keys())
     else:
         list_entity_id = [entity_id]
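The hunk above swaps `type(x) == list` checks for `isinstance`, which also accepts subclasses and is the idiomatic form. A minimal standalone sketch of the same normalization (the helper name is illustrative, not part of tdfs4ds):

    def normalize_entity_id(entity_id):
        # Accept a list of key names, a dict keyed by name, or a single name.
        if isinstance(entity_id, list):
            return entity_id
        elif isinstance(entity_id, dict):
            return list(entity_id.keys())
        return [entity_id]

    assert normalize_entity_id(['id1', 'id2']) == ['id1', 'id2']
    assert normalize_entity_id({'id1': 42}) == ['id1']
    assert normalize_entity_id('id1') == ['id1']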
@@ -176,199 +177,170 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
 
     feature_id_names, conversion_name2id = get_feature_id_and_conversion(list_entity_id, feature_names)
 
-    features_infos = pd.DataFrame(feature_id_names, columns = ['FEATURE_ID', 'FEATURE_NAME', 'FEATURE_TABLE', 'FEATURE_DATABASE'])
+    features_infos = pd.DataFrame(feature_id_names, columns=['FEATURE_ID', 'FEATURE_NAME', 'FEATURE_TABLE', 'FEATURE_DATABASE'])
     features_infos['FEATURE_VERSION'] = [feature_versions[k] for k in features_infos.FEATURE_NAME.values]
-    if tdfs4ds.DEBUG_MODE:
-        print('--- prepare_feature_ingestion ---')
-        print('conversion_name2id : ', conversion_name2id)
-        print('feature_names : ', feature_names)
 
-
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "--- prepare_feature_ingestion ---")
+        logger_safe("debug", "conversion_name2id=%s", conversion_name2id)
+        logger_safe("debug", "feature_names=%s", feature_names)
 
-
-
-
-
+    # UNPIVOT mapping
+    unpivot_columns = ", \n".join([f"({x}) as '{conversion_name2id[x]}'" for x in feature_names])
+
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "unpivot_columns=%s", unpivot_columns)
 
+    # Output columns for volatile table
     output_columns = ', \n'.join(list_entity_id + ['CAST(FEATURE_ID AS BIGINT) AS FEATURE_ID', 'FEATURE_VALUE'])
 
+    # Primary index
     if primary_index is None:
         primary_index = ','.join(list_entity_id)
     else:
-        if type(primary_index) == list:
-            primary_index = primary_index
-        else:
+        if not isinstance(primary_index, list):
             primary_index = [primary_index]
         primary_index = ','.join(primary_index)
 
-    #
+    # Feature versions (defaults)
     versions = {f: tdfs4ds.FEATURE_VERSION_DEFAULT for f in feature_names}
     if feature_versions is not None:
         for k, v in feature_versions.items():
             versions[k] = v
 
-    if tdfs4ds.DEBUG_MODE:
-
-        print('versions : ', versions)
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "versions=%s", versions)
 
-    #
-    version_query = ["CASE"] + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()] + [
-        "END AS FEATURE_VERSION"]
+    # CASE statement for versions
+    version_query = ["CASE"] + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()] + ["END AS FEATURE_VERSION"]
     version_query = '\n'.join(version_query)
 
-    if tdfs4ds.DEBUG_MODE:
-
-        print('version_query : ', version_query)
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "version_query=%s", version_query)
 
-    #
+    # Volatile table name
     volatile_table_name = df._table_name.split('.')[1].replace('"', '')
-    volatile_table_name = f'temp_{volatile_table_name}'
+    volatile_table_name = f"temp_{volatile_table_name}"
 
-    if type(entity_id) == list:
+    # Normalize entity_id again for var casting
+    if isinstance(entity_id, list):
         list_entity_id = entity_id
-    elif type(entity_id) == dict:
+    elif isinstance(entity_id, dict):
         list_entity_id = list(entity_id.keys())
     else:
         list_entity_id = [entity_id]
 
-
-
-    res = {x.split()[0]:''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
+    # Character set handling / pass-through
+    res = {x.split()[0]: ''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
     var_temp2 = []
-    for k,v in res.items():
+    for k, v in res.items():
         if 'UNICODE' in v:
-            #var_temp2.append(f'TRANSLATE({k} USING UNICODE_TO_LATIN) AS {k}')
             var_temp2.append(f'{k}')
         elif 'LATIN' in v:
-            #var_temp2.append(f'{k}')
             var_temp2.append(f'TRANSLATE({k} USING LATIN_TO_UNICODE) AS {k}')
         else:
             var_temp2.append(f'CAST({k} AS VARCHAR(2048) CHARACTER SET UNICODE) AS {k}')
     var_temp2 = ', \n'.join(var_temp2)
+
+    # NOTE: the original code overrides var_temp2 with just the raw column names.
+    # Preserve that behavior to avoid functional change.
     var_temp2 = ', \n'.join(list(res.keys()))
 
+    # Null substitution on entity keys
    var_temp3 = []
     for e in list_entity_id:
         if e in entity_null_substitute.keys():
-            if type(entity_null_substitute[e]) == str:
+            if isinstance(entity_null_substitute[e], str):
                 var_temp3.append(f"coalesce({e},'{entity_null_substitute[e]}') AS {e}")
             else:
                 var_temp3.append(f"coalesce({e},{entity_null_substitute[e]}) AS {e}")
         else:
             var_temp3.append(e)
-
     var_temp3 = ', \n'.join(var_temp3)
 
-
-    nested_query = f"""
-    CREATE MULTISET VOLATILE TABLE {volatile_table_name} AS
-    (
-    SELECT
-    {output_columns},
-    {version_query}
-    FROM
-        (SELECT
-        {var_temp3},
-        {var_temp2}
-        FROM {df._table_name}
-        ) A
-    UNPIVOT INCLUDE NULLS ((FEATURE_VALUE ) FOR FEATURE_ID
-    IN ({unpivot_columns})) Tmp
-    ) WITH DATA
-    PRIMARY INDEX ({primary_index})
-    PARTITION BY RANGE_N(FEATURE_ID BETWEEN 0 AND 2000 EACH 1 )
-    ON COMMIT PRESERVE ROWS
-    """
-
-    nested_query = f"""
-    CREATE MULTISET VOLATILE TABLE {volatile_table_name} AS
-    (
-    SELECT
-    {var_temp3},
-    {var_temp2}
-    FROM {df._table_name}
-    ) WITH DATA
-    PRIMARY INDEX ({primary_index})
-    ON COMMIT PRESERVE ROWS
-    """
-
+    # Final nested query used (the function reassigns to plain SELECT; preserve as-is)
     nested_query = f"""
     SELECT
     {var_temp3},
     {var_temp2}
     FROM {df._table_name}
-
     """
 
-    #
+    # Duplicate check query
     output_columns_unicity = ', \n'.join(list_entity_id)
     query_test_unicity = f"""
     SELECT sum(CASE WHEN n>1 THEN 1 ELSE 0 END) AS nb_duplicates
     FROM (
-
-
-
-
-
+        SELECT
+        {output_columns_unicity},
+        count(*) as n
+        FROM {_get_database_username()}.{volatile_table_name}
+        GROUP BY {output_columns_unicity}
     ) A
     """
 
-    if tdfs4ds.DEBUG_MODE:
-
-
-
-        print('nested_query :', nested_query)
-
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "var_temp2=%s", var_temp2)
+        logger_safe("debug", "var_temp3=%s", var_temp3)
+        logger_safe("debug", "nested_query=%s", nested_query)
 
-    # Execute
+    # Execute: create volatile table and test unicity
     try:
-
-
+        tdml.DataFrame.from_query(nested_query).to_sql(
+            table_name=volatile_table_name,
+            temporary=True,
+            primary_index=primary_index.split(','),
+            if_exists='replace'
+        )
         nb_duplicates = tdml.execute_sql(query_test_unicity).fetchall()[0][0]
         if nb_duplicates is not None and nb_duplicates > 0:
-
-
-
-
-
-
-
-
-
+            logger_safe("error", "The process generates %s duplicates", nb_duplicates)
+            # Show sample duplicates in debug for troubleshooting
+            if getattr(tdfs4ds, "DEBUG_MODE", False):
+                sample_dups_query = f"""
+                SELECT TOP 3
+                {output_columns_unicity},
+                count(*) as n
+                FROM {_get_database_username()}.{volatile_table_name}
+                GROUP BY {output_columns_unicity}
+                HAVING n > 1
+                """
+                logger_safe("debug", "Sample duplicates query:\n%s", sample_dups_query)
             raise ValueError("Invalid process: the process generates duplicates.")
-        #
+        # else: no duplicates
+        # logger_safe("info", "No duplicate found.") # optional
     except Exception as e:
-
-        print(str(e).split('\n')[0])
+        logger_safe("error", "prepare_feature_ingestion failed: %s", str(e).split('\n')[0])
         raise
 
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe(
+            "debug",
+            "Result volatile table dtypes:\n%s",
+            tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)).tdtypes
+        )
 
-
-    print('--- prepare_feature_ingestion ---')
-    print(tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)).tdtypes)
-
-
-    # Record the end time
+    # Timing
     end_time = time.time()
-
-
-    # Calculate the elapsed time in seconds
     elapsed_time = end_time - start_time
     formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-
-
+    logger_safe("info", "Feature preparation for ingestion: %s (%.3fs)", formatted_elapsed_time, elapsed_time)
+
+    # Return DataFrame and metadata
     try:
         df_out = tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name))
         return df_out, volatile_table_name, features_infos
     except Exception as e:
-
-
+        logger_safe("error", "Failed to materialize volatile DataFrame: %s", str(e).split()[0])
+        logger_safe("error", "Feature dtypes snapshot: %s", df[feature_names].tdtypes)
         if 'TD_Unpivot contract function' in str(e).split()[0]:
-            raise(
+            raise RuntimeError(
+                "Error: you may have strings with UNICODE encoding as features; please convert them to LATIN first."
+            )
+        raise
 
-
+    # Fallback (should not reach)
+    # return None, None, None
 
 
 def apply_collect_stats(entity_id, primary_index, partitioning, feature_infos):
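Throughout this hunk, ad-hoc print calls are replaced by a gated logging pattern: getattr(tdfs4ds, "DEBUG_MODE", False) tolerates builds where the flag is absent, and logger_safe takes a level name plus printf-style arguments. A minimal stand-in built on the standard logging module (this sketch only mirrors the calling convention visible in the diff; the real logger_safe lives in tdfs4ds/__init__.py):

    import logging

    logger = logging.getLogger("tdfs4ds")

    def logger_safe(level, msg, *args):
        # Dispatch to the named level ("debug", "info", ...) and never raise.
        try:
            getattr(logger, level)(msg, *args)
        except Exception:
            pass

    class Settings:
        DEBUG_MODE = True  # stands in for the tdfs4ds module-level flag

    settings = Settings()
    if getattr(settings, "DEBUG_MODE", False):  # safe even if the attribute is missing
        logger_safe("debug", "conversion_name2id=%s", {"F1": 101})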
@@ -392,44 +364,46 @@ def apply_collect_stats(entity_id, primary_index, partitioning, feature_infos):
     Returns:
         None
     """
-    # Sort
+    # Sort entity IDs for consistent ordering
     sorted_entity_id = list(entity_id.keys())
     sorted_entity_id.sort()
 
-    # Group
+    # Group target tables
     target_tables = feature_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID']].groupby(
         ['FEATURE_TABLE', 'FEATURE_DATABASE']
     ).count().reset_index()
 
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "Target tables for COLLECT STATs: %s", target_tables[['FEATURE_DATABASE','FEATURE_TABLE']].to_dict(orient='records'))
+
+    # Generate COLLECT STATs queries
     query_collect_stats, query_collect_stats_extension = generate_collect_stats(
         sorted_entity_id,
         primary_index=primary_index,
         partitioning=partitioning
     )
 
-    # Record the start time for measuring query execution duration.
     start_time = time.time()
 
-    #
-    for
-
-
+    # Execute COLLECT STATs
+    for _, row in target_tables.iterrows():
+        table_fqn = f"{row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}"
+        if getattr(tdfs4ds, "DEBUG_MODE", False):
+            logger_safe("debug", "Running COLLECT STATs on %s", table_fqn)
 
-
-        if query_collect_stats_extension is not None:
-            execute_query(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
+        execute_query(query_collect_stats + f" ON {table_fqn}")
 
-
-
+        if query_collect_stats_extension is not None:
+            execute_query(query_collect_stats_extension + f" ON {table_fqn}")
 
-
-    elapsed_time = end_time - start_time
+    elapsed_time = time.time() - start_time
     formatted_elapsed_time = seconds_to_dhms(elapsed_time)
+    logger_safe(
+        "info",
+        "Storage of the prepared features - collect stats only: %s (%.3fs)",
+        formatted_elapsed_time, elapsed_time
+    )
 
-    # Log the execution time if logging is enabled.
-    if tdfs4ds.DISPLAY_LOGS:
-        print(f'Storage of the prepared features - collect stats only : {formatted_elapsed_time} ({elapsed_time}s)')
 
 
 def _store_feature_update_insert(entity_id, volatile_table_name, entity_null_substitute={},primary_index=None,
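The rewritten loop in apply_collect_stats builds one fully-qualified table name per grouped row and runs the COLLECT STATISTICS statements against it. A small pandas-only sketch of the grouping and iteration (table and database names are made up for illustration):

    import pandas as pd

    feature_infos = pd.DataFrame({
        'FEATURE_TABLE':    ['FS_T1', 'FS_T1', 'FS_T2'],
        'FEATURE_DATABASE': ['FS_DB', 'FS_DB', 'FS_DB'],
        'FEATURE_ID':       [1, 2, 3],
    })

    # One row per (table, database) pair, as in the hunk above.
    target_tables = feature_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID']].groupby(
        ['FEATURE_TABLE', 'FEATURE_DATABASE']
    ).count().reset_index()

    for _, row in target_tables.iterrows():
        table_fqn = f"{row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}"
        print(f"COLLECT STATISTICS ... ON {table_fqn}")  # execute_query(...) in the real code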
@@ -627,9 +601,8 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
     >>> store_feature(entity_id_dict, prepared_features)
     """
 
-    #
-
-    if tdfs4ds.FEATURE_STORE_TIME == None:
+    # VALIDTIME handling
+    if tdfs4ds.FEATURE_STORE_TIME is None:
         validtime_statement = 'CURRENT VALIDTIME'
         validtime_statement2 = validtime_statement
         validtime_start = 'CAST(CURRENT_TIME AS TIMESTAMP(0) WITH TIME ZONE)'
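The FEATURE_STORE_TIME check decides between Teradata's CURRENT VALIDTIME and an AS OF timestamp. A sketch of the same branch as a pure function (the function wrapper is illustrative; the clause strings are taken directly from the diff):

    def validtime_clauses(feature_store_time=None):
        # None -> current valid time; otherwise pin the view to a timestamp.
        if feature_store_time is None:
            statement = 'CURRENT VALIDTIME'
            start = 'CAST(CURRENT_TIME AS TIMESTAMP(0) WITH TIME ZONE)'
        else:
            statement = f"VALIDTIME AS OF TIMESTAMP '{feature_store_time}'"
            start = f"CAST('{feature_store_time}' AS TIMESTAMP(0) WITH TIME ZONE)"
        return statement, start

    print(validtime_clauses())
    print(validtime_clauses('2024-01-01 00:00:00'))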
@@ -638,180 +611,155 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
         validtime_statement2 = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
         validtime_start = f"CAST('{tdfs4ds.FEATURE_STORE_TIME}' AS TIMESTAMP(0) WITH TIME ZONE)"
 
-    if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED':
-        end_period_ = '9999-01-01 00:00:00'
-    else:
-        end_period_ = tdfs4ds.END_PERIOD
-
-    if tdfs4ds.DEBUG_MODE:
-        print('tdfs4ds.FEATURE_STORE_TIME :' , tdfs4ds.FEATURE_STORE_TIME)
-
-
-    if tdfs4ds.DEBUG_MODE:
-        print('entity_id :' , entity_id)
+    end_period_ = '9999-01-01 00:00:00' if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED' else tdfs4ds.END_PERIOD
 
-
-
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "FEATURE_STORE_TIME=%s | END_PERIOD=%s", tdfs4ds.FEATURE_STORE_TIME, tdfs4ds.END_PERIOD)
+        logger_safe("debug", "entity_id=%s", entity_id)
 
-
-
-
-        {volatile_table_name}
-        """).fetchall(), columns = ['NB_ROWS'])
+    # Entity id helpers
+    sorted_entity_id = sorted(list(entity_id.keys()))
+    ENTITY_ID = ','.join(sorted_entity_id)
 
-
-
-
+    # Count rows in volatile table
+    count_features = pd.DataFrame(
+        tdml.execute_sql(
+            f"""
+            SEL count(*) as NB_ROWS
+            FROM {_get_database_username()}.{volatile_table_name}
+            """
+        ).fetchall(),
+        columns=['NB_ROWS']
+    )
 
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "count_features=%s", count_features)
+        logger_safe("debug", "features_infos initial=%s", features_infos)
 
     if count_features.shape[0] > 0:
         features_infos['NB_ROWS'] = count_features['NB_ROWS'].values[0]
     else:
         features_infos['NB_ROWS'] = 0
 
-    if tdfs4ds.DEBUG_MODE:
-
-        # Group the target tables by feature table and feature database and count the number of occurrences
-        target_tables = features_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'NB_ROWS']].groupby(
-            ['FEATURE_TABLE', 'FEATURE_DATABASE']).sum().reset_index()
-
-        if tdfs4ds.DEBUG_MODE:
-            print('target_tables :' , target_tables)
-        if tdfs4ds.DISPLAY_LOGS:
-            display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "features_infos updated=%s", features_infos)
 
+    # Compute target tables
+    target_tables = features_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'NB_ROWS']].groupby(
+        ['FEATURE_TABLE', 'FEATURE_DATABASE']
+    ).sum().reset_index()
 
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "target_tables=%s", target_tables)
 
-
-
+    # Optional display (keep existing UX semantics)
+    if getattr(tdfs4ds, "DISPLAY_LOGS", False):
+        try:
+            display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
+        except Exception as e:
+            logger_safe("warning", "display_table failed: %s", str(e).split('\n')[0])
 
     ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k in sorted_entity_id])
-
     ENTITY_ID_SELECT = ', \n'.join(['NEW_FEATURES.' + k for k in sorted_entity_id])
-    # Iterate over target tables and perform update and insert operations
-
-
-    #query_collect_stats, query_collect_stats_extension = generate_collect_stats(sorted_entity_id,primary_index=primary_index, partitioning=partitioning)
-
 
     queries = []
-    for
-
-
-
-
-
-
-
+    for _, row in features_infos.iterrows():
+        features_infos_ = features_infos[
+            (features_infos.FEATURE_TABLE == row['FEATURE_TABLE']) &
+            (features_infos.FEATURE_DATABASE == row['FEATURE_DATABASE'])
+        ]
+        feature_id_list = ','.join([str(x) for x in sorted(set(features_infos_.FEATURE_ID.values))])
+        feature_version_list = ','.join(["'" + x + "'" for x in sorted(set(features_infos_.FEATURE_VERSION.values))])
+
+        # Build nested query
         nested_query = f"""
         SEL
-
-
-
-
-        FROM {_get_database_username()}.{volatile_table_name}
+        {ENTITY_ID}
+        , {row['FEATURE_ID']} AS FEATURE_ID
+        , {row['FEATURE_NAME']} AS FEATURE_VALUE
+        , '{row['FEATURE_VERSION']}' AS FEATURE_VERSION
+        FROM {_get_database_username()}.{volatile_table_name}
         """
 
-        if tdfs4ds.FEATURE_STORE_TIME == None:
+        if tdfs4ds.FEATURE_STORE_TIME is None:
             query_merge = f"""
             {validtime_statement}
-            MERGE INTO
-
+            MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
             USING ( {nested_query} ) NEW_FEATURES
-            ON {ENTITY_ID_ON}
+            ON {ENTITY_ID_ON}
             AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
             AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
             AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
             AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
             AND EXISTING_FEATURES.FEATURE_VERSION IN ({feature_version_list})
             WHEN MATCHED THEN
-                UPDATE
-                SET
-                    FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
+                UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
             WHEN NOT MATCHED THEN
                 INSERT
-                (
-
-
-
-
-
-                --'{end_period_}')
+                (
+                    {ENTITY_ID_SELECT},
+                    NEW_FEATURES.FEATURE_ID,
+                    NEW_FEATURES.FEATURE_VALUE,
+                    NEW_FEATURES.FEATURE_VERSION
+                )
             """
         else:
             query_merge = f"""
             {validtime_statement}
-            MERGE INTO
+            MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
             USING ( {nested_query} ) NEW_FEATURES
-            ON {ENTITY_ID_ON}
+            ON {ENTITY_ID_ON}
             AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
-            AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
-            AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
-            AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
+            AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
+            AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
+            AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
             AND EXISTING_FEATURES.FEATURE_VERSION IN ({feature_version_list})
             WHEN MATCHED THEN
-                UPDATE
-                SET
-                    FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
+                UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
             WHEN NOT MATCHED THEN
                 INSERT
-                (
-
-
-
-
-
+                (
+                    {ENTITY_ID_SELECT},
+                    NEW_FEATURES.FEATURE_ID,
+                    NEW_FEATURES.FEATURE_VALUE,
+                    NEW_FEATURES.FEATURE_VERSION,
+                    {validtime_start},
+                    '{end_period_}'
+                )
             """
 
-
-
-
-
-
+        if getattr(tdfs4ds, "DEBUG_MODE", False):
+            entity_id_str = ', '.join(sorted_entity_id)
+            logger_safe(
+                "debug",
+                "Merging feature values for entity keys (%s) into %s.%s",
+                entity_id_str, row['FEATURE_DATABASE'], row['FEATURE_TABLE']
+            )
+            logger_safe("debug", "Query (truncated): %s", "\n".join(query_merge.splitlines()[:12]) + "\n...")
 
         queries.append(query_merge)
 
-    query_merge = '; \n'.join(queries)
     try:
-        # Record the end time
         start_time = time.time()
 
         for q in queries:
-            if tdfs4ds
-
-            # Execute the SQL query to create the volatile table.
+            if getattr(tdfs4ds, "DEBUG_MODE", False):
+                logger_safe("debug", "Executing merge (head): %s", "\n".join(q.split('\n')[0:3]))
             execute_query(q)
-            #execute_query(query_merge)
-            # Record the end time
-            end_time = time.time()
 
-
-        elapsed_time = end_time - start_time
+        elapsed_time = time.time() - start_time
         formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-
-
+        logger_safe(
+            "info",
+            "Storage of prepared features (merge-only) completed in %s (%.3fs)",
+            formatted_elapsed_time, elapsed_time
+        )
     except Exception as e:
-
+        logger_safe("exception", "Feature storage (merge) failed: %s", str(e).split('\n')[0])
         raise
 
-    # # Record the end time
-    # start_time = time.time()
-    # for i, row in features_infos.iterrows():
-    #     execute_query(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
-    #     #print(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
-    #     if query_collect_stats_extension is not None:
-    #         execute_query(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
-    #         #print(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
-    # # Record the end time
-    # end_time = time.time()
-    #
-    # # Calculate the elapsed time in seconds
-    # elapsed_time = end_time - start_time
-    # formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-    # if tdfs4ds.DISPLAY_LOGS:
-    #     print(f'Storage of the prepared features - collect stats only : {formatted_elapsed_time} ({elapsed_time}s)')
     return
+
 
 def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},primary_index=None,
                   partitioning='', features_infos = None, **kwargs):
     """
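Each iteration of the loop above emits one Teradata MERGE per feature: new rows are matched against existing ones on the entity keys plus FEATURE_ID and FEATURE_VERSION, matches are updated in place and the rest inserted. A schematic sketch of how the ON clause and statement are assembled (table and column names are illustrative and the SQL body is abbreviated):

    sorted_entity_id = ['CUSTOMER_ID', 'TX_ID']  # illustrative entity keys

    ENTITY_ID_ON = ' AND '.join(
        f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k in sorted_entity_id
    )

    query_merge = f"""
    CURRENT VALIDTIME
    MERGE INTO FS_DB.FS_T1 EXISTING_FEATURES
    USING ( SEL CUSTOMER_ID, TX_ID, 1 AS FEATURE_ID ) NEW_FEATURES
    ON {ENTITY_ID_ON}
       AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
    WHEN MATCHED THEN
        UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
    WHEN NOT MATCHED THEN
        INSERT (NEW_FEATURES.CUSTOMER_ID, NEW_FEATURES.TX_ID, NEW_FEATURES.FEATURE_ID)
    """
    print(query_merge)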
@@ -832,27 +780,47 @@ def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},pr
     >>> store_feature(entity_id_dict, prepared_features)
     """
 
-    # Record the start time
     start_time = time.time()
 
+    # Choose storage strategy
     if tdfs4ds.STORE_FEATURE == 'UPDATE_INSERT':
-
-
+        logger_safe("info", "Storing features using UPDATE/INSERT strategy.")
+        _store_feature_update_insert(
+            entity_id,
+            volatile_table_name,
+            entity_null_substitute=entity_null_substitute,
+            primary_index=primary_index,
+            partitioning=partitioning,
+            features_infos=features_infos,
+            **kwargs
+        )
     elif tdfs4ds.STORE_FEATURE == 'MERGE':
-
-
+        logger_safe("info", "Storing features using MERGE strategy.")
+        _store_feature_merge(
+            entity_id,
+            volatile_table_name,
+            entity_null_substitute=entity_null_substitute,
+            primary_index=primary_index,
+            partitioning=partitioning,
+            features_infos=features_infos,
+            **kwargs
+        )
     else:
-
-
-
-
-
-
-        #
-    elapsed_time =
+        logger_safe(
+            "warning",
+            "Unknown STORE_FEATURE strategy '%s'. No storage operation was performed.",
+            tdfs4ds.STORE_FEATURE
+        )
+
+    # Log duration
+    elapsed_time = time.time() - start_time
     formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-
-
+    logger_safe(
+        "info",
+        "Storage of prepared features completed in %s (%.3fs)",
+        formatted_elapsed_time,
+        elapsed_time
+    )
 
 def prepare_feature_ingestion_tdstone2(df, entity_id):
     """
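store_feature now logs which strategy it dispatches to and warns, rather than failing silently, when tdfs4ds.STORE_FEATURE holds an unknown value. A compact sketch of that dispatch shape (the handlers are stubs, and the strategy is passed in directly instead of read from module state):

    import logging

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger("tdfs4ds")

    def _store_feature_update_insert(**kwargs):  # stub for the real UPDATE/INSERT path
        log.info("Storing features using UPDATE/INSERT strategy.")

    def _store_feature_merge(**kwargs):  # stub for the real MERGE path
        log.info("Storing features using MERGE strategy.")

    def store_feature(strategy, **kwargs):
        if strategy == 'UPDATE_INSERT':
            _store_feature_update_insert(**kwargs)
        elif strategy == 'MERGE':
            _store_feature_merge(**kwargs)
        else:
            log.warning("Unknown STORE_FEATURE strategy '%s'. Nothing stored.", strategy)

    store_feature('MERGE')
    store_feature('BULK')  # -> warning, no storage performed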