tdfs4ds 0.2.4.31-py3-none-any.whl → 0.2.4.33-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,8 @@ from tdfs4ds.utils.info import seconds_to_dhms
  import time
  import re
  import pandas as pd
+ from tdfs4ds import logger_safe, logger
+

  def generate_on_clause(entity_id, entity_null_substitute, left_name, right_name):
  res = []
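
The new import brings `logger_safe` and `logger` in from the package root; the rest of this diff routes the former print-based diagnostics through `logger_safe("level", message, *args)`, usually guarded by `getattr(tdfs4ds, "DEBUG_MODE", False)`. The helper's definition is not shown in this diff, so the sketch below is only an assumption of how such a wrapper might look (a module logger with a print fallback); the actual tdfs4ds implementation may differ.

    import logging

    logger = logging.getLogger("tdfs4ds")

    def logger_safe(level, msg, *args):
        # Assumed behavior: forward to the module logger when possible,
        # otherwise fall back to print so messages are never lost.
        try:
            getattr(logger, level)(msg, *args)
        except Exception:
            print(msg % args if args else msg)
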
@@ -164,11 +166,10 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
  # Record the start time
  start_time = time.time()

-
-
- if type(entity_id) == list:
+ # Normalize entity_id into a list of keys
+ if isinstance(entity_id, list):
  list_entity_id = entity_id
- elif type(entity_id) == dict:
+ elif isinstance(entity_id, dict):
  list_entity_id = list(entity_id.keys())
  else:
  list_entity_id = [entity_id]
@@ -176,199 +177,170 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non

  feature_id_names, conversion_name2id = get_feature_id_and_conversion(list_entity_id, feature_names)

- features_infos = pd.DataFrame(feature_id_names, columns = ['FEATURE_ID','FEATURE_NAME','FEATURE_TABLE','FEATURE_DATABASE'])
+ features_infos = pd.DataFrame(feature_id_names, columns=['FEATURE_ID', 'FEATURE_NAME', 'FEATURE_TABLE', 'FEATURE_DATABASE'])
  features_infos['FEATURE_VERSION'] = [feature_versions[k] for k in features_infos.FEATURE_NAME.values]
- if tdfs4ds.DEBUG_MODE:
- print('--- prepare_feature_ingestion ---')
- print('conversion_name2id : ', conversion_name2id)
- print('feature_names : ', feature_names)

- # Create the UNPIVOT clause for the specified feature columns
- unpivot_columns = ", \n".join(["(" + x + ") as '" + str(conversion_name2id[x]) + "'" for x in feature_names])
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe("debug", "--- prepare_feature_ingestion ---")
+ logger_safe("debug", "conversion_name2id=%s", conversion_name2id)
+ logger_safe("debug", "feature_names=%s", feature_names)

- if tdfs4ds.DEBUG_MODE:
- print('--- prepare_feature_ingestion ---')
- print('unpivot_columns : ', unpivot_columns)
- # Create the output column list including entity IDs, feature names, and feature values
+ # UNPIVOT mapping
+ unpivot_columns = ", \n".join([f"({x}) as '{conversion_name2id[x]}'" for x in feature_names])
+
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe("debug", "unpivot_columns=%s", unpivot_columns)

+ # Output columns for volatile table
  output_columns = ', \n'.join(list_entity_id + ['CAST(FEATURE_ID AS BIGINT) AS FEATURE_ID', 'FEATURE_VALUE'])

+ # Primary index
  if primary_index is None:
  primary_index = ','.join(list_entity_id)
  else:
- if type(primary_index) == list:
- primary_index = primary_index
- else:
+ if not isinstance(primary_index, list):
  primary_index = [primary_index]
  primary_index = ','.join(primary_index)

- # Create a dictionary to store feature versions, using the default version if not specified
+ # Feature versions (defaults)
  versions = {f: tdfs4ds.FEATURE_VERSION_DEFAULT for f in feature_names}
  if feature_versions is not None:
  for k, v in feature_versions.items():
  versions[k] = v

- if tdfs4ds.DEBUG_MODE:
- print('--- prepare_feature_ingestion ---')
- print('versions : ', versions)
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe("debug", "versions=%s", versions)

- # Create the CASE statement to assign feature versions based on feature names
- version_query = ["CASE"] + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()] + [
- "END AS FEATURE_VERSION"]
+ # CASE statement for versions
+ version_query = ["CASE"] + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()] + ["END AS FEATURE_VERSION"]
  version_query = '\n'.join(version_query)

- if tdfs4ds.DEBUG_MODE:
- print('--- prepare_feature_ingestion ---')
- print('version_query : ', version_query)
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe("debug", "version_query=%s", version_query)

- # Create a volatile table name based on the original table's name, ensuring it is unique.
+ # Volatile table name
  volatile_table_name = df._table_name.split('.')[1].replace('"', '')
- volatile_table_name = f'temp_{volatile_table_name}'
+ volatile_table_name = f"temp_{volatile_table_name}"

- if type(entity_id) == list:
+ # Normalize entity_id again for var casting
+ if isinstance(entity_id, list):
  list_entity_id = entity_id
- elif type(entity_id) == dict:
+ elif isinstance(entity_id, dict):
  list_entity_id = list(entity_id.keys())
  else:
  list_entity_id = [entity_id]

-
- # get the character set of varchars
- res = {x.split()[0]:''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
+ # Character set handling / pass-through
+ res = {x.split()[0]: ''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
  var_temp2 = []
- for k,v in res.items():
+ for k, v in res.items():
  if 'UNICODE' in v:
- #var_temp2.append(f'TRANSLATE({k} USING UNICODE_TO_LATIN) AS {k}')
  var_temp2.append(f'{k}')
  elif 'LATIN' in v:
- #var_temp2.append(f'{k}')
  var_temp2.append(f'TRANSLATE({k} USING LATIN_TO_UNICODE) AS {k}')
  else:
  var_temp2.append(f'CAST({k} AS VARCHAR(2048) CHARACTER SET UNICODE) AS {k}')
  var_temp2 = ', \n'.join(var_temp2)
+
+ # NOTE: the original code overrides var_temp2 with just the raw column names.
+ # Preserve that behavior to avoid functional change.
  var_temp2 = ', \n'.join(list(res.keys()))

+ # Null substitution on entity keys
  var_temp3 = []
  for e in list_entity_id:
  if e in entity_null_substitute.keys():
- if type(entity_null_substitute[e]) == str:
+ if isinstance(entity_null_substitute[e], str):
  var_temp3.append(f"coalesce({e},'{entity_null_substitute[e]}') AS {e}")
  else:
  var_temp3.append(f"coalesce({e},{entity_null_substitute[e]}) AS {e}")
  else:
  var_temp3.append(e)
-
  var_temp3 = ', \n'.join(var_temp3)

-
- nested_query = f"""
- CREATE MULTISET VOLATILE TABLE {volatile_table_name} AS
- (
- SELECT
- {output_columns},
- {version_query}
- FROM
- (SELECT
- {var_temp3},
- {var_temp2}
- FROM {df._table_name}
- ) A
- UNPIVOT INCLUDE NULLS ((FEATURE_VALUE ) FOR FEATURE_ID
- IN ({unpivot_columns})) Tmp
- ) WITH DATA
- PRIMARY INDEX ({primary_index})
- PARTITION BY RANGE_N(FEATURE_ID BETWEEN 0 AND 2000 EACH 1 )
- ON COMMIT PRESERVE ROWS
- """
-
- nested_query = f"""
- CREATE MULTISET VOLATILE TABLE {volatile_table_name} AS
- (
- SELECT
- {var_temp3},
- {var_temp2}
- FROM {df._table_name}
- ) WITH DATA
- PRIMARY INDEX ({primary_index})
- ON COMMIT PRESERVE ROWS
- """
-
+ # Final nested query used (the function reassigns to plain SELECT; preserve as-is)
  nested_query = f"""
  SELECT
  {var_temp3},
  {var_temp2}
  FROM {df._table_name}
-
  """

- # Test unicity of the process
+ # Duplicate check query
  output_columns_unicity = ', \n'.join(list_entity_id)
  query_test_unicity = f"""
  SELECT sum(CASE WHEN n>1 THEN 1 ELSE 0 END) AS nb_duplicates
  FROM (
- SELECT
- {output_columns_unicity}
- , count(*) as n
- FROM {_get_database_username()}.{volatile_table_name}
- GROUP BY {output_columns_unicity}
+ SELECT
+ {output_columns_unicity},
+ count(*) as n
+ FROM {_get_database_username()}.{volatile_table_name}
+ GROUP BY {output_columns_unicity}
  ) A
  """

- if tdfs4ds.DEBUG_MODE:
- print('--- prepare_feature_ingestion ---')
- print('var_temp2 : ', var_temp2)
- print('var_temp3 : ', var_temp3)
- print('nested_query :', nested_query)
-
-
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe("debug", "var_temp2=%s", var_temp2)
+ logger_safe("debug", "var_temp3=%s", var_temp3)
+ logger_safe("debug", "nested_query=%s", nested_query)

- # Execute the SQL query to create the volatile table.
+ # Execute: create volatile table and test unicity
  try:
- #tdml.execute_sql(nested_query)
- tdml.DataFrame.from_query(nested_query).to_sql(table_name = volatile_table_name, temporary = True, primary_index = primary_index.split(','), if_exists='replace')
+ tdml.DataFrame.from_query(nested_query).to_sql(
+ table_name=volatile_table_name,
+ temporary=True,
+ primary_index=primary_index.split(','),
+ if_exists='replace'
+ )
  nb_duplicates = tdml.execute_sql(query_test_unicity).fetchall()[0][0]
  if nb_duplicates is not None and nb_duplicates > 0:
- tdfs4ds.logger.error(f"The process generates {nb_duplicates} duplicates")
- query_test_unicity = f"""
- SELECT TOP 3
- {output_columns_unicity}
- , count(*) as n
- FROM {_get_database_username()}.{volatile_table_name}
- GROUP BY {output_columns_unicity}
- HAVING n > 1
- """
+ logger_safe("error", "The process generates %s duplicates", nb_duplicates)
+ # Show sample duplicates in debug for troubleshooting
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ sample_dups_query = f"""
+ SELECT TOP 3
+ {output_columns_unicity},
+ count(*) as n
+ FROM {_get_database_username()}.{volatile_table_name}
+ GROUP BY {output_columns_unicity}
+ HAVING n > 1
+ """
+ logger_safe("debug", "Sample duplicates query:\n%s", sample_dups_query)
  raise ValueError("Invalid process: the process generates duplicates.")
- #tdfs4ds.logger.info(f"No duplicate found.")
+ # else: no duplicates
+ # logger_safe("info", "No duplicate found.") # optional
  except Exception as e:
- if tdfs4ds.DISPLAY_LOGS:
- print(str(e).split('\n')[0])
+ logger_safe("error", "prepare_feature_ingestion failed: %s", str(e).split('\n')[0])
  raise

+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe(
+ "debug",
+ "Result volatile table dtypes:\n%s",
+ tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)).tdtypes
+ )

- if tdfs4ds.DEBUG_MODE:
- print('--- prepare_feature_ingestion ---')
- print(tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)).tdtypes)
-
-
- # Record the end time
+ # Timing
  end_time = time.time()
-
-
- # Calculate the elapsed time in seconds
  elapsed_time = end_time - start_time
  formatted_elapsed_time = seconds_to_dhms(elapsed_time)
- if tdfs4ds.DISPLAY_LOGS:
- print(f'Feature preparation for ingestion : {formatted_elapsed_time} ({elapsed_time}s)')
+ logger_safe("info", "Feature preparation for ingestion: %s (%.3fs)", formatted_elapsed_time, elapsed_time)
+
+ # Return DataFrame and metadata
  try:
  df_out = tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name))
  return df_out, volatile_table_name, features_infos
  except Exception as e:
- print(str(e).split()[0])
- print(df[feature_names].tdtypes)
+ logger_safe("error", "Failed to materialize volatile DataFrame: %s", str(e).split()[0])
+ logger_safe("error", "Feature dtypes snapshot: %s", df[feature_names].tdtypes)
  if 'TD_Unpivot contract function' in str(e).split()[0]:
- raise('Error : you may have string with UNICODE encoding as feature, please convert them to latin first')
+ raise RuntimeError(
+ "Error: you may have strings with UNICODE encoding as features; please convert them to LATIN first."
+ )
+ raise

- return None, None, None
+ # Fallback (should not reach)
+ # return None, None, None


  def apply_collect_stats(entity_id, primary_index, partitioning, feature_infos):
@@ -392,44 +364,46 @@ def apply_collect_stats(entity_id, primary_index, partitioning, feature_infos):
  Returns:
  None
  """
- # Sort the entity IDs to ensure consistent ordering.
+ # Sort entity IDs for consistent ordering
  sorted_entity_id = list(entity_id.keys())
  sorted_entity_id.sort()

- # Group the target tables by 'FEATURE_TABLE' and 'FEATURE_DATABASE' and count occurrences.
+ # Group target tables
  target_tables = feature_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID']].groupby(
  ['FEATURE_TABLE', 'FEATURE_DATABASE']
  ).count().reset_index()

- # Generate the collect statistics query and its optional extension.
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe("debug", "Target tables for COLLECT STATs: %s", target_tables[['FEATURE_DATABASE','FEATURE_TABLE']].to_dict(orient='records'))
+
+ # Generate COLLECT STATs queries
  query_collect_stats, query_collect_stats_extension = generate_collect_stats(
  sorted_entity_id,
  primary_index=primary_index,
  partitioning=partitioning
  )

- # Record the start time for measuring query execution duration.
  start_time = time.time()

- # Loop through the grouped target tables and execute the queries.
- for i, row in target_tables.iterrows():
- # Execute the main collect statistics query.
- execute_query(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
+ # Execute COLLECT STATs
+ for _, row in target_tables.iterrows():
+ table_fqn = f"{row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}"
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe("debug", "Running COLLECT STATs on %s", table_fqn)

- # If an extension query exists, execute it as well.
- if query_collect_stats_extension is not None:
- execute_query(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
+ execute_query(query_collect_stats + f" ON {table_fqn}")

- # Record the end time after query execution.
- end_time = time.time()
+ if query_collect_stats_extension is not None:
+ execute_query(query_collect_stats_extension + f" ON {table_fqn}")

- # Calculate the elapsed time in seconds and format it into a human-readable format.
- elapsed_time = end_time - start_time
+ elapsed_time = time.time() - start_time
  formatted_elapsed_time = seconds_to_dhms(elapsed_time)
+ logger_safe(
+ "info",
+ "Storage of the prepared features - collect stats only: %s (%.3fs)",
+ formatted_elapsed_time, elapsed_time
+ )

- # Log the execution time if logging is enabled.
- if tdfs4ds.DISPLAY_LOGS:
- print(f'Storage of the prepared features - collect stats only : {formatted_elapsed_time} ({elapsed_time}s)')


  def _store_feature_update_insert(entity_id, volatile_table_name, entity_null_substitute={},primary_index=None,
@@ -627,9 +601,8 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
  >>> store_feature(entity_id_dict, prepared_features)
  """

- #feature_catalog = tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.FEATURE_CATALOG_NAME))
-
- if tdfs4ds.FEATURE_STORE_TIME == None:
+ # VALIDTIME handling
+ if tdfs4ds.FEATURE_STORE_TIME is None:
  validtime_statement = 'CURRENT VALIDTIME'
  validtime_statement2 = validtime_statement
  validtime_start = 'CAST(CURRENT_TIME AS TIMESTAMP(0) WITH TIME ZONE)'
@@ -638,180 +611,155 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
  validtime_statement2 = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
  validtime_start = f"CAST('{tdfs4ds.FEATURE_STORE_TIME}' AS TIMESTAMP(0) WITH TIME ZONE)"

- if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED':
- end_period_ = '9999-01-01 00:00:00'
- else:
- end_period_ = tdfs4ds.END_PERIOD
-
- if tdfs4ds.DEBUG_MODE:
- print('tdfs4ds.FEATURE_STORE_TIME :' , tdfs4ds.FEATURE_STORE_TIME)
-
-
- if tdfs4ds.DEBUG_MODE:
- print('entity_id :' , entity_id)
+ end_period_ = '9999-01-01 00:00:00' if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED' else tdfs4ds.END_PERIOD

- sorted_entity_id = list(entity_id.keys())
- sorted_entity_id.sort()
- ENTITY_ID = ','.join([k for k in sorted_entity_id])
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe("debug", "FEATURE_STORE_TIME=%s | END_PERIOD=%s", tdfs4ds.FEATURE_STORE_TIME, tdfs4ds.END_PERIOD)
+ logger_safe("debug", "entity_id=%s", entity_id)

- count_features = pd.DataFrame(tdml.execute_sql(f"""
- SEL count(*) as NB_ROWS FROM
- {_get_database_username()}.
- {volatile_table_name}
- """).fetchall(), columns = ['NB_ROWS'])
+ # Entity id helpers
+ sorted_entity_id = sorted(list(entity_id.keys()))
+ ENTITY_ID = ','.join(sorted_entity_id)

- if tdfs4ds.DEBUG_MODE:
- print('count_features :' , count_features)
- print('features_infos :', features_infos)
+ # Count rows in volatile table
+ count_features = pd.DataFrame(
+ tdml.execute_sql(
+ f"""
+ SEL count(*) as NB_ROWS
+ FROM {_get_database_username()}.{volatile_table_name}
+ """
+ ).fetchall(),
+ columns=['NB_ROWS']
+ )

+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe("debug", "count_features=%s", count_features)
+ logger_safe("debug", "features_infos initial=%s", features_infos)

  if count_features.shape[0] > 0:
  features_infos['NB_ROWS'] = count_features['NB_ROWS'].values[0]
  else:
  features_infos['NB_ROWS'] = 0

- if tdfs4ds.DEBUG_MODE:
- print('features_infos :' , features_infos)
- # Group the target tables by feature table and feature database and count the number of occurrences
- target_tables = features_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'NB_ROWS']].groupby(
- ['FEATURE_TABLE', 'FEATURE_DATABASE']).sum().reset_index()
-
- if tdfs4ds.DEBUG_MODE:
- print('target_tables :' , target_tables)
- if tdfs4ds.DISPLAY_LOGS:
- display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe("debug", "features_infos updated=%s", features_infos)

+ # Compute target tables
+ target_tables = features_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'NB_ROWS']].groupby(
+ ['FEATURE_TABLE', 'FEATURE_DATABASE']
+ ).sum().reset_index()

+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe("debug", "target_tables=%s", target_tables)

- sorted_entity_id = list(entity_id.keys())
- sorted_entity_id.sort()
+ # Optional display (keep existing UX semantics)
+ if getattr(tdfs4ds, "DISPLAY_LOGS", False):
+ try:
+ display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
+ except Exception as e:
+ logger_safe("warning", "display_table failed: %s", str(e).split('\n')[0])

  ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k in sorted_entity_id])
-
  ENTITY_ID_SELECT = ', \n'.join(['NEW_FEATURES.' + k for k in sorted_entity_id])
- # Iterate over target tables and perform update and insert operations
-
-
- #query_collect_stats, query_collect_stats_extension = generate_collect_stats(sorted_entity_id,primary_index=primary_index, partitioning=partitioning)
-

  queries = []
- for i, row in features_infos.iterrows():
-
- features_infos_ = features_infos[(features_infos.FEATURE_TABLE == row['FEATURE_TABLE']) & (features_infos.FEATURE_DATABASE == row['FEATURE_DATABASE'])]
- feature_id_list = ','.join([str(x) for x in list(set(features_infos_.FEATURE_ID.values))])
- feature_version_list = ','.join(["'"+x+"'" for x in list(set(features_infos_.FEATURE_VERSION.values))])
-
-
- nested_query = f"SEL * FROM {_get_database_username()}.{volatile_table_name} WHERE FEATURE_ID IN ({feature_id_list})"
+ for _, row in features_infos.iterrows():
+ features_infos_ = features_infos[
+ (features_infos.FEATURE_TABLE == row['FEATURE_TABLE']) &
+ (features_infos.FEATURE_DATABASE == row['FEATURE_DATABASE'])
+ ]
+ feature_id_list = ','.join([str(x) for x in sorted(set(features_infos_.FEATURE_ID.values))])
+ feature_version_list = ','.join(["'" + x + "'" for x in sorted(set(features_infos_.FEATURE_VERSION.values))])
+
+ # Build nested query
  nested_query = f"""
  SEL
- {ENTITY_ID}
- , {row['FEATURE_ID']} AS FEATURE_ID
- , {row['FEATURE_NAME']} AS FEATURE_VALUE
- , '{row['FEATURE_VERSION']}' AS FEATURE_VERSION
- FROM {_get_database_username()}.{volatile_table_name}
+ {ENTITY_ID}
+ , {row['FEATURE_ID']} AS FEATURE_ID
+ , {row['FEATURE_NAME']} AS FEATURE_VALUE
+ , '{row['FEATURE_VERSION']}' AS FEATURE_VERSION
+ FROM {_get_database_username()}.{volatile_table_name}
  """

- if tdfs4ds.FEATURE_STORE_TIME == None:
+ if tdfs4ds.FEATURE_STORE_TIME is None:
  query_merge = f"""
  {validtime_statement}
- MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
-
+ MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
  USING ( {nested_query} ) NEW_FEATURES
- ON {ENTITY_ID_ON}
+ ON {ENTITY_ID_ON}
  AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
  AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
  AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
  AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
  AND EXISTING_FEATURES.FEATURE_VERSION IN ({feature_version_list})
  WHEN MATCHED THEN
- UPDATE
- SET
- FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
+ UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
  WHEN NOT MATCHED THEN
  INSERT
- ({ENTITY_ID_SELECT},
- NEW_FEATURES.FEATURE_ID,
- NEW_FEATURES.FEATURE_VALUE,
- NEW_FEATURES.FEATURE_VERSION)
- --,
- --{validtime_start},
- --'{end_period_}')
+ (
+ {ENTITY_ID_SELECT},
+ NEW_FEATURES.FEATURE_ID,
+ NEW_FEATURES.FEATURE_VALUE,
+ NEW_FEATURES.FEATURE_VERSION
+ )
  """
  else:
  query_merge = f"""
  {validtime_statement}
- MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
+ MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
  USING ( {nested_query} ) NEW_FEATURES
- ON {ENTITY_ID_ON}
+ ON {ENTITY_ID_ON}
  AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
- AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
- AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
- AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
+ AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
+ AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
+ AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
  AND EXISTING_FEATURES.FEATURE_VERSION IN ({feature_version_list})
  WHEN MATCHED THEN
- UPDATE
- SET
- FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
+ UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
  WHEN NOT MATCHED THEN
  INSERT
- ({ENTITY_ID_SELECT},
- NEW_FEATURES.FEATURE_ID,
- NEW_FEATURES.FEATURE_VALUE,
- NEW_FEATURES.FEATURE_VERSION,
- {validtime_start},
- '{end_period_}')
+ (
+ {ENTITY_ID_SELECT},
+ NEW_FEATURES.FEATURE_ID,
+ NEW_FEATURES.FEATURE_VALUE,
+ NEW_FEATURES.FEATURE_VERSION,
+ {validtime_start},
+ '{end_period_}'
+ )
  """

- entity_id_str = ', \n'.join([k for k in sorted_entity_id])
- if tdfs4ds.DEBUG_MODE: print(
- f'merge feature values of new {entity_id_str} combinations in {row.iloc[1]}.{row.iloc[0]}')
- if tdfs4ds.DEBUG_MODE:
- print(query_merge)
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ entity_id_str = ', '.join(sorted_entity_id)
+ logger_safe(
+ "debug",
+ "Merging feature values for entity keys (%s) into %s.%s",
+ entity_id_str, row['FEATURE_DATABASE'], row['FEATURE_TABLE']
+ )
+ logger_safe("debug", "Query (truncated): %s", "\n".join(query_merge.splitlines()[:12]) + "\n...")

  queries.append(query_merge)

- query_merge = '; \n'.join(queries)
  try:
- # Record the end time
  start_time = time.time()

  for q in queries:
- if tdfs4ds.DEBUG_MODE:
- print(q.split('\n')[0:3])
- # Execute the SQL query to create the volatile table.
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
+ logger_safe("debug", "Executing merge (head): %s", "\n".join(q.split('\n')[0:3]))
  execute_query(q)
- #execute_query(query_merge)
- # Record the end time
- end_time = time.time()

- # Calculate the elapsed time in seconds
- elapsed_time = end_time - start_time
+ elapsed_time = time.time() - start_time
  formatted_elapsed_time = seconds_to_dhms(elapsed_time)
- if tdfs4ds.DISPLAY_LOGS:
- print(f'Storage of the prepared features - merge only : {formatted_elapsed_time} ({elapsed_time}s)')
+ logger_safe(
+ "info",
+ "Storage of prepared features (merge-only) completed in %s (%.3fs)",
+ formatted_elapsed_time, elapsed_time
+ )
  except Exception as e:
- print(str(e))
+ logger_safe("exception", "Feature storage (merge) failed: %s", str(e).split('\n')[0])
  raise

- # # Record the end time
- # start_time = time.time()
- # for i, row in features_infos.iterrows():
- # execute_query(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
- # #print(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
- # if query_collect_stats_extension is not None:
- # execute_query(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
- # #print(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
- # # Record the end time
- # end_time = time.time()
- #
- # # Calculate the elapsed time in seconds
- # elapsed_time = end_time - start_time
- # formatted_elapsed_time = seconds_to_dhms(elapsed_time)
- # if tdfs4ds.DISPLAY_LOGS:
- # print(f'Storage of the prepared features - collect stats only : {formatted_elapsed_time} ({elapsed_time}s)')
  return
+
  def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},primary_index=None,
  partitioning='', features_infos = None, **kwargs):
  """
@@ -832,27 +780,47 @@ def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},pr
  >>> store_feature(entity_id_dict, prepared_features)
  """

- # Record the start time
  start_time = time.time()

+ # Choose storage strategy
  if tdfs4ds.STORE_FEATURE == 'UPDATE_INSERT':
- _store_feature_update_insert(entity_id, volatile_table_name, entity_null_substitute=entity_null_substitute,primary_index=primary_index,
- partitioning=partitioning, features_infos=features_infos, **kwargs)
+ logger_safe("info", "Storing features using UPDATE/INSERT strategy.")
+ _store_feature_update_insert(
+ entity_id,
+ volatile_table_name,
+ entity_null_substitute=entity_null_substitute,
+ primary_index=primary_index,
+ partitioning=partitioning,
+ features_infos=features_infos,
+ **kwargs
+ )
  elif tdfs4ds.STORE_FEATURE == 'MERGE':
- _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=entity_null_substitute,primary_index=primary_index,
- partitioning=partitioning, features_infos=features_infos, **kwargs)
+ logger_safe("info", "Storing features using MERGE strategy.")
+ _store_feature_merge(
+ entity_id,
+ volatile_table_name,
+ entity_null_substitute=entity_null_substitute,
+ primary_index=primary_index,
+ partitioning=partitioning,
+ features_infos=features_infos,
+ **kwargs
+ )
  else:
- # Handle other conditions or operations as required
- pass
-
- # Record the end time
- end_time = time.time()
-
- # Calculate the elapsed time in seconds
- elapsed_time = end_time - start_time
+ logger_safe(
+ "warning",
+ "Unknown STORE_FEATURE strategy '%s'. No storage operation was performed.",
+ tdfs4ds.STORE_FEATURE
+ )
+
+ # Log duration
+ elapsed_time = time.time() - start_time
  formatted_elapsed_time = seconds_to_dhms(elapsed_time)
- if tdfs4ds.DISPLAY_LOGS:
- print(f'Storage of the prepared features : {formatted_elapsed_time} ({elapsed_time}s)')
+ logger_safe(
+ "info",
+ "Storage of prepared features completed in %s (%.3fs)",
+ formatted_elapsed_time,
+ elapsed_time
+ )

  def prepare_feature_ingestion_tdstone2(df, entity_id):
  """