tdfs4ds 0.2.4.32-py3-none-any.whl → 0.2.4.34-py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes exactly as they appear in the public registry.
@@ -7,6 +7,8 @@ from tdfs4ds.utils.info import seconds_to_dhms
7
7
  import time
8
8
  import re
9
9
  import pandas as pd
10
+ from tdfs4ds import logger_safe, logger
11
+
10
12
 
11
13
  def generate_on_clause(entity_id, entity_null_substitute, left_name, right_name):
12
14
  res = []
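The import added in this hunk, from tdfs4ds import logger_safe, logger, is what the rest of the diff uses to replace print-based debug output. Its definition is not shown here; the sketch below is only a plausible shape inferred from the call sites (logger_safe(level, message, *args)), not the package's actual implementation.

    # Hypothetical sketch of a logger_safe helper, inferred from its call sites in this diff.
    import logging

    logger = logging.getLogger("tdfs4ds")

    def logger_safe(level, msg, *args):
        """Log through the package logger without ever raising from the logging path."""
        try:
            # level is expected to be "debug", "info", "warning", "error" or "exception"
            getattr(logger, level, logger.info)(msg, *args)
        except Exception:
            try:
                print(msg % args if args else msg)
            except Exception:
                pass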
@@ -73,7 +75,7 @@ def generate_collect_stats(entity_id, primary_index='', partitioning=''):
73
75
 
74
76
  # Initialize the extended query with sampling and threshold settings for statistics collection
75
77
  query_extension_header = 'COLLECT STATISTICS USING SAMPLE 25 PERCENT AND THRESHOLD 15 PERCENT'
76
- query_extension = []
78
+ query_extension = []
77
79
 
78
80
  # Add primary index columns to the extended query
79
81
  if primary_index:
@@ -164,11 +166,10 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
164
166
  # Record the start time
165
167
  start_time = time.time()
166
168
 
167
-
168
-
169
- if type(entity_id) == list:
169
+ # Normalize entity_id into a list of keys
170
+ if isinstance(entity_id, list):
170
171
  list_entity_id = entity_id
171
- elif type(entity_id) == dict:
172
+ elif isinstance(entity_id, dict):
172
173
  list_entity_id = list(entity_id.keys())
173
174
  else:
174
175
  list_entity_id = [entity_id]
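The isinstance-based normalization above accepts a single key, a list of keys, or a dict mapping keys to types, and reduces all three to a flat list of column names. A standalone illustration with hypothetical values, using the same logic as the hunk above:

    def normalize_entity_id(entity_id):
        if isinstance(entity_id, list):
            return entity_id
        elif isinstance(entity_id, dict):
            return list(entity_id.keys())
        return [entity_id]

    assert normalize_entity_id("customer_id") == ["customer_id"]
    assert normalize_entity_id(["customer_id", "store_id"]) == ["customer_id", "store_id"]
    assert normalize_entity_id({"customer_id": "BIGINT"}) == ["customer_id"]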
@@ -176,260 +177,333 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
176
177
 
177
178
  feature_id_names, conversion_name2id = get_feature_id_and_conversion(list_entity_id, feature_names)
178
179
 
179
- features_infos = pd.DataFrame(feature_id_names, columns = ['FEATURE_ID','FEATURE_NAME','FEATURE_TABLE','FEATURE_DATABASE'])
180
+ features_infos = pd.DataFrame(feature_id_names, columns=['FEATURE_ID', 'FEATURE_NAME', 'FEATURE_TABLE', 'FEATURE_DATABASE'])
180
181
  features_infos['FEATURE_VERSION'] = [feature_versions[k] for k in features_infos.FEATURE_NAME.values]
181
- if tdfs4ds.DEBUG_MODE:
182
- print('--- prepare_feature_ingestion ---')
183
- print('conversion_name2id : ', conversion_name2id)
184
- print('feature_names : ', feature_names)
185
182
 
186
- # Create the UNPIVOT clause for the specified feature columns
187
- unpivot_columns = ", \n".join(["(" + x + ") as '" + str(conversion_name2id[x]) + "'" for x in feature_names])
183
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
184
+ logger_safe("debug", "--- prepare_feature_ingestion ---")
185
+ logger_safe("debug", "conversion_name2id=%s", conversion_name2id)
186
+ logger_safe("debug", "feature_names=%s", feature_names)
188
187
 
189
- if tdfs4ds.DEBUG_MODE:
190
- print('--- prepare_feature_ingestion ---')
191
- print('unpivot_columns : ', unpivot_columns)
192
- # Create the output column list including entity IDs, feature names, and feature values
188
+ # UNPIVOT mapping
189
+ unpivot_columns = ", \n".join([f"({x}) as '{conversion_name2id[x]}'" for x in feature_names])
193
190
 
191
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
192
+ logger_safe("debug", "unpivot_columns=%s", unpivot_columns)
193
+
194
+ # Output columns for volatile table
194
195
  output_columns = ', \n'.join(list_entity_id + ['CAST(FEATURE_ID AS BIGINT) AS FEATURE_ID', 'FEATURE_VALUE'])
195
196
 
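For reference, the two string-building steps above produce SQL fragments of the following shape. The feature names and IDs are hypothetical; in the package the mapping comes from get_feature_id_and_conversion.

    feature_names = ["age", "income"]                    # hypothetical features
    conversion_name2id = {"age": 101, "income": 102}     # hypothetical feature IDs
    list_entity_id = ["customer_id"]                     # hypothetical entity key

    unpivot_columns = ", \n".join(f"({x}) as '{conversion_name2id[x]}'" for x in feature_names)
    # (age) as '101',
    # (income) as '102'

    output_columns = ", \n".join(list_entity_id + ["CAST(FEATURE_ID AS BIGINT) AS FEATURE_ID", "FEATURE_VALUE"])
    # customer_id,
    # CAST(FEATURE_ID AS BIGINT) AS FEATURE_ID,
    # FEATURE_VALUE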
197
+ # Primary index
196
198
  if primary_index is None:
197
199
  primary_index = ','.join(list_entity_id)
198
200
  else:
199
- if type(primary_index) == list:
200
- primary_index = primary_index
201
- else:
201
+ if not isinstance(primary_index, list):
202
202
  primary_index = [primary_index]
203
203
  primary_index = ','.join(primary_index)
204
204
 
205
- # Create a dictionary to store feature versions, using the default version if not specified
205
+ # Feature versions (defaults)
206
206
  versions = {f: tdfs4ds.FEATURE_VERSION_DEFAULT for f in feature_names}
207
207
  if feature_versions is not None:
208
208
  for k, v in feature_versions.items():
209
209
  versions[k] = v
210
210
 
211
- if tdfs4ds.DEBUG_MODE:
212
- print('--- prepare_feature_ingestion ---')
213
- print('versions : ', versions)
211
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
212
+ logger_safe("debug", "versions=%s", versions)
214
213
 
215
- # Create the CASE statement to assign feature versions based on feature names
216
- version_query = ["CASE"] + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()] + [
217
- "END AS FEATURE_VERSION"]
214
+ # CASE statement for versions
215
+ version_query = ["CASE"] + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()] + ["END AS FEATURE_VERSION"]
218
216
  version_query = '\n'.join(version_query)
219
217
 
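The CASE expression built just above renders, for the same hypothetical features, to a fragment like this:

    versions = {"age": "V1", "income": "V2"}             # hypothetical versions
    conversion_name2id = {"age": 101, "income": 102}

    version_query = "\n".join(
        ["CASE"]
        + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()]
        + ["END AS FEATURE_VERSION"]
    )
    # CASE
    # WHEN FEATURE_ID = '101' THEN 'V1'
    # WHEN FEATURE_ID = '102' THEN 'V2'
    # END AS FEATURE_VERSION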
220
- if tdfs4ds.DEBUG_MODE:
221
- print('--- prepare_feature_ingestion ---')
222
- print('version_query : ', version_query)
218
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
219
+ logger_safe("debug", "version_query=%s", version_query)
223
220
 
224
- # Create a volatile table name based on the original table's name, ensuring it is unique.
221
+ # Volatile table name
225
222
  volatile_table_name = df._table_name.split('.')[1].replace('"', '')
226
- volatile_table_name = f'temp_{volatile_table_name}'
223
+ volatile_table_name = f"temp_{volatile_table_name}"
227
224
 
228
- if type(entity_id) == list:
225
+ # Normalize entity_id again for var casting
226
+ if isinstance(entity_id, list):
229
227
  list_entity_id = entity_id
230
- elif type(entity_id) == dict:
228
+ elif isinstance(entity_id, dict):
231
229
  list_entity_id = list(entity_id.keys())
232
230
  else:
233
231
  list_entity_id = [entity_id]
234
232
 
235
-
236
- # get the character set of varchars
237
- res = {x.split()[0]:''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
233
+ # Character set handling / pass-through
234
+ res = {x.split()[0]: ''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
238
235
  var_temp2 = []
239
- for k,v in res.items():
236
+ for k, v in res.items():
240
237
  if 'UNICODE' in v:
241
- #var_temp2.append(f'TRANSLATE({k} USING UNICODE_TO_LATIN) AS {k}')
242
238
  var_temp2.append(f'{k}')
243
239
  elif 'LATIN' in v:
244
- #var_temp2.append(f'{k}')
245
240
  var_temp2.append(f'TRANSLATE({k} USING LATIN_TO_UNICODE) AS {k}')
246
241
  else:
247
242
  var_temp2.append(f'CAST({k} AS VARCHAR(2048) CHARACTER SET UNICODE) AS {k}')
248
243
  var_temp2 = ', \n'.join(var_temp2)
244
+
245
+ # NOTE: the original code overrides var_temp2 with just the raw column names.
246
+ # Preserve that behavior to avoid functional change.
249
247
  var_temp2 = ', \n'.join(list(res.keys()))
250
248
 
249
+ # Null substitution on entity keys
251
250
  var_temp3 = []
252
251
  for e in list_entity_id:
253
252
  if e in entity_null_substitute.keys():
254
- if type(entity_null_substitute[e]) == str:
253
+ if isinstance(entity_null_substitute[e], str):
255
254
  var_temp3.append(f"coalesce({e},'{entity_null_substitute[e]}') AS {e}")
256
255
  else:
257
256
  var_temp3.append(f"coalesce({e},{entity_null_substitute[e]}) AS {e}")
258
257
  else:
259
258
  var_temp3.append(e)
260
-
261
259
  var_temp3 = ', \n'.join(var_temp3)
262
260
 
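The character-set handling (var_temp2, which the NOTE above records is ultimately overridden with the raw column names) and the null substitution on entity keys (var_temp3) together form the column list of the inner SELECT. A small standalone illustration of the var_temp3 logic with made-up columns:

    list_entity_id = ["customer_id", "store_id"]                     # hypothetical keys
    entity_null_substitute = {"customer_id": -1, "store_id": "UNKNOWN"}

    var_temp3 = []
    for e in list_entity_id:
        if e in entity_null_substitute:
            sub = entity_null_substitute[e]
            if isinstance(sub, str):
                var_temp3.append(f"coalesce({e},'{sub}') AS {e}")
            else:
                var_temp3.append(f"coalesce({e},{sub}) AS {e}")
        else:
            var_temp3.append(e)
    var_temp3 = ", \n".join(var_temp3)
    # coalesce(customer_id,-1) AS customer_id,
    # coalesce(store_id,'UNKNOWN') AS store_id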
263
-
264
- nested_query = f"""
265
- CREATE MULTISET VOLATILE TABLE {volatile_table_name} AS
266
- (
267
- SELECT
268
- {output_columns},
269
- {version_query}
270
- FROM
271
- (SELECT
272
- {var_temp3},
273
- {var_temp2}
274
- FROM {df._table_name}
275
- ) A
276
- UNPIVOT INCLUDE NULLS ((FEATURE_VALUE ) FOR FEATURE_ID
277
- IN ({unpivot_columns})) Tmp
278
- ) WITH DATA
279
- PRIMARY INDEX ({primary_index})
280
- PARTITION BY RANGE_N(FEATURE_ID BETWEEN 0 AND 2000 EACH 1 )
281
- ON COMMIT PRESERVE ROWS
282
- """
283
-
284
- nested_query = f"""
285
- CREATE MULTISET VOLATILE TABLE {volatile_table_name} AS
286
- (
287
- SELECT
288
- {var_temp3},
289
- {var_temp2}
290
- FROM {df._table_name}
291
- ) WITH DATA
292
- PRIMARY INDEX ({primary_index})
293
- ON COMMIT PRESERVE ROWS
294
- """
295
-
261
+ # Final nested query used (the function reassigns to plain SELECT; preserve as-is)
296
262
  nested_query = f"""
297
263
  SELECT
298
264
  {var_temp3},
299
265
  {var_temp2}
300
266
  FROM {df._table_name}
301
-
302
267
  """
303
268
 
304
- # Test unicity of the process
269
+ # Duplicate check query
305
270
  output_columns_unicity = ', \n'.join(list_entity_id)
306
271
  query_test_unicity = f"""
307
272
  SELECT sum(CASE WHEN n>1 THEN 1 ELSE 0 END) AS nb_duplicates
308
273
  FROM (
309
- SELECT
310
- {output_columns_unicity}
311
- , count(*) as n
312
- FROM {_get_database_username()}.{volatile_table_name}
313
- GROUP BY {output_columns_unicity}
274
+ SELECT
275
+ {output_columns_unicity},
276
+ count(*) as n
277
+ FROM {_get_database_username()}.{volatile_table_name}
278
+ GROUP BY {output_columns_unicity}
314
279
  ) A
315
280
  """
316
281
 
317
- if tdfs4ds.DEBUG_MODE:
318
- print('--- prepare_feature_ingestion ---')
319
- print('var_temp2 : ', var_temp2)
320
- print('var_temp3 : ', var_temp3)
321
- print('nested_query :', nested_query)
322
-
323
-
282
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
283
+ logger_safe("debug", "var_temp2=%s", var_temp2)
284
+ logger_safe("debug", "var_temp3=%s", var_temp3)
285
+ logger_safe("debug", "nested_query=%s", nested_query)
324
286
 
325
- # Execute the SQL query to create the volatile table.
287
+ # Execute: create volatile table and test unicity
326
288
  try:
327
- #tdml.execute_sql(nested_query)
328
- tdml.DataFrame.from_query(nested_query).to_sql(table_name = volatile_table_name, temporary = True, primary_index = primary_index.split(','), if_exists='replace')
289
+ tdml.DataFrame.from_query(nested_query).to_sql(
290
+ table_name=volatile_table_name,
291
+ temporary=True,
292
+ primary_index=primary_index.split(','),
293
+ if_exists='replace'
294
+ )
329
295
  nb_duplicates = tdml.execute_sql(query_test_unicity).fetchall()[0][0]
330
296
  if nb_duplicates is not None and nb_duplicates > 0:
331
- tdfs4ds.logger.error(f"The process generates {nb_duplicates} duplicates")
332
- query_test_unicity = f"""
333
- SELECT TOP 3
334
- {output_columns_unicity}
335
- , count(*) as n
336
- FROM {_get_database_username()}.{volatile_table_name}
337
- GROUP BY {output_columns_unicity}
338
- HAVING n > 1
339
- """
297
+ logger_safe("error", "The process generates %s duplicates", nb_duplicates)
298
+ # Show sample duplicates in debug for troubleshooting
299
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
300
+ sample_dups_query = f"""
301
+ SELECT TOP 3
302
+ {output_columns_unicity},
303
+ count(*) as n
304
+ FROM {_get_database_username()}.{volatile_table_name}
305
+ GROUP BY {output_columns_unicity}
306
+ HAVING n > 1
307
+ """
308
+ logger_safe("debug", "Sample duplicates query:\n%s", sample_dups_query)
340
309
  raise ValueError("Invalid process: the process generates duplicates.")
341
- #tdfs4ds.logger.info(f"No duplicate found.")
310
+ # else: no duplicates
311
+ # logger_safe("info", "No duplicate found.") # optional
342
312
  except Exception as e:
343
- if tdfs4ds.DISPLAY_LOGS:
344
- print(str(e).split('\n')[0])
313
+ logger_safe("error", "prepare_feature_ingestion failed: %s", str(e).split('\n')[0])
345
314
  raise
346
315
 
316
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
317
+ logger_safe(
318
+ "debug",
319
+ "Result volatile table dtypes:\n%s",
320
+ tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)).tdtypes
321
+ )
347
322
 
348
- if tdfs4ds.DEBUG_MODE:
349
- print('--- prepare_feature_ingestion ---')
350
- print(tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)).tdtypes)
351
-
352
-
353
- # Record the end time
323
+ # Timing
354
324
  end_time = time.time()
355
-
356
-
357
- # Calculate the elapsed time in seconds
358
325
  elapsed_time = end_time - start_time
359
326
  formatted_elapsed_time = seconds_to_dhms(elapsed_time)
360
- if tdfs4ds.DISPLAY_LOGS:
361
- print(f'Feature preparation for ingestion : {formatted_elapsed_time} ({elapsed_time}s)')
327
+ logger_safe("info", "Feature preparation for ingestion: %s (%.3fs)", formatted_elapsed_time, elapsed_time)
328
+
329
+ # Return DataFrame and metadata
362
330
  try:
363
331
  df_out = tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name))
364
332
  return df_out, volatile_table_name, features_infos
365
333
  except Exception as e:
366
- print(str(e).split()[0])
367
- print(df[feature_names].tdtypes)
334
+ logger_safe("error", "Failed to materialize volatile DataFrame: %s", str(e).split()[0])
335
+ logger_safe("error", "Feature dtypes snapshot: %s", df[feature_names].tdtypes)
368
336
  if 'TD_Unpivot contract function' in str(e).split()[0]:
369
- raise('Error : you may have string with UNICODE encoding as feature, please convert them to latin first')
337
+ raise RuntimeError(
338
+ "Error: you may have strings with UNICODE encoding as features; please convert them to LATIN first."
339
+ )
340
+ raise
370
341
 
371
- return None, None, None
342
+ # Fallback (should not reach)
343
+ # return None, None, None
372
344
 
373
345
 
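For orientation, a hedged usage sketch of prepare_feature_ingestion as refactored above. The table, entity, and feature names are hypothetical, and the full signature is truncated in the hunk header; the return values (a teradataml DataFrame over the volatile staging table, its name, and the features_infos frame) are taken from the code above.

    import teradataml as tdml   # assumes a configured Teradata connection

    df = tdml.DataFrame("my_source_table")               # hypothetical wide source table
    df_prepared, volatile_table_name, features_infos = prepare_feature_ingestion(
        df,
        entity_id={"customer_id": "BIGINT"},             # dict keys define the entity columns
        feature_names=["age", "income"],
        feature_versions={"age": "V1", "income": "V1"},  # one entry per feature
    )
    # df_prepared    : teradataml DataFrame over the volatile staging table
    # features_infos : FEATURE_ID / FEATURE_NAME / FEATURE_TABLE / FEATURE_DATABASE / FEATURE_VERSION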
374
- def apply_collect_stats(entity_id, primary_index, partitioning, feature_infos):
375
- """
376
- Applies a collect statistics operation on target tables grouped by feature table and database.
377
-
378
- This function performs the following steps:
379
- 1. Sorts the `entity_id`.
380
- 2. Groups the feature information by feature table and database to count occurrences.
381
- 3. Generates collect statistics queries.
382
- 4. Executes the queries on the target tables while recording the execution time.
383
- 5. Logs the elapsed time if logging is enabled.
384
-
385
- Args:
386
- entity_id (list): A list of entity IDs to process.
387
- primary_index (str): The primary index to use in the collect statistics query.
388
- partitioning (str): Partitioning information for the query.
389
- feature_infos (pd.DataFrame): A DataFrame containing feature information,
390
- including columns 'FEATURE_TABLE', 'FEATURE_DATABASE', and 'FEATURE_ID'.
346
+ import time
347
+ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
391
348
 
392
- Returns:
393
- None
349
+ import pandas as pd
350
+
351
+ def apply_collect_stats(
352
+ entity_id: Mapping[str, Any] | Iterable[str],
353
+ primary_index: Optional[str],
354
+ partitioning: Optional[str],
355
+ feature_infos: pd.DataFrame,
356
+ ) -> Dict[str, Any]:
394
357
  """
395
- # Sort the entity IDs to ensure consistent ordering.
396
- sorted_entity_id = list(entity_id.keys())
397
- sorted_entity_id.sort()
358
+ Run COLLECT STATS on all target feature tables, with fallbacks and timing.
359
+
360
+ Steps:
361
+ 1) Determine a stable ordering of entity IDs (for deterministic query gen).
362
+ 2) Group `feature_infos` by FEATURE_DATABASE + FEATURE_TABLE to get unique targets.
363
+ 3) Generate COLLECT STATS statements via `generate_collect_stats(...)` for fallback use.
364
+ 4) For each target table:
365
+ - Try a simple `COLLECT STATS ON <db>.<table>`.
366
+ - On failure, retry with generated statements (and optional extension).
367
+ 5) Log a compact summary (counts + total duration) and return it as a dict.
368
+
369
+ Parameters
370
+ ----------
371
+ entity_id : Mapping[str, Any] | Iterable[str]
372
+ Entity identifiers used to parameterize collect-stat statements.
373
+ If a mapping (e.g., dict), its *keys* are used and sorted.
374
+ If an iterable (e.g., list/tuple), it’s sorted directly.
375
+ primary_index : Optional[str]
376
+ Primary index used by `generate_collect_stats` (may be None).
377
+ partitioning : Optional[str]
378
+ Partitioning clause used by `generate_collect_stats` (may be None).
379
+ feature_infos : pd.DataFrame
380
+ Must contain columns: 'FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID'.
381
+
382
+ Returns
383
+ -------
384
+ Dict[str, Any]
385
+ Summary with keys:
386
+ - total_tables: int
387
+ - ok: int
388
+ - retried: int
389
+ - failed: int
390
+ - duration_seconds: float
391
+ - duration_hms: str
392
+ - details: list[dict] # per-table status entries
393
+ """
394
+ # --- Validate inputs -----------------------------------------------------
395
+ required_cols = {"FEATURE_TABLE", "FEATURE_DATABASE", "FEATURE_ID"}
396
+ missing = required_cols.difference(feature_infos.columns)
397
+ if missing:
398
+ raise ValueError(f"feature_infos is missing required columns: {sorted(missing)}")
399
+
400
+ # --- Normalize & sort entity IDs ----------------------------------------
401
+ if hasattr(entity_id, "keys"):
402
+ sorted_entity_ids = sorted(list(entity_id.keys()))
403
+ else:
404
+ sorted_entity_ids = sorted(list(entity_id))
405
+
406
+ # --- Group to unique targets --------------------------------------------
407
+ target_tables = (
408
+ feature_infos[["FEATURE_TABLE", "FEATURE_DATABASE", "FEATURE_ID"]]
409
+ .groupby(["FEATURE_TABLE", "FEATURE_DATABASE"])
410
+ .count()
411
+ .reset_index()
412
+ )
398
413
 
399
- # Group the target tables by 'FEATURE_TABLE' and 'FEATURE_DATABASE' and count occurrences.
400
- target_tables = feature_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID']].groupby(
401
- ['FEATURE_TABLE', 'FEATURE_DATABASE']
402
- ).count().reset_index()
414
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
415
+ logger_safe(
416
+ "debug",
417
+ "collect_stats.targets | count=%s | tables=%s",
418
+ len(target_tables),
419
+ target_tables[["FEATURE_DATABASE", "FEATURE_TABLE"]].to_dict(orient="records"),
420
+ )
403
421
 
404
- # Generate the collect statistics query and its optional extension.
422
+ # --- Prepare statements --------------------------------------------------
405
423
  query_collect_stats, query_collect_stats_extension = generate_collect_stats(
406
- sorted_entity_id,
424
+ sorted_entity_ids,
407
425
  primary_index=primary_index,
408
- partitioning=partitioning
426
+ partitioning=partitioning,
409
427
  )
410
428
 
411
- # Record the start time for measuring query execution duration.
412
- start_time = time.time()
413
-
414
- # Loop through the grouped target tables and execute the queries.
415
- for i, row in target_tables.iterrows():
416
- # Execute the main collect statistics query.
417
- execute_query(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
429
+ # --- Execute -------------------------------------------------------------
430
+ started = time.perf_counter()
431
+ results: list[Dict[str, Any]] = []
432
+
433
+ ok = retried = failed = 0
434
+
435
+ for _, row in target_tables.iterrows():
436
+ db = row["FEATURE_DATABASE"]
437
+ tbl = row["FEATURE_TABLE"]
438
+ table_fqn = f"{db}.{tbl}"
439
+
440
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
441
+ logger_safe("debug", "collect_stats.run | table=%s", table_fqn)
442
+
443
+ t0 = time.perf_counter()
444
+ status = "ok"
445
+ error_short = None
446
+ retried_flag = False
447
+
448
+ try:
449
+ execute_query(f"COLLECT STATS ON {table_fqn}")
450
+ ok += 1
451
+ except Exception as e:
452
+ # First attempt failed; try generated statement(s)
453
+ error_short = str(e).split("\n")[0]
454
+ logger_safe("warning", "collect_stats.initial_fail | table=%s | err=%s", table_fqn, error_short)
455
+
456
+ try:
457
+ execute_query(query_collect_stats + f" ON {table_fqn}")
458
+ retried_flag = True
459
+ retried += 1
460
+
461
+ if query_collect_stats_extension is not None:
462
+ execute_query(query_collect_stats_extension + f" ON {table_fqn}")
463
+ except Exception as e2:
464
+ status = "failed"
465
+ error_short = str(e2).split("\n")[0]
466
+ failed += 1
467
+ logger_safe("error", "collect_stats.retry_fail | table=%s | err=%s", table_fqn, error_short)
468
+
469
+ dt = time.perf_counter() - t0
470
+ results.append(
471
+ {
472
+ "table": table_fqn,
473
+ "status": status,
474
+ "retried": retried_flag,
475
+ "elapsed_s": dt,
476
+ "error": error_short,
477
+ }
478
+ )
479
+
480
+ # --- Final summary -------------------------------------------------------
481
+ elapsed = time.perf_counter() - started
482
+ formatted = seconds_to_dhms(elapsed)
483
+
484
+ # Structured, parseable one-liner
485
+ logger_safe(
486
+ "info",
487
+ "collect_stats.summary | tables=%d | ok=%d | retried=%d | failed=%d | duration=%s (%.3fs)",
488
+ len(target_tables),
489
+ ok,
490
+ retried,
491
+ failed,
492
+ formatted,
493
+ elapsed,
494
+ )
418
495
 
419
- # If an extension query exists, execute it as well.
420
- if query_collect_stats_extension is not None:
421
- execute_query(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
496
+ return {
497
+ "total_tables": int(len(target_tables)),
498
+ "ok": int(ok),
499
+ "retried": int(retried),
500
+ "failed": int(failed),
501
+ "duration_seconds": float(elapsed),
502
+ "duration_hms": formatted,
503
+ "details": results,
504
+ }
422
505
 
423
- # Record the end time after query execution.
424
- end_time = time.time()
425
506
 
426
- # Calculate the elapsed time in seconds and format it into a human-readable format.
427
- elapsed_time = end_time - start_time
428
- formatted_elapsed_time = seconds_to_dhms(elapsed_time)
429
-
430
- # Log the execution time if logging is enabled.
431
- if tdfs4ds.DISPLAY_LOGS:
432
- print(f'Storage of the prepared features - collect stats only : {formatted_elapsed_time} ({elapsed_time}s)')
433
507
 
434
508
 
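A hedged usage sketch of the rewritten apply_collect_stats. It assumes a live Teradata connection and existing feature tables; the database and table names below are hypothetical, and the summary keys are those documented in the docstring above.

    import pandas as pd

    feature_infos = pd.DataFrame({
        "FEATURE_DATABASE": ["FS_DB", "FS_DB"],                        # hypothetical database
        "FEATURE_TABLE": ["FS_CUSTOMER_FEATURES", "FS_CUSTOMER_FEATURES"],
        "FEATURE_ID": [101, 102],
    })

    summary = apply_collect_stats(
        entity_id={"customer_id": "BIGINT"},       # dict keys are sorted and used for statement generation
        primary_index=None,
        partitioning=None,
        feature_infos=feature_infos,
    )
    print(summary["total_tables"], summary["ok"], summary["retried"], summary["failed"], summary["duration_hms"])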
435
509
  def _store_feature_update_insert(entity_id, volatile_table_name, entity_null_substitute={},primary_index=None,
@@ -627,9 +701,8 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
627
701
  >>> store_feature(entity_id_dict, prepared_features)
628
702
  """
629
703
 
630
- #feature_catalog = tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.FEATURE_CATALOG_NAME))
631
-
632
- if tdfs4ds.FEATURE_STORE_TIME == None:
704
+ # VALIDTIME handling
705
+ if tdfs4ds.FEATURE_STORE_TIME is None:
633
706
  validtime_statement = 'CURRENT VALIDTIME'
634
707
  validtime_statement2 = validtime_statement
635
708
  validtime_start = 'CAST(CURRENT_TIME AS TIMESTAMP(0) WITH TIME ZONE)'
@@ -638,180 +711,155 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
638
711
  validtime_statement2 = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
639
712
  validtime_start = f"CAST('{tdfs4ds.FEATURE_STORE_TIME}' AS TIMESTAMP(0) WITH TIME ZONE)"
640
713
 
641
- if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED':
642
- end_period_ = '9999-01-01 00:00:00'
643
- else:
644
- end_period_ = tdfs4ds.END_PERIOD
714
+ end_period_ = '9999-01-01 00:00:00' if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED' else tdfs4ds.END_PERIOD
645
715
 
646
- if tdfs4ds.DEBUG_MODE:
647
- print('tdfs4ds.FEATURE_STORE_TIME :' , tdfs4ds.FEATURE_STORE_TIME)
716
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
717
+ logger_safe("debug", "FEATURE_STORE_TIME=%s | END_PERIOD=%s", tdfs4ds.FEATURE_STORE_TIME, tdfs4ds.END_PERIOD)
718
+ logger_safe("debug", "entity_id=%s", entity_id)
648
719
 
720
+ # Entity id helpers
721
+ sorted_entity_id = sorted(list(entity_id.keys()))
722
+ ENTITY_ID = ','.join(sorted_entity_id)
649
723
 
650
- if tdfs4ds.DEBUG_MODE:
651
- print('entity_id :' , entity_id)
652
-
653
- sorted_entity_id = list(entity_id.keys())
654
- sorted_entity_id.sort()
655
- ENTITY_ID = ','.join([k for k in sorted_entity_id])
656
-
657
- count_features = pd.DataFrame(tdml.execute_sql(f"""
658
- SEL count(*) as NB_ROWS FROM
659
- {_get_database_username()}.
660
- {volatile_table_name}
661
- """).fetchall(), columns = ['NB_ROWS'])
662
-
663
- if tdfs4ds.DEBUG_MODE:
664
- print('count_features :' , count_features)
665
- print('features_infos :', features_infos)
724
+ # Count rows in volatile table
725
+ count_features = pd.DataFrame(
726
+ tdml.execute_sql(
727
+ f"""
728
+ SEL count(*) as NB_ROWS
729
+ FROM {_get_database_username()}.{volatile_table_name}
730
+ """
731
+ ).fetchall(),
732
+ columns=['NB_ROWS']
733
+ )
666
734
 
735
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
736
+ logger_safe("debug", "count_features=%s", count_features)
737
+ logger_safe("debug", "features_infos initial=%s", features_infos)
667
738
 
668
739
  if count_features.shape[0] > 0:
669
740
  features_infos['NB_ROWS'] = count_features['NB_ROWS'].values[0]
670
741
  else:
671
742
  features_infos['NB_ROWS'] = 0
672
743
 
673
- if tdfs4ds.DEBUG_MODE:
674
- print('features_infos :' , features_infos)
675
- # Group the target tables by feature table and feature database and count the number of occurrences
676
- target_tables = features_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'NB_ROWS']].groupby(
677
- ['FEATURE_TABLE', 'FEATURE_DATABASE']).sum().reset_index()
678
-
679
- if tdfs4ds.DEBUG_MODE:
680
- print('target_tables :' , target_tables)
681
- if tdfs4ds.DISPLAY_LOGS:
682
- display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
744
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
745
+ logger_safe("debug", "features_infos updated=%s", features_infos)
683
746
 
747
+ # Compute target tables
748
+ target_tables = features_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'NB_ROWS']].groupby(
749
+ ['FEATURE_TABLE', 'FEATURE_DATABASE']
750
+ ).sum().reset_index()
684
751
 
752
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
753
+ logger_safe("debug", "target_tables=%s", target_tables)
685
754
 
686
- sorted_entity_id = list(entity_id.keys())
687
- sorted_entity_id.sort()
755
+ # Optional display (keep existing UX semantics)
756
+ if getattr(tdfs4ds, "DISPLAY_LOGS", False):
757
+ try:
758
+ display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
759
+ except Exception as e:
760
+ logger_safe("warning", "display_table failed: %s", str(e).split('\n')[0])
688
761
 
689
762
  ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k in sorted_entity_id])
690
-
691
763
  ENTITY_ID_SELECT = ', \n'.join(['NEW_FEATURES.' + k for k in sorted_entity_id])
692
- # Iterate over target tables and perform update and insert operations
693
-
694
-
695
- #query_collect_stats, query_collect_stats_extension = generate_collect_stats(sorted_entity_id,primary_index=primary_index, partitioning=partitioning)
696
-
697
764
 
698
765
  queries = []
699
- for i, row in features_infos.iterrows():
700
-
701
- features_infos_ = features_infos[(features_infos.FEATURE_TABLE == row['FEATURE_TABLE']) & (features_infos.FEATURE_DATABASE == row['FEATURE_DATABASE'])]
702
- feature_id_list = ','.join([str(x) for x in list(set(features_infos_.FEATURE_ID.values))])
703
- feature_version_list = ','.join(["'"+x+"'" for x in list(set(features_infos_.FEATURE_VERSION.values))])
704
-
705
-
706
- nested_query = f"SEL * FROM {_get_database_username()}.{volatile_table_name} WHERE FEATURE_ID IN ({feature_id_list})"
766
+ for _, row in features_infos.iterrows():
767
+ features_infos_ = features_infos[
768
+ (features_infos.FEATURE_TABLE == row['FEATURE_TABLE']) &
769
+ (features_infos.FEATURE_DATABASE == row['FEATURE_DATABASE'])
770
+ ]
771
+ feature_id_list = ','.join([str(x) for x in sorted(set(features_infos_.FEATURE_ID.values))])
772
+ feature_version_list = ','.join(["'" + x + "'" for x in sorted(set(features_infos_.FEATURE_VERSION.values))])
773
+
774
+ # Build nested query
707
775
  nested_query = f"""
708
776
  SEL
709
- {ENTITY_ID}
710
- , {row['FEATURE_ID']} AS FEATURE_ID
711
- , {row['FEATURE_NAME']} AS FEATURE_VALUE
712
- , '{row['FEATURE_VERSION']}' AS FEATURE_VERSION
713
- FROM {_get_database_username()}.{volatile_table_name}
777
+ {ENTITY_ID}
778
+ , {row['FEATURE_ID']} AS FEATURE_ID
779
+ , {row['FEATURE_NAME']} AS FEATURE_VALUE
780
+ , '{row['FEATURE_VERSION']}' AS FEATURE_VERSION
781
+ FROM {_get_database_username()}.{volatile_table_name}
714
782
  """
715
783
 
716
- if tdfs4ds.FEATURE_STORE_TIME == None:
784
+ if tdfs4ds.FEATURE_STORE_TIME is None:
717
785
  query_merge = f"""
718
786
  {validtime_statement}
719
- MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
720
-
787
+ MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
721
788
  USING ( {nested_query} ) NEW_FEATURES
722
- ON {ENTITY_ID_ON}
789
+ ON {ENTITY_ID_ON}
723
790
  AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
724
791
  AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
725
792
  AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
726
793
  AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
727
794
  AND EXISTING_FEATURES.FEATURE_VERSION IN ({feature_version_list})
728
795
  WHEN MATCHED THEN
729
- UPDATE
730
- SET
731
- FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
796
+ UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
732
797
  WHEN NOT MATCHED THEN
733
798
  INSERT
734
- ({ENTITY_ID_SELECT},
735
- NEW_FEATURES.FEATURE_ID,
736
- NEW_FEATURES.FEATURE_VALUE,
737
- NEW_FEATURES.FEATURE_VERSION)
738
- --,
739
- --{validtime_start},
740
- --'{end_period_}')
799
+ (
800
+ {ENTITY_ID_SELECT},
801
+ NEW_FEATURES.FEATURE_ID,
802
+ NEW_FEATURES.FEATURE_VALUE,
803
+ NEW_FEATURES.FEATURE_VERSION
804
+ )
741
805
  """
742
806
  else:
743
807
  query_merge = f"""
744
808
  {validtime_statement}
745
- MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
809
+ MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
746
810
  USING ( {nested_query} ) NEW_FEATURES
747
- ON {ENTITY_ID_ON}
811
+ ON {ENTITY_ID_ON}
748
812
  AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
749
- AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
750
- AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
751
- AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
813
+ AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
814
+ AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
815
+ AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
752
816
  AND EXISTING_FEATURES.FEATURE_VERSION IN ({feature_version_list})
753
817
  WHEN MATCHED THEN
754
- UPDATE
755
- SET
756
- FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
818
+ UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
757
819
  WHEN NOT MATCHED THEN
758
820
  INSERT
759
- ({ENTITY_ID_SELECT},
760
- NEW_FEATURES.FEATURE_ID,
761
- NEW_FEATURES.FEATURE_VALUE,
762
- NEW_FEATURES.FEATURE_VERSION,
763
- {validtime_start},
764
- '{end_period_}')
821
+ (
822
+ {ENTITY_ID_SELECT},
823
+ NEW_FEATURES.FEATURE_ID,
824
+ NEW_FEATURES.FEATURE_VALUE,
825
+ NEW_FEATURES.FEATURE_VERSION,
826
+ {validtime_start},
827
+ '{end_period_}'
828
+ )
765
829
  """
766
830
 
767
- entity_id_str = ', \n'.join([k for k in sorted_entity_id])
768
- if tdfs4ds.DEBUG_MODE: print(
769
- f'merge feature values of new {entity_id_str} combinations in {row.iloc[1]}.{row.iloc[0]}')
770
- if tdfs4ds.DEBUG_MODE:
771
- print(query_merge)
831
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
832
+ entity_id_str = ', '.join(sorted_entity_id)
833
+ logger_safe(
834
+ "debug",
835
+ "Merging feature values for entity keys (%s) into %s.%s",
836
+ entity_id_str, row['FEATURE_DATABASE'], row['FEATURE_TABLE']
837
+ )
838
+ logger_safe("debug", "Query (truncated): %s", "\n".join(query_merge.splitlines()[:12]) + "\n...")
772
839
 
773
840
  queries.append(query_merge)
774
841
 
775
- query_merge = '; \n'.join(queries)
776
842
  try:
777
- # Record the end time
778
843
  start_time = time.time()
779
844
 
780
845
  for q in queries:
781
- if tdfs4ds.DEBUG_MODE:
782
- print(q.split('\n')[0:3])
783
- # Execute the SQL query to create the volatile table.
846
+ if getattr(tdfs4ds, "DEBUG_MODE", False):
847
+ logger_safe("debug", "Executing merge (head): %s", "\n".join(q.split('\n')[0:3]))
784
848
  execute_query(q)
785
- #execute_query(query_merge)
786
- # Record the end time
787
- end_time = time.time()
788
849
 
789
- # Calculate the elapsed time in seconds
790
- elapsed_time = end_time - start_time
850
+ elapsed_time = time.time() - start_time
791
851
  formatted_elapsed_time = seconds_to_dhms(elapsed_time)
792
- if tdfs4ds.DISPLAY_LOGS:
793
- print(f'Storage of the prepared features - merge only : {formatted_elapsed_time} ({elapsed_time}s)')
852
+ logger_safe(
853
+ "info",
854
+ "Storage of prepared features (merge-only) completed in %s (%.3fs)",
855
+ formatted_elapsed_time, elapsed_time
856
+ )
794
857
  except Exception as e:
795
- print(str(e))
858
+ logger_safe("exception", "Feature storage (merge) failed: %s", str(e).split('\n')[0])
796
859
  raise
797
860
 
798
- # # Record the end time
799
- # start_time = time.time()
800
- # for i, row in features_infos.iterrows():
801
- # execute_query(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
802
- # #print(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
803
- # if query_collect_stats_extension is not None:
804
- # execute_query(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
805
- # #print(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
806
- # # Record the end time
807
- # end_time = time.time()
808
- #
809
- # # Calculate the elapsed time in seconds
810
- # elapsed_time = end_time - start_time
811
- # formatted_elapsed_time = seconds_to_dhms(elapsed_time)
812
- # if tdfs4ds.DISPLAY_LOGS:
813
- # print(f'Storage of the prepared features - collect stats only : {formatted_elapsed_time} ({elapsed_time}s)')
814
861
  return
862
+
815
863
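To make the merge path concrete, here is roughly what one generated statement looks like for a single entity key and a single feature under the CURRENT VALIDTIME branch above. Database, table, column, and version names are hypothetical, and the text is an abridged rendering of the template above rather than output captured from an actual run.

    # Hypothetical rendering of one query_merge for entity key customer_id,
    # feature column age (FEATURE_ID 101, version 'V1'), CURRENT VALIDTIME branch.
    example_query_merge = """
    CURRENT VALIDTIME
    MERGE INTO FS_DB.FS_CUSTOMER_FEATURES EXISTING_FEATURES
    USING (
        SEL customer_id
            , 101 AS FEATURE_ID
            , age AS FEATURE_VALUE
            , 'V1' AS FEATURE_VERSION
        FROM USER_DB.temp_staging_table   -- placeholder for <user database>.<volatile table>
    ) NEW_FEATURES
    ON NEW_FEATURES.customer_id = EXISTING_FEATURES.customer_id
       AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
       AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
       AND NEW_FEATURES.FEATURE_ID IN (101)
       AND EXISTING_FEATURES.FEATURE_ID IN (101)
       AND EXISTING_FEATURES.FEATURE_VERSION IN ('V1')
    WHEN MATCHED THEN
        UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
    WHEN NOT MATCHED THEN
        INSERT (NEW_FEATURES.customer_id,
                NEW_FEATURES.FEATURE_ID,
                NEW_FEATURES.FEATURE_VALUE,
                NEW_FEATURES.FEATURE_VERSION)
    """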
  def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},primary_index=None,
816
864
  partitioning='', features_infos = None, **kwargs):
817
865
  """
@@ -832,27 +880,47 @@ def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},pr
832
880
  >>> store_feature(entity_id_dict, prepared_features)
833
881
  """
834
882
 
835
- # Record the start time
836
883
  start_time = time.time()
837
884
 
885
+ # Choose storage strategy
838
886
  if tdfs4ds.STORE_FEATURE == 'UPDATE_INSERT':
839
- _store_feature_update_insert(entity_id, volatile_table_name, entity_null_substitute=entity_null_substitute,primary_index=primary_index,
840
- partitioning=partitioning, features_infos=features_infos, **kwargs)
887
+ logger_safe("info", "Storing features using UPDATE/INSERT strategy.")
888
+ _store_feature_update_insert(
889
+ entity_id,
890
+ volatile_table_name,
891
+ entity_null_substitute=entity_null_substitute,
892
+ primary_index=primary_index,
893
+ partitioning=partitioning,
894
+ features_infos=features_infos,
895
+ **kwargs
896
+ )
841
897
  elif tdfs4ds.STORE_FEATURE == 'MERGE':
842
- _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=entity_null_substitute,primary_index=primary_index,
843
- partitioning=partitioning, features_infos=features_infos, **kwargs)
898
+ logger_safe("info", "Storing features using MERGE strategy.")
899
+ _store_feature_merge(
900
+ entity_id,
901
+ volatile_table_name,
902
+ entity_null_substitute=entity_null_substitute,
903
+ primary_index=primary_index,
904
+ partitioning=partitioning,
905
+ features_infos=features_infos,
906
+ **kwargs
907
+ )
844
908
  else:
845
- # Handle other conditions or operations as required
846
- pass
847
-
848
- # Record the end time
849
- end_time = time.time()
850
-
851
- # Calculate the elapsed time in seconds
852
- elapsed_time = end_time - start_time
909
+ logger_safe(
910
+ "warning",
911
+ "Unknown STORE_FEATURE strategy '%s'. No storage operation was performed.",
912
+ tdfs4ds.STORE_FEATURE
913
+ )
914
+
915
+ # Log duration
916
+ elapsed_time = time.time() - start_time
853
917
  formatted_elapsed_time = seconds_to_dhms(elapsed_time)
854
- if tdfs4ds.DISPLAY_LOGS:
855
- print(f'Storage of the prepared features : {formatted_elapsed_time} ({elapsed_time}s)')
918
+ logger_safe(
919
+ "info",
920
+ "Storage of prepared features completed in %s (%.3fs)",
921
+ formatted_elapsed_time,
922
+ elapsed_time
923
+ )
856
924
 
857
925
  def prepare_feature_ingestion_tdstone2(df, entity_id):
858
926
  """