tdfs4ds 0.2.4.33__py3-none-any.whl → 0.2.4.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,5 +1,6 @@
- __version__ = '0.2.4.33'
+ __version__ = '0.2.4.35'
  import logging
+
  # Setup the logger
  logging.basicConfig(
      level=logging.INFO,
@@ -66,7 +67,7 @@ import tdfs4ds.datasets
  import time
 
  import inspect
- import tqdm
+ from tqdm.auto import tqdm  # auto picks the right frontend (notebook/terminal)
 
  from tdfs4ds.feature_store.feature_data_processing import generate_on_clause
 
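A note on the import swap above: `tqdm.auto` resolves to the notebook widget under Jupyter and to the plain terminal bar otherwise, so the same call sites work in both frontends. A minimal sketch (assumes tqdm is installed; not part of the package):

```python
from tqdm.auto import tqdm  # resolves to the notebook or terminal implementation at import time
import time

for _ in tqdm(range(3), desc="demo", unit="item"):
    time.sleep(0.1)  # stand-in for real work
```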
@@ -671,8 +672,29 @@ def _upload_features(
  else:
      logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
      something_computed = False
-     for i in range(filtermanager.nb_filters):
-         filtermanager.update(i + 1)
+     pbar = tqdm(
+         range(filtermanager.nb_filters),
+         total=filtermanager.nb_filters,
+         desc="Applying filters",
+         unit="filter",
+         leave=False
+     )
+     for i in pbar:
+         filter_id = i + 1
+         filtermanager.update(filter_id)
+
+         # show which filter is being applied in the bar
+         try:
+             tqdm.write(f"Applying filter {filter_id}/{filtermanager.nb_filters}")
+             # If display() returns a long string, shorten it for the postfix:
+             bar_info = str(filtermanager.display())
+             if len(bar_info) > 80:
+                 bar_info = bar_info[:77] + "..."
+             pbar.set_postfix_str(bar_info)
+         except Exception:
+             # postfix is optional; ignore errors from display() here
+             pass
+
          logger_safe("debug", "Applying filter %s/%s:\n%s",
                      i + 1, filtermanager.nb_filters, filtermanager.display())
 
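For reference, the two tqdm facilities the loop above leans on, in isolation: `tqdm.write` prints above the bar without corrupting it, and `set_postfix_str` annotates the bar itself. A self-contained sketch (the filter values are made up):

```python
from tqdm.auto import tqdm
import time

filters = ["2024-01-31", "2024-02-29", "2024-03-31"]
pbar = tqdm(filters, desc="Applying filters", unit="filter", leave=False)
for name in pbar:
    tqdm.write(f"Applying filter {name}")  # logged above the bar
    pbar.set_postfix_str(name)             # short status appended to the bar
    time.sleep(0.2)                        # stand-in for the actual filter work
```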
@@ -685,38 +706,40 @@ def _upload_features(
 
          if do_compute or force_compute:
              tdfs4ds.process_store.process_followup.followup_open(
-                 run_id=tdfs4ds.RUN_ID,
-                 process_type=tdfs4ds.PROCESS_TYPE,
-                 process_id=process_id,
-                 filtermanager=filtermanager
+                 run_id = tdfs4ds.RUN_ID,
+                 process_type = tdfs4ds.PROCESS_TYPE,
+                 process_id = process_id,
+                 filtermanager = filtermanager
              )
              try:
                  prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
                      df, entity_id, feature_names,
-                     feature_versions=selected_features,
-                     primary_index=primary_index,
-                     entity_null_substitute=entity_null_substitute,
-                     partitioning=partitioning
+                     feature_versions = selected_features,
+                     primary_index = primary_index,
+                     entity_null_substitute = entity_null_substitute,
+                     partitioning = partitioning
                  )
+
                  store_feature(entity_id, volatile_table, entity_null_substitute,
                                primary_index, partitioning, features_infos)
+
                  something_computed = True
 
                  tdfs4ds.process_store.process_followup.followup_close(
-                     run_id=tdfs4ds.RUN_ID,
-                     process_type=tdfs4ds.PROCESS_TYPE,
-                     process_id=process_id,
-                     filtermanager=filtermanager
+                     run_id = tdfs4ds.RUN_ID,
+                     process_type = tdfs4ds.PROCESS_TYPE,
+                     process_id = process_id,
+                     filtermanager = filtermanager
                  )
 
              except Exception as e:
                  logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
                  tdfs4ds.process_store.process_followup.followup_close(
-                     run_id=tdfs4ds.RUN_ID,
-                     process_type=tdfs4ds.PROCESS_TYPE,
-                     process_id=process_id,
-                     status='FAILED,' + str(e).split('\n')[0],
-                     filtermanager=filtermanager
+                     run_id = tdfs4ds.RUN_ID,
+                     process_type = tdfs4ds.PROCESS_TYPE,
+                     process_id = process_id,
+                     status = 'FAILED,' + str(e).split('\n')[0],
+                     filtermanager = filtermanager
                  )
                  raise
 
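The hunk above wraps each filter iteration in an open/close follow-up record, closing with a `FAILED,<first line of error>` status on exceptions. A self-contained sketch of that bookkeeping pattern; the stub functions below only mimic `process_store.process_followup`:

```python
import uuid

def followup_open(run_id, process_id):
    print(f"open  | run={run_id} | proc={process_id}")

def followup_close(run_id, process_id, status="DONE"):
    print(f"close | run={run_id} | proc={process_id} | status={status}")

def run_with_followup(process_id, work):
    run_id = str(uuid.uuid4())
    followup_open(run_id, process_id)
    try:
        work()
        followup_close(run_id, process_id)
    except Exception as e:
        # keep only the first line of the error message, as the code above does
        followup_close(run_id, process_id, status="FAILED," + str(e).split("\n")[0])
        raise

run_with_followup("proc_42", lambda: print("computing features..."))
```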
@@ -1188,9 +1211,6 @@ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
      >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)
      """
 
-     #global DISPLAY_LOGS
-     #global FEATURE_STORE_TIME
-
      # Disable display logs
      temp_DISPLAY_LOGS = tdfs4ds.DISPLAY_LOGS
      tdfs4ds.DISPLAY_LOGS = False
@@ -1198,40 +1218,43 @@ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
      tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
      tdfs4ds.RUN_ID = str(uuid.uuid4())
 
-
-
      try:
+         # Define range of time steps
          if time_id_end is None:
-             pbar = tqdm.tqdm(range(time_id_start, time_manager.nb_time_steps + 1), desc="Starting")
+             time_range = range(time_id_start, time_manager.nb_time_steps + 1)
          else:
-             pbar = tqdm.tqdm(range(time_id_start, min([time_manager.nb_time_steps + 1,time_id_end+1]) ), desc="Starting")
-         # Iterate over each date in the provided list
+             time_range = range(time_id_start, min(time_manager.nb_time_steps + 1, time_id_end + 1))
+
+         # Progress bar
+         pbar = tqdm(time_range, desc="Starting rollout", unit="step")
+
          for i in pbar:
-             # Update the time manager with the new date
-             time_manager.update(time_id = i )
+             # Update time manager
+             time_manager.update(time_id=i)
              date_ = str(time_manager.display()['BUSINESS_DATE'].values[0])
-             pbar.set_description(f"Processing {date_}")
-             # Synchronize the time for the feature store with the current date
+
+             # Sync feature store time
              tdfs4ds.FEATURE_STORE_TIME = time_manager.get_date_in_the_past()
-             pbar.set_description(f"Processing {tdfs4ds.FEATURE_STORE_TIME}")
+
+             # Display current progress in tqdm
+             pbar.set_postfix(time=date_, feature_time=tdfs4ds.FEATURE_STORE_TIME)
+
              if tdfs4ds.DEBUG_MODE:
-                 print('def roll_out','date_', date_)
-                 print('def roll_out','time_manager.get_date_in_the_past()', time_manager.get_date_in_the_past())
-                 print('def roll_out','tdfs4ds.FEATURE_STORE_TIME', tdfs4ds.FEATURE_STORE_TIME)
-             # Execute each process in the process list for the current date
+                 print("roll_out | date_:", date_)
+                 print("roll_out | feature_store_time:", tdfs4ds.FEATURE_STORE_TIME)
+
+             # Execute all processes for this time step
              for proc_id in process_list:
-                 pbar.set_description(f"Processing {date_} process {proc_id}")
+                 pbar.set_description(f"Processing {date_} | proc {proc_id}")
                  run(process_id=proc_id, force_compute=False)
 
+         # Restore settings
          tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
+
      except Exception as e:
          tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
-         # If an exception occurs, print the date and the first line of the exception message
-         #print(date_)
          print(str(e).split('\n')[0])
          tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
          raise
 
-     tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
-
-
+     tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
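The rewritten loop separates range construction from the bar and moves per-step details into the postfix. A runnable sketch of that shape, with made-up step counts and dates:

```python
from tqdm.auto import tqdm

nb_time_steps, time_id_start, time_id_end = 10, 1, 5
# clamp the end of the range to the available number of time steps
time_range = range(time_id_start, min(nb_time_steps + 1, time_id_end + 1))

pbar = tqdm(time_range, desc="Starting rollout", unit="step")
for i in pbar:
    date_ = f"2024-01-{i:02d}"                        # placeholder for time_manager.display()
    pbar.set_postfix(time=date_, feature_time=date_)  # details live in the postfix
    pbar.set_description(f"Processing {date_} | proc my_proc")
```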
tdfs4ds/feature_store/feature_data_processing.py CHANGED
@@ -75,7 +75,7 @@ def generate_collect_stats(entity_id, primary_index='', partitioning=''):
 
      # Initialize the extended query with sampling and threshold settings for statistics collection
      query_extension_header = 'COLLECT STATISTICS USING SAMPLE 25 PERCENT AND THRESHOLD 15 PERCENT'
-     query_extension = []
+     query_extension = []
 
      # Add primary index columns to the extended query
      if primary_index:
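Given that the callers append `ON <db>.<table>` to the generated text, the statement assembled here has roughly the following shape. The column list below is hypothetical and only illustrates the concatenation, not the exact clause layout `generate_collect_stats` produces:

```python
header = "COLLECT STATISTICS USING SAMPLE 25 PERCENT AND THRESHOLD 15 PERCENT"
columns = ["CUSTOMER_ID", "BUSINESS_DATE"]  # e.g. primary index + partitioning columns

# one COLUMN clause per column, then the caller appends the ON clause
stmt = header + " " + ", ".join(f"COLUMN ({c})" for c in columns)
print(stmt + " ON my_db.my_feature_table")
```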
@@ -343,67 +343,167 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
      # return None, None, None
 
 
- def apply_collect_stats(entity_id, primary_index, partitioning, feature_infos):
-     """
-     Applies a collect statistics operation on target tables grouped by feature table and database.
-
-     This function performs the following steps:
-     1. Sorts the `entity_id`.
-     2. Groups the feature information by feature table and database to count occurrences.
-     3. Generates collect statistics queries.
-     4. Executes the queries on the target tables while recording the execution time.
-     5. Logs the elapsed time if logging is enabled.
-
-     Args:
-         entity_id (list): A list of entity IDs to process.
-         primary_index (str): The primary index to use in the collect statistics query.
-         partitioning (str): Partitioning information for the query.
-         feature_infos (pd.DataFrame): A DataFrame containing feature information,
-                                       including columns 'FEATURE_TABLE', 'FEATURE_DATABASE', and 'FEATURE_ID'.
+ import time
+ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
 
-     Returns:
-         None
-     """
-     # Sort entity IDs for consistent ordering
-     sorted_entity_id = list(entity_id.keys())
-     sorted_entity_id.sort()
+ import pandas as pd
 
-     # Group target tables
-     target_tables = feature_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID']].groupby(
-         ['FEATURE_TABLE', 'FEATURE_DATABASE']
-     ).count().reset_index()
+ def apply_collect_stats(
+     entity_id: Mapping[str, Any] | Iterable[str],
+     primary_index: Optional[str],
+     partitioning: Optional[str],
+     feature_infos: pd.DataFrame,
+ ) -> Dict[str, Any]:
+     """
+     Run COLLECT STATS on all target feature tables, with fallbacks and timing.
+
+     Steps:
+     1) Determine a stable ordering of entity IDs (for deterministic query gen).
+     2) Group `feature_infos` by FEATURE_DATABASE + FEATURE_TABLE to get unique targets.
+     3) Generate COLLECT STATS statements via `generate_collect_stats(...)` for fallback use.
+     4) For each target table:
+        - Try a simple `COLLECT STATS ON <db>.<table>`.
+        - On failure, retry with generated statements (and optional extension).
+     5) Log a compact summary (counts + total duration) and return it as a dict.
+
+     Parameters
+     ----------
+     entity_id : Mapping[str, Any] | Iterable[str]
+         Entity identifiers used to parameterize collect-stat statements.
+         If a mapping (e.g., dict), its *keys* are used and sorted.
+         If an iterable (e.g., list/tuple), it’s sorted directly.
+     primary_index : Optional[str]
+         Primary index used by `generate_collect_stats` (may be None).
+     partitioning : Optional[str]
+         Partitioning clause used by `generate_collect_stats` (may be None).
+     feature_infos : pd.DataFrame
+         Must contain columns: 'FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID'.
+
+     Returns
+     -------
+     Dict[str, Any]
+         Summary with keys:
+         - total_tables: int
+         - ok: int
+         - retried: int
+         - failed: int
+         - duration_seconds: float
+         - duration_hms: str
+         - details: list[dict]  # per-table status entries
+     """
+     # --- Validate inputs -----------------------------------------------------
+     required_cols = {"FEATURE_TABLE", "FEATURE_DATABASE", "FEATURE_ID"}
+     missing = required_cols.difference(feature_infos.columns)
+     if missing:
+         raise ValueError(f"feature_infos is missing required columns: {sorted(missing)}")
+
+     # --- Normalize & sort entity IDs ----------------------------------------
+     if hasattr(entity_id, "keys"):
+         sorted_entity_ids = sorted(list(entity_id.keys()))
+     else:
+         sorted_entity_ids = sorted(list(entity_id))
+
+     # --- Group to unique targets --------------------------------------------
+     target_tables = (
+         feature_infos[["FEATURE_TABLE", "FEATURE_DATABASE", "FEATURE_ID"]]
+         .groupby(["FEATURE_TABLE", "FEATURE_DATABASE"])
+         .count()
+         .reset_index()
+     )
 
      if getattr(tdfs4ds, "DEBUG_MODE", False):
-         logger_safe("debug", "Target tables for COLLECT STATs: %s", target_tables[['FEATURE_DATABASE','FEATURE_TABLE']].to_dict(orient='records'))
+         logger_safe(
+             "debug",
+             "collect_stats.targets | count=%s | tables=%s",
+             len(target_tables),
+             target_tables[["FEATURE_DATABASE", "FEATURE_TABLE"]].to_dict(orient="records"),
+         )
 
-     # Generate COLLECT STATs queries
+     # --- Prepare statements --------------------------------------------------
      query_collect_stats, query_collect_stats_extension = generate_collect_stats(
-         sorted_entity_id,
+         sorted_entity_ids,
          primary_index=primary_index,
-         partitioning=partitioning
+         partitioning=partitioning,
      )
 
-     start_time = time.time()
+     # --- Execute -------------------------------------------------------------
+     started = time.perf_counter()
+     results: list[Dict[str, Any]] = []
+
+     ok = retried = failed = 0
 
-     # Execute COLLECT STATs
      for _, row in target_tables.iterrows():
-         table_fqn = f"{row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}"
+         db = row["FEATURE_DATABASE"]
+         tbl = row["FEATURE_TABLE"]
+         table_fqn = f"{db}.{tbl}"
+
          if getattr(tdfs4ds, "DEBUG_MODE", False):
-             logger_safe("debug", "Running COLLECT STATs on %s", table_fqn)
+             logger_safe("debug", "collect_stats.run | table=%s", table_fqn)
 
-         execute_query(query_collect_stats + f" ON {table_fqn}")
+         t0 = time.perf_counter()
+         status = "ok"
+         error_short = None
+         retried_flag = False
 
-         if query_collect_stats_extension is not None:
-             execute_query(query_collect_stats_extension + f" ON {table_fqn}")
+         try:
+             tdml.execute_sql(f"COLLECT STATS ON {table_fqn}")
+             ok += 1
+         except Exception as e:
+             # First attempt failed; try generated statement(s)
+             error_short = str(e).split("\n")[0]
+             logger_safe("warning", "collect_stats.initial_fail | table=%s | err=%s", table_fqn, error_short)
+
+             try:
+                 execute_query(query_collect_stats + f" ON {table_fqn}")
+                 retried_flag = True
+                 retried += 1
+
+                 if query_collect_stats_extension is not None:
+                     execute_query(query_collect_stats_extension + f" ON {table_fqn}")
+             except Exception as e2:
+                 status = "failed"
+                 error_short = str(e2).split("\n")[0]
+                 failed += 1
+                 logger_safe("error", "collect_stats.retry_fail | table=%s | err=%s", table_fqn, error_short)
+
+         dt = time.perf_counter() - t0
+         results.append(
+             {
+                 "table": table_fqn,
+                 "status": status,
+                 "retried": retried_flag,
+                 "elapsed_s": dt,
+                 "error": error_short,
+             }
+         )
 
-     elapsed_time = time.time() - start_time
-     formatted_elapsed_time = seconds_to_dhms(elapsed_time)
+     # --- Final summary -------------------------------------------------------
+     elapsed = time.perf_counter() - started
+     formatted = seconds_to_dhms(elapsed)
+
+     # Structured, parseable one-liner
      logger_safe(
          "info",
-         "Storage of the prepared features - collect stats only: %s (%.3fs)",
-         formatted_elapsed_time, elapsed_time
+         "collect_stats.summary | tables=%d | ok=%d | retried=%d | failed=%d | duration=%s (%.3fs)",
+         len(target_tables),
+         ok,
+         retried,
+         failed,
+         formatted,
+         elapsed,
      )
 
+     return {
+         "total_tables": int(len(target_tables)),
+         "ok": int(ok),
+         "retried": int(retried),
+         "failed": int(failed),
+         "duration_seconds": float(elapsed),
+         "duration_hms": formatted,
+         "details": results,
+     }
+
+
 
 
  def _store_feature_update_insert(entity_id, volatile_table_name, entity_null_substitute={},primary_index=None,
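A hedged usage sketch for the rewritten helper: the dataframe below is fabricated, and running it for real requires a configured Teradata connection (`tdml` / `execute_query`), so treat this as call-shape documentation rather than a runnable test:

```python
import pandas as pd

feature_infos = pd.DataFrame({
    "FEATURE_TABLE":    ["FS_T_CUSTOMER", "FS_T_CUSTOMER"],
    "FEATURE_DATABASE": ["FS_DB", "FS_DB"],
    "FEATURE_ID":       [1, 2],
})

summary = apply_collect_stats(
    entity_id={"CUSTOMER_ID": "BIGINT"},  # mapping: keys are used and sorted
    primary_index=None,
    partitioning=None,
    feature_infos=feature_infos,
)
print(summary["total_tables"], summary["ok"], summary["retried"], summary["failed"])
print(summary["duration_hms"])  # human-readable duration from seconds_to_dhms
```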
tdfs4ds/feature_store/feature_store_management.py CHANGED
@@ -9,6 +9,7 @@ import pandas as pd
  import tqdm
  import inspect
  import re
+ from tdfs4ds import logger_safe, logger
 
  @execute_query_wrapper
  def feature_store_catalog_view_creation():
@@ -350,46 +351,47 @@ def register_features(entity_id, feature_names_types, primary_index = None, part
 
  def _register_features_merge(entity_id, feature_names_types, primary_index=None, partitioning=''):
      """
-     Registers or updates feature definitions in a Teradata database's feature catalog, associating entity identifiers
-     with feature names, types, and other metadata. This function prepares and executes SQL operations to insert new
-     feature definitions or update existing ones, considering partitioning strategies and primary index configurations.
-
-     Parameters:
-     - entity_id (dict): Specifies the entity's identifiers with keys representing attribute names. This dictionary
-                         is crucial for defining the scope and granularity of feature data.
-     - feature_names_types (dict): Maps feature names to their properties, including data types and unique identifiers.
-                                   Each value is a dictionary with keys 'type' and 'id' indicating the feature's data
-                                   type and a unique identifier, respectively.
-     - primary_index (list, optional): Identifies the primary index column(s) for the feature data. This influences
-                                       the organization and performance of database operations. If not specified,
-                                       defaults are used based on the entity_id structure.
-     - partitioning (str, optional): Describes the partitioning strategy through a string listing column names used
-                                     for partitioning. This can impact data storage and retrieval performance.
-
-     Returns:
-     pd.DataFrame: Contains details of the registered features, including names, types, IDs, and references to the
-                   respective feature store table and view names, alongside metadata about the entity and database schema.
-
-     Note:
-     - The function dynamically constructs SQL queries for inserting new features or updating existing ones in the
-       feature catalog, adapting to the provided partitioning and primary index settings.
-     - Assumes the existence of a Teradata feature catalog table in the specified schema and that the database connection
-       is correctly configured.
-     - Utilizes the tdfs4ds module for database schema configurations and valid-time temporal table considerations.
-
-     Example Usage:
-     >>> entity_id = {'customer_id': 'INTEGER'}
-     >>> feature_names_types = {'age': {'type': 'BIGINT', 'id': 1}, 'gender': {'type': 'VARCHAR_LATIN', 'id': 2}}
-     >>> registered_features = register_features(entity_id, feature_names_types)
-     >>> print(registered_features)
-
-     This example demonstrates registering features for an entity with attributes customer_id, age, and gender,
-     where age and gender features have specified types and unique IDs.
+     Register or update feature definitions in the feature catalog, with temporal support.
+
+     This function builds (or refreshes) entries in the Teradata feature catalog from a
+     mapping of feature names to their metadata, computes the target feature store table
+     and view names, stages the metadata to a temporary table, and executes a MERGE into
+     the catalog (with optional VALIDTIME support based on `tdfs4ds.FEATURE_STORE_TIME`).
+
+     Parameters
+     ----------
+     entity_id : dict[str, Any]
+         Mapping of entity-key column names to types. Only the keys (column names) are
+         required here; values are not used by this function.
+     feature_names_types : dict[str, dict]
+         Dict of feature name -> {"type": <SQL_TYPE>, "id": <int>} describing each
+         feature’s storage type and identifier in the catalog.
+     primary_index : list[str] | None, optional
+         Primary index column(s) to use when deriving the feature store table/view names.
+         If None, defaults are inferred by `get_feature_store_table_name`.
+     partitioning : str, optional
+         Partitioning expression or comma-separated column list used by
+         `get_feature_store_table_name`.
+
+     Returns
+     -------
+     pd.DataFrame
+         A dataframe of the features that were (up)registered, including:
+         FEATURE_NAME, FEATURE_TYPE, FEATURE_ID, FEATURE_TABLE, FEATURE_VIEW,
+         ENTITY_NAME, FEATURE_DATABASE, DATA_DOMAIN.
+
+     Notes
+     -----
+     - When `tdfs4ds.FEATURE_STORE_TIME is None`, uses CURRENT VALIDTIME (non-explicit start/end).
+       Otherwise uses `VALIDTIME PERIOD ('<FEATURE_STORE_TIME>', '<END_PERIOD>')` and adds
+       the valid-time start/end when inserting.
+     - Respects `tdfs4ds.DISPLAY_LOGS` via `logger_safe`.
      """
 
-     if tdfs4ds.FEATURE_STORE_TIME == None:
+     # --- VALIDTIME setup -----------------------------------------------------
+     if tdfs4ds.FEATURE_STORE_TIME is None:
          validtime_statement = 'CURRENT VALIDTIME'
-         validtime_start = 'CAST(CURRENT_TIME AS TIMESTAMP(0) WITH TIME ZONE)'
+         validtime_start = "CAST(CURRENT_TIME AS TIMESTAMP(0) WITH TIME ZONE)"
      else:
          validtime_statement = f"VALIDTIME PERIOD '({tdfs4ds.FEATURE_STORE_TIME},{tdfs4ds.END_PERIOD})'"
          validtime_start = f"CAST('{tdfs4ds.FEATURE_STORE_TIME}' AS TIMESTAMP(0) WITH TIME ZONE)"
@@ -399,154 +401,174 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
      else:
          end_period_ = tdfs4ds.END_PERIOD
 
-     if len(list(feature_names_types.keys())) == 0:
-         if tdfs4ds.DISPLAY_LOGS: print('no new feature to register')
+     # --- Input checks & early exit ------------------------------------------
+     if not feature_names_types:
+         logger_safe("info", "register_features: no new features to register")
          return
 
-     # Create a comma-separated string of entity IDs
-     entity_id_list = list(entity_id.keys())
-     entity_id_list.sort()
-     ENTITY_ID__ = ','.join([k for k in entity_id_list])
-
-     # Create a DataFrame from the feature_names_types dictionary
-     if len(feature_names_types.keys()) > 1:
-         df = pd.DataFrame(feature_names_types).transpose().reset_index()
-         df.columns = ['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_ID']
-     else:
-         df = pd.DataFrame(columns=['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_ID'])
-         k = list(feature_names_types.keys())[0]
-         df['FEATURE_NAME'] = [k]
-         df['FEATURE_TYPE'] = [feature_names_types[k]['type']]
-         df['FEATURE_ID'] = [feature_names_types[k]['id']]
-
-
+     # --- Entity columns (ordered, stable) -----------------------------------
+     entity_cols = sorted(list(entity_id.keys()))
+     ENTITY_ID__ = ",".join(entity_cols)
 
-     if tdfs4ds.DEBUG_MODE:
-         print('register_features', 'primary_index', primary_index)
-         print('register_features', 'partitioning', partitioning)
-         print('df', df)
-
-     # Generate the feature table and view names based on the entity ID and feature type
-     df['FEATURE_TABLE'] = df.apply(lambda row: get_feature_store_table_name(entity_id, row.iloc[1],
-                                                                             primary_index=primary_index,
-                                                                             partitioning=partitioning)[0],
-                                    axis=1)
-     df['FEATURE_VIEW'] = df.apply(lambda row: get_feature_store_table_name(entity_id, row.iloc[1],
-                                                                            primary_index=primary_index,
-                                                                            partitioning=partitioning)[1],
-                                   axis=1)
-
-     # Add additional columns to the DataFrame
-     df['ENTITY_NAME'] = ENTITY_ID__
-     df['FEATURE_DATABASE'] = tdfs4ds.SCHEMA
-     df['DATA_DOMAIN'] = tdfs4ds.DATA_DOMAIN
-
-     # Copy the DataFrame to a temporary table in Teradata
-     tdml.copy_to_sql(df, table_name='temp', schema_name=tdfs4ds.SCHEMA, if_exists='replace',
-                      primary_index='FEATURE_ID',
-                      types={'FEATURE_ID': tdml.BIGINT})
-
-
-
-     if tdfs4ds.DEBUG_MODE:
-         print("-----------_register_features_merge - df")
-         print(df)
-
-     if tdfs4ds.FEATURE_STORE_TIME == None:
+     # --- Build dataframe safely (no transpose tricks) ------------------------
+     rows = []
+     for fname, meta in feature_names_types.items():
+         try:
+             rows.append({
+                 "FEATURE_NAME": fname,
+                 "FEATURE_TYPE": meta["type"],
+                 "FEATURE_ID": meta["id"],
+             })
+         except KeyError as e:
+             logger_safe("error", "register_features: missing key %s in feature '%s' meta=%s", str(e), fname, meta)
+             raise
+
+     df = pd.DataFrame(rows, columns=["FEATURE_NAME", "FEATURE_TYPE", "FEATURE_ID"])
+
+     logger_safe(
+         "debug",
+         "register_features: features_count=%d | entity_cols=%s | primary_index=%s | partitioning=%s",
+         len(df),
+         entity_cols,
+         primary_index,
+         partitioning,
+     )
+
+     # --- Compute feature table & view names ---------------------------------
+     # Use apply to preserve original order; get_feature_store_table_name returns (table, view)
+     df["FEATURE_TABLE"] = df.apply(
+         lambda row: get_feature_store_table_name(
+             entity_id,
+             row["FEATURE_TYPE"],
+             primary_index=primary_index,
+             partitioning=partitioning
+         )[0],
+         axis=1
+     )
+     df["FEATURE_VIEW"] = df.apply(
+         lambda row: get_feature_store_table_name(
+             entity_id,
+             row["FEATURE_TYPE"],
+             primary_index=primary_index,
+             partitioning=partitioning
+         )[1],
+         axis=1
+     )
+
+     # --- Add catalog columns -------------------------------------------------
+     df["ENTITY_NAME"] = ENTITY_ID__
+     df["FEATURE_DATABASE"] = tdfs4ds.SCHEMA
+     df["DATA_DOMAIN"] = tdfs4ds.DATA_DOMAIN
+
+     # --- Stage to temp table -------------------------------------------------
+     tdml.copy_to_sql(
+         df,
+         table_name="temp",
+         schema_name=tdfs4ds.SCHEMA,
+         if_exists="replace",
+         primary_index="FEATURE_ID",
+         types={"FEATURE_ID": tdml.BIGINT},
+     )
+     logger_safe("debug", "register_features: staged %d rows to %s.temp", len(df), tdfs4ds.SCHEMA)
+
+     # --- Build MERGE statement ----------------------------------------------
+     if tdfs4ds.FEATURE_STORE_TIME is None:
+         # no explicit start/end in INSERT branch
          query_merge = f"""
          {validtime_statement}
-         MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} EXISTING_FEATURES
+         MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} EXISTING_FEATURES
          USING (
              SELECT
-             CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
-             , A.FEATURE_NAME
-             , A.FEATURE_TYPE
-             , A.FEATURE_TABLE
-             , A.FEATURE_DATABASE
-             , A.FEATURE_VIEW
-             , A.ENTITY_NAME
-             , A.DATA_DOMAIN
-             FROM {tdfs4ds.SCHEMA}.temp A
-             LEFT JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} B
-             ON A.FEATURE_NAME = B.FEATURE_NAME
-             AND A.ENTITY_NAME = B.ENTITY_NAME -- modified
-             AND A.DATA_DOMAIN = B.DATA_DOMAIN
-         ) UPDATED_FEATURES
-         ON UPDATED_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
-         AND UPDATED_FEATURES.FEATURE_NAME = EXISTING_FEATURES.FEATURE_NAME
-         AND UPDATED_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
-         WHEN MATCHED THEN
-             UPDATE
-             SET
-                 FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
-                 FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
-                 FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
-                 FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
-                 --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified
-         WHEN NOT MATCHED THEN
-             INSERT
+             CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
+             , A.FEATURE_NAME
+             , A.FEATURE_TYPE
+             , A.FEATURE_TABLE
+             , A.FEATURE_DATABASE
+             , A.FEATURE_VIEW
+             , A.ENTITY_NAME
+             , A.DATA_DOMAIN
+             FROM {tdfs4ds.SCHEMA}.temp A
+             LEFT JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} B
+             ON A.FEATURE_NAME = B.FEATURE_NAME
+             AND A.ENTITY_NAME = B.ENTITY_NAME
+             AND A.DATA_DOMAIN = B.DATA_DOMAIN
+         ) UPDATED_FEATURES
+         ON UPDATED_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
+         AND UPDATED_FEATURES.FEATURE_NAME = EXISTING_FEATURES.FEATURE_NAME
+         AND UPDATED_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
+         WHEN MATCHED THEN UPDATE SET
+             FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE
+             , FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE
+             , FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE
+             , FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
+         WHEN NOT MATCHED THEN INSERT
          ( UPDATED_FEATURES.FEATURE_ID
-         , UPDATED_FEATURES.FEATURE_NAME
-         , UPDATED_FEATURES.FEATURE_TYPE
-         , UPDATED_FEATURES.FEATURE_TABLE
-         , UPDATED_FEATURES.FEATURE_DATABASE
-         , UPDATED_FEATURES.FEATURE_VIEW
-         , UPDATED_FEATURES.ENTITY_NAME
-         , UPDATED_FEATURES.DATA_DOMAIN
-         )
-         """
+         , UPDATED_FEATURES.FEATURE_NAME
+         , UPDATED_FEATURES.FEATURE_TYPE
+         , UPDATED_FEATURES.FEATURE_TABLE
+         , UPDATED_FEATURES.FEATURE_DATABASE
+         , UPDATED_FEATURES.FEATURE_VIEW
+         , UPDATED_FEATURES.ENTITY_NAME
+         , UPDATED_FEATURES.DATA_DOMAIN
+         );
+         """
      else:
+         # insert with explicit valid-time start/end
          query_merge = f"""
          {validtime_statement}
-         MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} EXISTING_FEATURES
+         MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} EXISTING_FEATURES
          USING (
              SELECT
-             CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
-             , A.FEATURE_NAME
-             , A.FEATURE_TYPE
-             , A.FEATURE_TABLE
-             , A.FEATURE_DATABASE
-             , A.FEATURE_VIEW
-             , A.ENTITY_NAME
-             , A.DATA_DOMAIN
-             FROM {tdfs4ds.SCHEMA}.temp A
-             LEFT JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} B
-             ON A.FEATURE_NAME = B.FEATURE_NAME
-             AND A.ENTITY_NAME = B.ENTITY_NAME -- modified
-             AND A.DATA_DOMAIN = B.DATA_DOMAIN
-         ) UPDATED_FEATURES
-         ON UPDATED_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
-         AND UPDATED_FEATURES.FEATURE_NAME = EXISTING_FEATURES.FEATURE_NAME
-         AND UPDATED_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
-         WHEN MATCHED THEN
-             UPDATE
-             SET
-                 FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
-                 FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
-                 FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
-                 FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
-                 --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified
-         WHEN NOT MATCHED THEN
-             INSERT
+             CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
+             , A.FEATURE_NAME
+             , A.FEATURE_TYPE
+             , A.FEATURE_TABLE
+             , A.FEATURE_DATABASE
+             , A.FEATURE_VIEW
+             , A.ENTITY_NAME
+             , A.DATA_DOMAIN
+             FROM {tdfs4ds.SCHEMA}.temp A
+             LEFT JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} B
+             ON A.FEATURE_NAME = B.FEATURE_NAME
+             AND A.ENTITY_NAME = B.ENTITY_NAME
+             AND A.DATA_DOMAIN = B.DATA_DOMAIN
+         ) UPDATED_FEATURES
+         ON UPDATED_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
+         AND UPDATED_FEATURES.FEATURE_NAME = EXISTING_FEATURES.FEATURE_NAME
+         AND UPDATED_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
+         WHEN MATCHED THEN UPDATE SET
+             FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE
+             , FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE
+             , FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE
+             , FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
+         WHEN NOT MATCHED THEN INSERT
          ( UPDATED_FEATURES.FEATURE_ID
-         , UPDATED_FEATURES.FEATURE_NAME
-         , UPDATED_FEATURES.FEATURE_TYPE
-         , UPDATED_FEATURES.FEATURE_TABLE
-         , UPDATED_FEATURES.FEATURE_DATABASE
-         , UPDATED_FEATURES.FEATURE_VIEW
-         , UPDATED_FEATURES.ENTITY_NAME
-         , UPDATED_FEATURES.DATA_DOMAIN,
-         {validtime_start},
-         '{end_period_}')
-         """
+         , UPDATED_FEATURES.FEATURE_NAME
+         , UPDATED_FEATURES.FEATURE_TYPE
+         , UPDATED_FEATURES.FEATURE_TABLE
+         , UPDATED_FEATURES.FEATURE_DATABASE
+         , UPDATED_FEATURES.FEATURE_VIEW
+         , UPDATED_FEATURES.ENTITY_NAME
+         , UPDATED_FEATURES.DATA_DOMAIN
+         , {validtime_start}
+         , '{end_period_}'
+         );
+         """
 
-     if tdfs4ds.DEBUG_MODE:
-         print("-----------_register_features_merge - query_merge")
-         print(query_merge)
-     # Execute the update and insert queries
+     logger_safe("debug", "register_features: merge_sql_preview=%s", " ".join(query_merge.split())[:400] + " ...")
+
+     # --- Execute MERGE -------------------------------------------------------
      execute_query(query_merge)
+     logger_safe(
+         "info",
+         "register_features: merged %d features into %s.%s",
+         len(df),
+         tdfs4ds.SCHEMA,
+         tdfs4ds.FEATURE_CATALOG_NAME,
+     )
 
      return df
+
  def _register_features_update_insert(entity_id, feature_names_types, primary_index = None, partitioning = ''):
      """
      Registers or updates feature definitions in a Teradata database's feature catalog, associating entity identifiers
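The staging step from the hunk above in isolation: how `feature_names_types` becomes the dataframe that is copied to `<SCHEMA>.temp` before the MERGE. Self-contained and runnable; the example values come from the old docstring:

```python
import pandas as pd

feature_names_types = {
    "age":    {"type": "BIGINT", "id": 1},
    "gender": {"type": "VARCHAR_LATIN", "id": 2},
}

rows = [
    {"FEATURE_NAME": name, "FEATURE_TYPE": meta["type"], "FEATURE_ID": meta["id"]}
    for name, meta in feature_names_types.items()
]
df = pd.DataFrame(rows, columns=["FEATURE_NAME", "FEATURE_TYPE", "FEATURE_ID"])
print(df)
#   FEATURE_NAME   FEATURE_TYPE  FEATURE_ID
# 0          age         BIGINT           1
# 1       gender  VARCHAR_LATIN           2
```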
tdfs4ds/utils/filter_management.py CHANGED
@@ -308,7 +308,7 @@ class FilterManager:
          FROM {self.schema_name}.{self.table_name}
          WHERE {self.filter_id_name} = {filter_id}
          """
-         logger_safe("info", "Updating active filter | %s", ','.join([c + ':' + v for c,v in zip(select_cols_str, tdml.execute_sql(f"SEL * FROM {self.schema_name}.{self.view_name}").fetchall()[0])]))
+         logger_safe("info", "Updating active filter | %s", ','.join([f"{c}:{v}" for c,v in zip(select_cols_str, tdml.execute_sql(f"SEL * FROM {self.schema_name}.{self.view_name}").fetchall()[0])]))
 
          if getattr(tdfs4ds, "DEBUG_MODE", False):
              logger_safe("debug", "Replacing view with new filter:\n%s", query)
tdfs4ds-0.2.4.33.dist-info/METADATA → tdfs4ds-0.2.4.35.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: tdfs4ds
- Version: 0.2.4.33
+ Version: 0.2.4.35
  Summary: A python package to simplify the usage of feature store using Teradata Vantage ...
  Author: Denis Molin
  Requires-Python: >=3.6
tdfs4ds-0.2.4.33.dist-info/RECORD → tdfs4ds-0.2.4.35.dist-info/RECORD RENAMED
@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
  tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
  tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
  tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
- tdfs4ds/__init__.py,sha256=A-MJUMl06mJBwd94ByY8DZoatCL4A8r7mqe5u6EzCMw,55010
+ tdfs4ds/__init__.py,sha256=n3eGxALMqT_UmwvP_VZ8K0bdKSFAtPhe9bi7Kg0TQtA,55698
  tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
  tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
  tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576
@@ -17,22 +17,22 @@ tdfs4ds/dataset/dataset.py,sha256=J_fgfsVdR9zSOXrUOqyotqsUD-GlQMGyuld6ueov45w,76
  tdfs4ds/dataset/dataset_catalog.py,sha256=qxS2thDW2MvsRouSFaX1M0sX2J7IzBAYD8Yf22Tsd5k,16638
  tdfs4ds/feature_store/__init__.py,sha256=a7NPCkpTx40UR5LRErwnskpABG2Vuib7F5wUjaUGCnI,209
  tdfs4ds/feature_store/entity_management.py,sha256=9ltytv3yCTG84NZXBpb1Tlkf9pOxvrNb0MVidU4pwvE,10157
- tdfs4ds/feature_store/feature_data_processing.py,sha256=JEtz1UpZY7oMaonuhzC8eTbZAL8SzrnLOpo0WTEDFUM,41697
+ tdfs4ds/feature_store/feature_data_processing.py,sha256=gjwypiTfwTyGyrP20v35Vu2uGIrCY80OBBeMVBsdjuk,45020
  tdfs4ds/feature_store/feature_query_retrieval.py,sha256=51c6ZNlLFiBIxNPinS8ot8bjWEIb1QV2eVg69yzVF80,35381
- tdfs4ds/feature_store/feature_store_management.py,sha256=pWM9sjppBgRIg3l1ksoDJsM1fnaZlWtnuE3JuOP_2mY,54736
+ tdfs4ds/feature_store/feature_store_management.py,sha256=yXLbINYLA-lzd0t_6TzEe9a8Anlum4x8TRoxZU3FIr8,54276
  tdfs4ds/process_store/__init__.py,sha256=npHR_xju5ecGmWfYHDyteLwiU3x-cL4HD3sFK_th7xY,229
  tdfs4ds/process_store/process_followup.py,sha256=PvLcU7meg3ljBlPfuez3qwTVqpHHhVJxYxGqjgiHE8E,7265
  tdfs4ds/process_store/process_query_administration.py,sha256=AOufkJ6DFUpBiGm-6Q6Dq0Aovw31UGTscZ3Ya0ewS-0,7851
  tdfs4ds/process_store/process_registration_management.py,sha256=2fFjt4Pmh3An1BUFvRX3xABSlQrlWiEiPQStH3A9Xpk,36130
  tdfs4ds/process_store/process_store_catalog_management.py,sha256=eVUU9uanyXCUkzi2vcHbJPL9qFiXVasnCxPGr-r9EY8,16090
  tdfs4ds/utils/__init__.py,sha256=-yTMfDLZbQnIRQ64s_bczzT21tDW2A8FZeq9PX5SgFU,168
- tdfs4ds/utils/filter_management.py,sha256=JdCHkkw_L6vpmjPMMp3AY2ZwITGrwAvljHxZttgeWTg,24761
+ tdfs4ds/utils/filter_management.py,sha256=5_8fYYtl8RQgbIi6L_1geNM0wJMm3t1n4QvNA5DnaQg,24760
  tdfs4ds/utils/info.py,sha256=sShnUxXMlvCtQ6xtShDhqdpTr6sMG0dZQhNBFgUENDY,12058
  tdfs4ds/utils/lineage.py,sha256=gy5M42qy5fvdWmlohAY3WPYoqAyp5VakeEmeT1YjrJQ,37839
  tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
  tdfs4ds/utils/time_management.py,sha256=asIWvK5K81NNwAGqC-9Tv4Timscxyv0vyuPFs01whu0,31461
  tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
- tdfs4ds-0.2.4.33.dist-info/METADATA,sha256=0HSUyalUNwp7ZD6Z811pBaNbMb0GEAYsNSzFcAaEWnk,14326
- tdfs4ds-0.2.4.33.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
- tdfs4ds-0.2.4.33.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
- tdfs4ds-0.2.4.33.dist-info/RECORD,,
+ tdfs4ds-0.2.4.35.dist-info/METADATA,sha256=1gmDbv0lpgEcRd0ucWdSSyfGUTyb0-nCxVoMy9Y8JKk,14326
+ tdfs4ds-0.2.4.35.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+ tdfs4ds-0.2.4.35.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
+ tdfs4ds-0.2.4.35.dist-info/RECORD,,