tdfs4ds 0.2.4.32__py3-none-any.whl → 0.2.4.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,7 @@ import pandas as pd
  import tqdm
  import inspect
  import re
+ from tdfs4ds import logger_safe, logger

  @execute_query_wrapper
  def feature_store_catalog_view_creation():
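
Both touched modules route their diagnostics through the package-level logger_safe helper imported above. Its implementation lives in tdfs4ds/__init__.py and is outside this diff; purely as a hypothetical sketch of the calling convention used below (logger_safe("<level>", "<printf-style msg>", *args)), assuming it wraps the module logger and honors tdfs4ds.DISPLAY_LOGS as the new docstring notes:

import logging

DISPLAY_LOGS = True  # stand-in for tdfs4ds.DISPLAY_LOGS; an assumption, not shipped code
logger = logging.getLogger("tdfs4ds")

def logger_safe(level, msg, *args):
    # Hypothetical sketch only: dispatch to the module logger at the
    # requested level ("debug", "info", "error", ...), staying silent
    # when log display is turned off.
    if not DISPLAY_LOGS:
        return
    getattr(logger, level)(msg, *args)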
@@ -350,46 +351,47 @@ def register_features(entity_id, feature_names_types, primary_index = None, part

  def _register_features_merge(entity_id, feature_names_types, primary_index=None, partitioning=''):
  """
- Registers or updates feature definitions in a Teradata database's feature catalog, associating entity identifiers
- with feature names, types, and other metadata. This function prepares and executes SQL operations to insert new
- feature definitions or update existing ones, considering partitioning strategies and primary index configurations.
-
- Parameters:
- - entity_id (dict): Specifies the entity's identifiers with keys representing attribute names. This dictionary
- is crucial for defining the scope and granularity of feature data.
- - feature_names_types (dict): Maps feature names to their properties, including data types and unique identifiers.
- Each value is a dictionary with keys 'type' and 'id' indicating the feature's data
- type and a unique identifier, respectively.
- - primary_index (list, optional): Identifies the primary index column(s) for the feature data. This influences
- the organization and performance of database operations. If not specified,
- defaults are used based on the entity_id structure.
- - partitioning (str, optional): Describes the partitioning strategy through a string listing column names used
- for partitioning. This can impact data storage and retrieval performance.
-
- Returns:
- pd.DataFrame: Contains details of the registered features, including names, types, IDs, and references to the
- respective feature store table and view names, alongside metadata about the entity and database schema.
-
- Note:
- - The function dynamically constructs SQL queries for inserting new features or updating existing ones in the
- feature catalog, adapting to the provided partitioning and primary index settings.
- - Assumes the existence of a Teradata feature catalog table in the specified schema and that the database connection
- is correctly configured.
- - Utilizes the tdfs4ds module for database schema configurations and valid-time temporal table considerations.
-
- Example Usage:
- >>> entity_id = {'customer_id': 'INTEGER'}
- >>> feature_names_types = {'age': {'type': 'BIGINT', 'id': 1}, 'gender': {'type': 'VARCHAR_LATIN', 'id': 2}}
- >>> registered_features = register_features(entity_id, feature_names_types)
- >>> print(registered_features)
-
- This example demonstrates registering features for an entity with attributes customer_id, age, and gender,
- where age and gender features have specified types and unique IDs.
+ Register or update feature definitions in the feature catalog, with temporal support.
+
+ This function builds (or refreshes) entries in the Teradata feature catalog from a
+ mapping of feature names to their metadata, computes the target feature store table
+ and view names, stages the metadata to a temporary table, and executes a MERGE into
+ the catalog (with optional VALIDTIME support based on `tdfs4ds.FEATURE_STORE_TIME`).
+
+ Parameters
+ ----------
+ entity_id : dict[str, Any]
+ Mapping of entity-key column names to types. Only the keys (column names) are
+ required here; values are not used by this function.
+ feature_names_types : dict[str, dict]
+ Dict of feature name -> {"type": <SQL_TYPE>, "id": <int>} describing each
+ feature’s storage type and identifier in the catalog.
+ primary_index : list[str] | None, optional
+ Primary index column(s) to use when deriving the feature store table/view names.
+ If None, defaults are inferred by `get_feature_store_table_name`.
+ partitioning : str, optional
+ Partitioning expression or comma-separated column list used by
+ `get_feature_store_table_name`.
+
+ Returns
+ -------
+ pd.DataFrame
+ A dataframe of the features that were (up)registered, including:
+ FEATURE_NAME, FEATURE_TYPE, FEATURE_ID, FEATURE_TABLE, FEATURE_VIEW,
+ ENTITY_NAME, FEATURE_DATABASE, DATA_DOMAIN.
+
+ Notes
+ -----
+ - When `tdfs4ds.FEATURE_STORE_TIME is None`, uses CURRENT VALIDTIME (non-explicit start/end).
+ Otherwise uses `VALIDTIME PERIOD ('<FEATURE_STORE_TIME>', '<END_PERIOD>')` and adds
+ the valid-time start/end when inserting.
+ - Respects `tdfs4ds.DISPLAY_LOGS` via `logger_safe`.
  """

- if tdfs4ds.FEATURE_STORE_TIME == None:
+ # --- VALIDTIME setup -----------------------------------------------------
+ if tdfs4ds.FEATURE_STORE_TIME is None:
  validtime_statement = 'CURRENT VALIDTIME'
- validtime_start = 'CAST(CURRENT_TIME AS TIMESTAMP(0) WITH TIME ZONE)'
+ validtime_start = "CAST(CURRENT_TIME AS TIMESTAMP(0) WITH TIME ZONE)"
  else:
  validtime_statement = f"VALIDTIME PERIOD '({tdfs4ds.FEATURE_STORE_TIME},{tdfs4ds.END_PERIOD})'"
  validtime_start = f"CAST('{tdfs4ds.FEATURE_STORE_TIME}' AS TIMESTAMP(0) WITH TIME ZONE)"
@@ -399,154 +401,174 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
  else:
  end_period_ = tdfs4ds.END_PERIOD

- if len(list(feature_names_types.keys())) == 0:
- if tdfs4ds.DISPLAY_LOGS: print('no new feature to register')
+ # --- Input checks & early exit ------------------------------------------
+ if not feature_names_types:
+ logger_safe("info", "register_features: no new features to register")
  return

- # Create a comma-separated string of entity IDs
- entity_id_list = list(entity_id.keys())
- entity_id_list.sort()
- ENTITY_ID__ = ','.join([k for k in entity_id_list])
-
- # Create a DataFrame from the feature_names_types dictionary
- if len(feature_names_types.keys()) > 1:
- df = pd.DataFrame(feature_names_types).transpose().reset_index()
- df.columns = ['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_ID']
- else:
- df = pd.DataFrame(columns=['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_ID'])
- k = list(feature_names_types.keys())[0]
- df['FEATURE_NAME'] = [k]
- df['FEATURE_TYPE'] = [feature_names_types[k]['type']]
- df['FEATURE_ID'] = [feature_names_types[k]['id']]
-
-
+ # --- Entity columns (ordered, stable) -----------------------------------
+ entity_cols = sorted(list(entity_id.keys()))
+ ENTITY_ID__ = ",".join(entity_cols)

- if tdfs4ds.DEBUG_MODE:
- print('register_features', 'primary_index', primary_index)
- print('register_features', 'partitioning', partitioning)
- print('df', df)
-
- # Generate the feature table and view names based on the entity ID and feature type
- df['FEATURE_TABLE'] = df.apply(lambda row: get_feature_store_table_name(entity_id, row.iloc[1],
- primary_index=primary_index,
- partitioning=partitioning)[0],
- axis=1)
- df['FEATURE_VIEW'] = df.apply(lambda row: get_feature_store_table_name(entity_id, row.iloc[1],
- primary_index=primary_index,
- partitioning=partitioning)[1],
- axis=1)
-
- # Add additional columns to the DataFrame
- df['ENTITY_NAME'] = ENTITY_ID__
- df['FEATURE_DATABASE'] = tdfs4ds.SCHEMA
- df['DATA_DOMAIN'] = tdfs4ds.DATA_DOMAIN
-
- # Copy the DataFrame to a temporary table in Teradata
- tdml.copy_to_sql(df, table_name='temp', schema_name=tdfs4ds.SCHEMA, if_exists='replace',
- primary_index='FEATURE_ID',
- types={'FEATURE_ID': tdml.BIGINT})
-
-
-
- if tdfs4ds.DEBUG_MODE:
- print("-----------_register_features_merge - df")
- print(df)
-
- if tdfs4ds.FEATURE_STORE_TIME == None:
+ # --- Build dataframe safely (no transpose tricks) ------------------------
+ rows = []
+ for fname, meta in feature_names_types.items():
+ try:
+ rows.append({
+ "FEATURE_NAME": fname,
+ "FEATURE_TYPE": meta["type"],
+ "FEATURE_ID": meta["id"],
+ })
+ except KeyError as e:
+ logger_safe("error", "register_features: missing key %s in feature '%s' meta=%s", str(e), fname, meta)
+ raise
+
+ df = pd.DataFrame(rows, columns=["FEATURE_NAME", "FEATURE_TYPE", "FEATURE_ID"])
+
+ logger_safe(
+ "debug",
+ "register_features: features_count=%d | entity_cols=%s | primary_index=%s | partitioning=%s",
+ len(df),
+ entity_cols,
+ primary_index,
+ partitioning,
+ )
+
+ # --- Compute feature table & view names ---------------------------------
+ # Use apply to preserve original order; get_feature_store_table_name returns (table, view)
+ df["FEATURE_TABLE"] = df.apply(
+ lambda row: get_feature_store_table_name(
+ entity_id,
+ row["FEATURE_TYPE"],
+ primary_index=primary_index,
+ partitioning=partitioning
+ )[0],
+ axis=1
+ )
+ df["FEATURE_VIEW"] = df.apply(
+ lambda row: get_feature_store_table_name(
+ entity_id,
+ row["FEATURE_TYPE"],
+ primary_index=primary_index,
+ partitioning=partitioning
+ )[1],
+ axis=1
+ )
+
+ # --- Add catalog columns -------------------------------------------------
+ df["ENTITY_NAME"] = ENTITY_ID__
+ df["FEATURE_DATABASE"] = tdfs4ds.SCHEMA
+ df["DATA_DOMAIN"] = tdfs4ds.DATA_DOMAIN
+
+ # --- Stage to temp table -------------------------------------------------
+ tdml.copy_to_sql(
+ df,
+ table_name="temp",
+ schema_name=tdfs4ds.SCHEMA,
+ if_exists="replace",
+ primary_index="FEATURE_ID",
+ types={"FEATURE_ID": tdml.BIGINT},
+ )
+ logger_safe("debug", "register_features: staged %d rows to %s.temp", len(df), tdfs4ds.SCHEMA)
+
+ # --- Build MERGE statement ----------------------------------------------
+ if tdfs4ds.FEATURE_STORE_TIME is None:
+ # no explicit start/end in INSERT branch
  query_merge = f"""
  {validtime_statement}
- MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} EXISTING_FEATURES
+ MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} EXISTING_FEATURES
  USING (
  SELECT
- CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
- , A.FEATURE_NAME
- , A.FEATURE_TYPE
- , A.FEATURE_TABLE
- , A.FEATURE_DATABASE
- , A.FEATURE_VIEW
- , A.ENTITY_NAME
- , A.DATA_DOMAIN
- FROM {tdfs4ds.SCHEMA}.temp A
- LEFT JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} B
- ON A.FEATURE_NAME = B.FEATURE_NAME
- AND A.ENTITY_NAME = B.ENTITY_NAME -- modified
- AND A.DATA_DOMAIN = B.DATA_DOMAIN
- ) UPDATED_FEATURES
- ON UPDATED_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
- AND UPDATED_FEATURES.FEATURE_NAME = EXISTING_FEATURES.FEATURE_NAME
- AND UPDATED_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
- WHEN MATCHED THEN
- UPDATE
- SET
- FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
- FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
- FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
- FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
- --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified
- WHEN NOT MATCHED THEN
- INSERT
+ CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
+ , A.FEATURE_NAME
+ , A.FEATURE_TYPE
+ , A.FEATURE_TABLE
+ , A.FEATURE_DATABASE
+ , A.FEATURE_VIEW
+ , A.ENTITY_NAME
+ , A.DATA_DOMAIN
+ FROM {tdfs4ds.SCHEMA}.temp A
+ LEFT JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} B
+ ON A.FEATURE_NAME = B.FEATURE_NAME
+ AND A.ENTITY_NAME = B.ENTITY_NAME
+ AND A.DATA_DOMAIN = B.DATA_DOMAIN
+ ) UPDATED_FEATURES
+ ON UPDATED_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
+ AND UPDATED_FEATURES.FEATURE_NAME = EXISTING_FEATURES.FEATURE_NAME
+ AND UPDATED_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
+ WHEN MATCHED THEN UPDATE SET
+ FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE
+ , FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE
+ , FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE
+ , FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
+ WHEN NOT MATCHED THEN INSERT
  ( UPDATED_FEATURES.FEATURE_ID
- , UPDATED_FEATURES.FEATURE_NAME
- , UPDATED_FEATURES.FEATURE_TYPE
- , UPDATED_FEATURES.FEATURE_TABLE
- , UPDATED_FEATURES.FEATURE_DATABASE
- , UPDATED_FEATURES.FEATURE_VIEW
- , UPDATED_FEATURES.ENTITY_NAME
- , UPDATED_FEATURES.DATA_DOMAIN
- )
- """
+ , UPDATED_FEATURES.FEATURE_NAME
+ , UPDATED_FEATURES.FEATURE_TYPE
+ , UPDATED_FEATURES.FEATURE_TABLE
+ , UPDATED_FEATURES.FEATURE_DATABASE
+ , UPDATED_FEATURES.FEATURE_VIEW
+ , UPDATED_FEATURES.ENTITY_NAME
+ , UPDATED_FEATURES.DATA_DOMAIN
+ );
+ """
  else:
+ # insert with explicit valid-time start/end
  query_merge = f"""
  {validtime_statement}
- MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} EXISTING_FEATURES
+ MERGE INTO {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} EXISTING_FEATURES
  USING (
  SELECT
- CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
- , A.FEATURE_NAME
- , A.FEATURE_TYPE
- , A.FEATURE_TABLE
- , A.FEATURE_DATABASE
- , A.FEATURE_VIEW
- , A.ENTITY_NAME
- , A.DATA_DOMAIN
- FROM {tdfs4ds.SCHEMA}.temp A
- LEFT JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} B
- ON A.FEATURE_NAME = B.FEATURE_NAME
- AND A.ENTITY_NAME = B.ENTITY_NAME -- modified
- AND A.DATA_DOMAIN = B.DATA_DOMAIN
- ) UPDATED_FEATURES
- ON UPDATED_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
- AND UPDATED_FEATURES.FEATURE_NAME = EXISTING_FEATURES.FEATURE_NAME
- AND UPDATED_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
- WHEN MATCHED THEN
- UPDATE
- SET
- FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
- FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
- FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
- FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
- --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified
- WHEN NOT MATCHED THEN
- INSERT
+ CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
+ , A.FEATURE_NAME
+ , A.FEATURE_TYPE
+ , A.FEATURE_TABLE
+ , A.FEATURE_DATABASE
+ , A.FEATURE_VIEW
+ , A.ENTITY_NAME
+ , A.DATA_DOMAIN
+ FROM {tdfs4ds.SCHEMA}.temp A
+ LEFT JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} B
+ ON A.FEATURE_NAME = B.FEATURE_NAME
+ AND A.ENTITY_NAME = B.ENTITY_NAME
+ AND A.DATA_DOMAIN = B.DATA_DOMAIN
+ ) UPDATED_FEATURES
+ ON UPDATED_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
+ AND UPDATED_FEATURES.FEATURE_NAME = EXISTING_FEATURES.FEATURE_NAME
+ AND UPDATED_FEATURES.DATA_DOMAIN = EXISTING_FEATURES.DATA_DOMAIN
+ WHEN MATCHED THEN UPDATE SET
+ FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE
+ , FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE
+ , FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE
+ , FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
+ WHEN NOT MATCHED THEN INSERT
  ( UPDATED_FEATURES.FEATURE_ID
- , UPDATED_FEATURES.FEATURE_NAME
- , UPDATED_FEATURES.FEATURE_TYPE
- , UPDATED_FEATURES.FEATURE_TABLE
- , UPDATED_FEATURES.FEATURE_DATABASE
- , UPDATED_FEATURES.FEATURE_VIEW
- , UPDATED_FEATURES.ENTITY_NAME
- , UPDATED_FEATURES.DATA_DOMAIN,
- {validtime_start},
- '{end_period_}')
- """
+ , UPDATED_FEATURES.FEATURE_NAME
+ , UPDATED_FEATURES.FEATURE_TYPE
+ , UPDATED_FEATURES.FEATURE_TABLE
+ , UPDATED_FEATURES.FEATURE_DATABASE
+ , UPDATED_FEATURES.FEATURE_VIEW
+ , UPDATED_FEATURES.ENTITY_NAME
+ , UPDATED_FEATURES.DATA_DOMAIN
+ , {validtime_start}
+ , '{end_period_}'
+ );
+ """

- if tdfs4ds.DEBUG_MODE:
- print("-----------_register_features_merge - query_merge")
- print(query_merge)
- # Execute the update and insert queries
+ logger_safe("debug", "register_features: merge_sql_preview=%s", " ".join(query_merge.split())[:400] + " ...")
+
+ # --- Execute MERGE -------------------------------------------------------
  execute_query(query_merge)
+ logger_safe(
+ "info",
+ "register_features: merged %d features into %s.%s",
+ len(df),
+ tdfs4ds.SCHEMA,
+ tdfs4ds.FEATURE_CATALOG_NAME,
+ )

  return df
+
  def _register_features_update_insert(entity_id, feature_names_types, primary_index = None, partitioning = ''):
  """
  Registers or updates feature definitions in a Teradata database's feature catalog, associating entity identifiers
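
The new row-building loop replaces the transpose-based DataFrame construction, which needed a separate branch for the single-feature case; it also validates the 'type'/'id' keys per feature before anything touches the database. A self-contained illustration of the staging frame it produces (pandas only, no Teradata connection needed):

import pandas as pd

feature_names_types = {'age': {'type': 'BIGINT', 'id': 1},
                       'gender': {'type': 'VARCHAR_LATIN', 'id': 2}}

# Same row-building logic as the new _register_features_merge, minus logging.
rows = [{"FEATURE_NAME": fname, "FEATURE_TYPE": meta["type"], "FEATURE_ID": meta["id"]}
        for fname, meta in feature_names_types.items()]
df = pd.DataFrame(rows, columns=["FEATURE_NAME", "FEATURE_TYPE", "FEATURE_ID"])
print(df)
#   FEATURE_NAME   FEATURE_TYPE  FEATURE_ID
# 0          age         BIGINT           1
# 1       gender  VARCHAR_LATIN           2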
@@ -28,7 +28,7 @@ def list_processes():
  return tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW))
  except Exception as e:
  print(str(e))
- print(query)
+ print(tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW)).show_query())

  def list_processes_feature_split():
  """
@@ -3,6 +3,7 @@ import tdfs4ds
  from tdfs4ds.utils.query_management import execute_query_wrapper
  import uuid
  import json
+ from tdfs4ds import logger,logger_safe

  @execute_query_wrapper
  def register_process_view(view_name, entity_id, feature_names, metadata={}, entity_null_substitute = {}, **kwargs):
@@ -74,80 +75,91 @@ def _register_process_view_merge(view_name, entity_id, feature_names, metadata={
  - Requires 'tdml' module for DataFrame operations and 'uuid' for generating unique identifiers.
  """

-
- # Handling the case where the view name is provided as a DataFrame
- if type(view_name) == tdml.dataframe.dataframe.DataFrame:
+ # Handle teradataml DataFrame input
+ if isinstance(view_name, tdml.dataframe.dataframe.DataFrame):
  try:
  view_name = view_name._table_name
- except:
- print(
- 'create your teradata dataframe using tdml.DataFrame(<view name>). Crystallize your view if needed')
+ except Exception:
+ logger_safe(
+ "error",
+ "Invalid DataFrame for view registration. Use: tdml.DataFrame(<table/view>). Crystallize if needed."
+ )
  raise

+ # Prevent using temporary teradataml views
  if view_name.split('.')[1].startswith('ml__'):
- tdfs4ds.logger.error('Your dataframe is a temporary teradataml dataframe. Please crystallize your view first.')
- raise ValueError("Invalid process view name: it starts with 'ml__'. Please consider view crystallization")
-
- # Get filter manager:
+ logger_safe(
+ "error",
+ "Invalid view name '%s': starts with 'ml__'. Please crystallize your view first.",
+ view_name
+ )
+ raise ValueError("Invalid process view name: temporary teradataml views are not allowed.")
+
+ # Get optional arguments
  filtermanager = kwargs.get('filtermanager', None)
- if filtermanager is None:
- query_upsert_filtermanager = None
-
- # Get data distribution related inputs:
- primary_index = kwargs.get('primary_index', [e for e in entity_id.keys()])
+ query_upsert_filtermanager = None
+ primary_index = kwargs.get('primary_index', list(entity_id.keys()))
  partitioning = kwargs.get('partitioning', '').replace("'", '"')

  if primary_index is None:
- primary_index = [e for e in entity_id.keys()]
+ primary_index = list(entity_id.keys())

+ feature_names = ','.join(feature_names)

+ # Validtime period
+ end_period_ = '9999-01-01 00:00:00' if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED' else tdfs4ds.END_PERIOD
+ validtime_statement = (
+ 'CURRENT VALIDTIME'
+ if tdfs4ds.FEATURE_STORE_TIME is None
+ else f"VALIDTIME PERIOD '({tdfs4ds.FEATURE_STORE_TIME},{end_period_})'"
+ )

- # Joining the feature names into a comma-separated string
- feature_names = ','.join(feature_names)
+ logger_safe("info", "Registering process view: %s", view_name)

- # Setting the end period for the view
- if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED':
- end_period_ = '9999-01-01 00:00:00'
- else:
- end_period_ = tdfs4ds.END_PERIOD
+ # Check if view already exists in catalog
+ query_process_id = f"""
+ SEL PROCESS_ID FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+ WHERE view_name = '{view_name}'
+ """
+ process_id_result = tdml.execute_sql(query_process_id).fetchall()

- if tdfs4ds.FEATURE_STORE_TIME == None:
- validtime_statement = 'CURRENT VALIDTIME'
- else:
- validtime_statement = f"VALIDTIME PERIOD '({tdfs4ds.FEATURE_STORE_TIME},{end_period_})'"
+ if process_id_result:
+ process_id = process_id_result[0][0]
+ logger_safe("info", "Updating existing process_id=%s", process_id)

+ query_feature_version = f"""
+ SEL PROCESS_VERSION FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+ WHERE view_name = '{view_name}'
+ """
+ feature_version = tdml.execute_sql(query_feature_version).fetchall()[0][0]

- query_process_id = f"SEL PROCESS_ID FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW} WHERE view_name = '{view_name}'"
- process_id = tdml.execute_sql(query_process_id).fetchall()
- if len(process_id)>0:
- process_id = process_id[0][0]
- query_feature_version = f"SEL PROCESS_VERSION FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW} WHERE view_name = '{view_name}'"
- feature_version = tdml.execute_sql(query_feature_version).fetchall()[0][0]
- query_primary_index = f"SEL FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} WHERE process_id = '{process_id}'"
- query_primary_index_res = tdml.execute_sql(query_primary_index).fetchall()
- if len(query_primary_index_res)>0:
- FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING = tdml.execute_sql(query_primary_index).fetchall()[0]
+ query_primary_index = f"""
+ SEL FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING
+ FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME}
+ WHERE process_id = '{process_id}'
+ """
+ dist_res = tdml.execute_sql(query_primary_index).fetchall()
+ if dist_res:
+ FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING = dist_res[0]
  else:
- raise ValueError(f"""
- There is not information on primary index and partitioning for process: {process_id}.
- The working date is: {validtime_statement}
- The content of the distribution table is:
- {print(tdml.DataFrame.from_query(f"SEL * FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} WHERE process_id = '{process_id}'"))}
- """)
+ logger_safe(
+ "error",
+ "Missing data distribution info for existing process %s. Check distribution table.",
+ process_id
+ )
+ raise ValueError("Missing distribution info.")
  else:
- # Generating a unique process identifier
  process_id = str(uuid.uuid4())
  feature_version = 1
  FOR_PRIMARY_INDEX = ",".join(primary_index)
  FOR_DATA_PARTITIONING = partitioning
+ logger_safe("info", "Generated new process_id=%s", process_id)

- # Create a comma-separated string of entity IDs
- entity_id_list = list(entity_id.keys())
- entity_id_list.sort()
- ENTITY_ID__ = ','.join([k for k in entity_id_list])
+ # Build entity_id string
+ ENTITY_ID__ = ','.join(sorted(entity_id.keys()))
+ logger_safe("debug", "Entity IDs: %s", ENTITY_ID__)
+ logger_safe("debug", "Feature names: %s", feature_names)

- print('feature_version :',feature_version)
- print('int(feature_version) :', int(feature_version))
  if tdfs4ds.FEATURE_STORE_TIME == None:

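
The end-period and validtime branches that were scattered through this function are collapsed into two expressions near its top. A worked example of what they evaluate to, using the same logic as the new code with illustrative stand-in values for tdfs4ds.FEATURE_STORE_TIME and tdfs4ds.END_PERIOD:

FEATURE_STORE_TIME = '2024-01-01 00:00:00'  # illustrative
END_PERIOD = 'UNTIL_CHANGED'                # illustrative

end_period_ = '9999-01-01 00:00:00' if END_PERIOD == 'UNTIL_CHANGED' else END_PERIOD
validtime_statement = (
    'CURRENT VALIDTIME'
    if FEATURE_STORE_TIME is None
    else f"VALIDTIME PERIOD '({FEATURE_STORE_TIME},{end_period_})'"
)
print(validtime_statement)
# VALIDTIME PERIOD '(2024-01-01 00:00:00,9999-01-01 00:00:00)'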
@@ -402,16 +414,16 @@ def _register_process_view_merge(view_name, entity_id, feature_names, metadata={
  """


- # Logging the process registration
- print(f'register process with id : {process_id}')
- print(f"to run the process again just type : run(process_id='{process_id}')")
- print(f"to update your dataset : dataset = run(process_id='{process_id}',return_dataset=True)")
+ logger_safe("info", "Process registered: process_id=%s", process_id)
+ logger_safe("info", "To rerun: run(process_id='%s')", process_id)
+ logger_safe("info", "To build dataset: dataset = run(process_id='%s', return_dataset=True)", process_id)

- #print('query_insert_dist', query_upsert_dist)
+ # Return queries
  if kwargs.get('with_process_id'):
  return query_upsert, process_id, query_upsert_dist, query_upsert_filtermanager
  else:
  return query_upsert, query_upsert_dist, query_upsert_filtermanager
+
  @execute_query_wrapper
  def _register_process_view_update_insert(view_name, entity_id, feature_names, metadata={}, entity_null_substitute={}, **kwargs):
  """