tdfs4ds 0.2.4.35__py3-none-any.whl → 0.2.4.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,5 +1,6 @@
- __version__ = '0.2.4.35'
+ __version__ = '0.2.4.37'
  import logging
+ import json

  # Setup the logger
  logging.basicConfig(
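
A quick way to confirm which build is active in an environment is to read the module attribute assigned in the hunk above (a trivial check, assuming the 0.2.4.37 wheel is installed):

```python
import tdfs4ds

# __version__ is set at the top of tdfs4ds/__init__.py, as this hunk shows.
assert tdfs4ds.__version__ == '0.2.4.37', tdfs4ds.__version__
```
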
@@ -562,6 +563,81 @@ def _upload_features(
  process_id=None, force_compute=False,
  force_varchar_length=None
  ):
+ """
+ Uploads a set of features into the Feature Store for a given entity.
+
+ This function registers an entity and its associated features in the feature catalog
+ if they are not already defined, prepares the data for ingestion, and stores it in the
+ feature store. It also supports incremental feature computation and conditional execution
+ depending on prior runs.
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+     Input dataframe containing entity keys and feature columns to upload.
+ entity_id : str, list, or dict
+     Identifier(s) for the entity. Can be:
+     - A string (single entity key)
+     - A list of key column names
+     - A dict mapping column names to data types
+     If not a dict, entity metadata is inferred automatically.
+ feature_names : list of str
+     List of feature column names to upload from `df`.
+ feature_versions : dict or int, optional
+     Feature version(s). If a single integer is provided, it is applied to all features.
+     If a dict is provided, it maps each feature name to its version.
+     Default is FEATURE_VERSION_DEFAULT.
+ primary_index : str or list, optional
+     Primary index to use when storing features in Teradata.
+ partitioning : str, optional
+     Partitioning clause for feature store tables. Default is ''.
+ filtermanager : FilterManager, optional
+     If provided, features are built iteratively per filter step.
+ entity_null_substitute : dict, optional
+     Replacement values for nulls in entity keys.
+     Example: {'customer_id': -1}
+ process_id : str, optional
+     Identifier for the process execution, used for follow-up logging.
+ force_compute : bool, optional
+     If True, forces recomputation even if the same process_id and timestamp were
+     already computed earlier. If False, the computation is skipped when existing
+     results are detected. Default is False.
+ force_varchar_length : int, optional
+     If provided, all VARCHAR feature columns are resized to this length
+     before ingestion.
+
+ Returns
+ -------
+ pandas.DataFrame or None
+     If BUILD_DATASET_AT_UPLOAD is enabled, returns a dataset built from the
+     ingested features for validation. Otherwise, returns None.
+
+ Notes
+ -----
+ - Uses global tdfs4ds context such as FEATURE_STORE_TIME, RUN_ID, and PROCESS_TYPE.
+ - Logs ingestion status in process follow-up tables.
+ - Skips ingestion when existing completed results are found unless
+   `force_compute=True`.
+ - Applies Teradata-optimized storage and statistics collection.
+
+ Raises
+ ------
+ ValueError
+     If unsupported data types are found (CLOB/BLOB/JSON).
+ Exception
+     For ingestion failure or storage errors.
+
+ Example
+ -------
+ >>> _upload_features(
+ ...     df=dataframe,
+ ...     entity_id="customer_id",
+ ...     feature_names=["age", "credit_score"],
+ ...     process_id="customer_features_v1",
+ ...     force_compute=False
+ ... )
+ """
+
  from tdfs4ds.feature_store.entity_management import register_entity
  from tdfs4ds.feature_store.feature_store_management import Gettdtypes
  from tdfs4ds.feature_store.feature_store_management import register_features
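
The `entity_id` parameter documented above accepts three shapes. Here is a minimal sketch of each; the dataframe and the Teradata type string are hypothetical, and `_upload_features` itself is a private helper (note the leading underscore):

```python
import pandas as pd

# Hypothetical input: one entity key column plus two feature columns.
df = pd.DataFrame({
    "customer_id": [1, 2, 3],
    "age": [34, 51, 27],
    "credit_score": [712, 655, 698],
})

# The three accepted entity_id shapes, per the docstring:
entity_as_str = "customer_id"               # single key column; metadata inferred
entity_as_list = ["customer_id"]            # list of key columns; metadata inferred
entity_as_dict = {"customer_id": "BIGINT"}  # explicit column -> data type mapping
```
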
@@ -633,6 +709,12 @@ def _upload_features(

  if filtermanager is None:
      do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
+     if not do_compute and not force_compute:
+         logger_safe(
+             "info",
+             "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
+             process_id, tdfs4ds.FEATURE_STORE_TIME
+         )
      if do_compute or force_compute:
          logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
          tdfs4ds.process_store.process_followup.followup_open(
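
The new branch above makes the skip explicit: when a follow-up record already exists for this `process_id` at the current feature-store time, the run becomes a logged no-op unless the caller forces recomputation. Stripped of tdfs4ds internals, the guard reduces to this pattern (names here are illustrative):

```python
def maybe_compute(process_id, ts, follow_up_rows, force_compute=False):
    """Illustrative idempotency guard: skip work that is already recorded unless forced."""
    already_done = bool(process_id) and follow_up_rows is not None and len(follow_up_rows) > 0
    if already_done and not force_compute:
        print(f"Skipping computation for process_id={process_id} at time {ts} "
              f"(already exists, force_compute=False)")
        return False  # nothing computed this run
    # ... perform the expensive ingestion here ...
    return True  # computed (or forcibly recomputed)
```
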
@@ -670,26 +752,44 @@ def _upload_features(
              raise

  else:
+
      logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
      something_computed = False
-     for i in tqdm(
+     pbar = tqdm(
          range(filtermanager.nb_filters),
          total=filtermanager.nb_filters,
          desc="Applying filters",
          unit="filter",
          leave=False
-     ):
+     )
+
+     for i in pbar:
          filter_id = i + 1
          filtermanager.update(filter_id)

-         # show which filter is being applied in the bar
          try:
-             tqdm.write(f"Applying filter {filter_id}/{filtermanager.nb_filters}")
-             # If display() returns a long string, you can shorten it:
-             bar_info = str(filtermanager.display())
-             if len(bar_info) > 80:
-                 bar_info = bar_info[:77] + "..."
-             tqdm.tqdm._instances and next(iter(tqdm.tqdm._instances)).set_postfix_str(bar_info)
+             pbar.set_description(f"Applying filter {filter_id}/{filtermanager.nb_filters}")
+
+             # Convert datetime columns to string
+             df_bar = filtermanager.display().to_pandas().astype(object)  # avoid conversion issues
+             for col in df_bar.select_dtypes(include=["datetime", "datetimetz"]).columns:
+                 df_bar[col] = df_bar[col].dt.strftime("%Y-%m-%d %H:%M:%S")
+
+             # Convert to JSON object (dict)
+             bar_info = df_bar.iloc[0].to_dict()
+
+             # ---- handle python date objects ----
+             from datetime import date, datetime
+             for key, value in bar_info.items():
+                 if isinstance(value, (date, datetime)):  # convert date/datetime to string
+                     bar_info[key] = value.strftime("%Y-%m-%d %H:%M:%S")
+             # ------------------------------------
+
+             bar_info = str(bar_info)
+             if len(bar_info) > 120:
+                 bar_info = bar_info[:117] + "..."
+             pbar.set_postfix_str(bar_info)
+
          except Exception:
              # postfix is optional; ignore errors from display() here
              pass
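
The refactor above keeps an explicit handle on the bar (`pbar`) instead of reaching into tqdm's private `_instances` registry, as the removed lines did, and serializes one row of the filter's display frame into the bar postfix. A standalone, runnable sketch of the same technique, with hypothetical one-row frames standing in for `filtermanager.display().to_pandas()`; note it stringifies datetime columns *before* casting to object, since `select_dtypes` cannot detect datetimes once every column has object dtype:

```python
import pandas as pd
from tqdm import tqdm

# Stand-ins for the filter display frames: one row each, with a datetime column.
frames = [
    pd.DataFrame({"business_date": [pd.Timestamp("2024-01-01") + pd.Timedelta(days=i)],
                  "segment": ["retail"]})
    for i in range(3)
]

pbar = tqdm(range(len(frames)), total=len(frames),
            desc="Applying filters", unit="filter", leave=False)
for i in pbar:
    df_bar = frames[i]
    # Stringify datetime columns first, while their dtypes are still detectable...
    for col in df_bar.select_dtypes(include=["datetime", "datetimetz"]).columns:
        df_bar[col] = df_bar[col].dt.strftime("%Y-%m-%d %H:%M:%S")
    # ...then flatten the first row to a dict and truncate it for the bar.
    bar_info = str(df_bar.astype(object).iloc[0].to_dict())
    if len(bar_info) > 120:
        bar_info = bar_info[:117] + "..."
    pbar.set_description(f"Applying filter {i + 1}/{len(frames)}")
    pbar.set_postfix_str(bar_info)
```
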
@@ -704,6 +804,12 @@ def _upload_features(
          if follow_up.shape[0] > 0:
              do_compute = False

+         if not do_compute and not force_compute:
+             logger_safe(
+                 "info",
+                 "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
+                 process_id, tdfs4ds.FEATURE_STORE_TIME
+             )
          if do_compute or force_compute:
              tdfs4ds.process_store.process_followup.followup_open(
                  run_id = tdfs4ds.RUN_ID,
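
As in the non-filtered branch, the skip message uses %-style placeholders rather than an f-string, matching the standard `logging` convention where argument formatting is deferred until a record is actually emitted. A plain-`logging` sketch of the same call shape (`logger_safe` is a tdfs4ds helper; the values here are placeholders):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("tdfs4ds.demo")

process_id = "customer_features_v1"         # placeholder
feature_store_time = "2024-01-01 00:00:00"  # placeholder

# Arguments are interpolated lazily, only if INFO is enabled for this logger.
logger.info(
    "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
    process_id,
    feature_store_time,
)
```
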
@@ -1179,41 +1285,50 @@ def upload_tdstone2_scores(model):
      return dataset


- def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
+ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None, force_compute = False, force_display_logs = False):
      """
-     Executes a series of processes for each date in a given list, managing the time and logging settings.
+     Executes a series of processes for each date in a given list, managing time, computation settings, and logging.

      This function iterates over a range of time steps, updating a TimeManager object with each step, and then
-     executes a list of processes for that time step. It also manages the synchronization of time for a feature store
-     and disables display logs during its execution.
+     executes a list of processes for that time step. It also manages synchronization of time for the feature store
+     and optionally controls forced computation and log display behavior.

      Parameters:
      - process_list (list): A list of process IDs that need to be executed for each time step.
-     - time_manager (TimeManager object): An object that manages time-related operations, like updating or retrieving time.
+     - time_manager (TimeManager): An object that manages time-related operations, like updating or retrieving time.
      - time_id_start (int, optional): The starting time step ID. Default is 1.
-     - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the time manager.
+     - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
+       time manager.
+     - force_compute (bool, optional): If True, forces each process to recompute even if previous results exist.
+       Default is False.
+     - force_display_logs (bool, optional): If True, forces log display during the rollout even if global log display
+       is disabled. Default is False.

      Side Effects:
-     - Sets global variables DISPLAY_LOGS and FEATURE_STORE_TIME.
+     - Temporarily modifies global variables DISPLAY_LOGS, PROCESS_TYPE, RUN_ID, and FEATURE_STORE_TIME.
+     - Restores DISPLAY_LOGS setting after execution.
      - Catches and prints exceptions along with the time step on which they occurred.

-     This function performs the following steps:
-     1. Disables display logs and sets the process type to 'ROLL_OUT'.
-     2. Iterates over the specified range of time steps.
-     3. Updates the time manager with the current time step.
-     4. Synchronizes the feature store time with the current time step.
-     5. Executes each process in the process list for the current time step.
-     6. Restores the original display log setting after execution.
+     Steps performed:
+     1. Disables display logs by default unless `force_display_logs` is True.
+     2. Sets process type to 'ROLL_OUT' and initializes a unique run ID.
+     3. Iterates over the specified range of time steps.
+     4. Updates the time manager with the current time step.
+     5. Synchronizes the feature store time with the current time step.
+     6. Executes each process in the process list with optional forced computation.
+     7. Restores original display log settings after completion.

      Example:
      >>> process_list = ['process_1', 'process_2']
      >>> time_manager = TimeManager(...)
-     >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)
+     >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10, force_compute=True, force_display_logs=True)
      """

      # Disable display logs
      temp_DISPLAY_LOGS = tdfs4ds.DISPLAY_LOGS
      tdfs4ds.DISPLAY_LOGS = False
+     if force_display_logs:
+         tdfs4ds.DISPLAY_LOGS = True
      PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
      tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
      tdfs4ds.RUN_ID = str(uuid.uuid4())
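
With the two new flags, a rollout that recomputes every step with visible logs looks like the docstring example. A slightly fuller sketch; the `TimeManager` import path is an assumption based on this wheel's RECORD entry for `tdfs4ds/utils/time_management.py`, and its constructor arguments depend on your environment:

```python
import tdfs4ds
from tdfs4ds.utils.time_management import TimeManager  # assumed location

# Hypothetical: configure a time manager over your business dates.
time_manager = TimeManager(...)  # constructor args depend on your setup

tdfs4ds.roll_out(
    process_list=['process_1', 'process_2'],
    time_manager=time_manager,
    time_id_start=1,
    time_id_end=10,
    force_compute=True,       # re-run steps even where results already exist
    force_display_logs=True,  # keep DISPLAY_LOGS on for the whole rollout
)
```
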
@@ -1246,7 +1361,7 @@ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
          # Execute all processes for this time step
          for proc_id in process_list:
              pbar.set_description(f"Processing {date_} | proc {proc_id}")
-             run(process_id=proc_id, force_compute=False)
+             run(process_id=proc_id, force_compute=force_compute)

      # Restore settings
      tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
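
One note on the restore step above: `roll_out` catches per-step exceptions, but an unexpected error between the override and this line would leave `DISPLAY_LOGS` flipped. A defensive variant of the same save/override/restore idiom, written as a hypothetical context manager rather than the package's actual code:

```python
import uuid
from contextlib import contextmanager

import tdfs4ds

@contextmanager
def rollout_settings(force_display_logs=False):
    """Hypothetical helper: apply ROLL_OUT globals and restore them even on error."""
    saved_logs = tdfs4ds.DISPLAY_LOGS
    saved_type = tdfs4ds.PROCESS_TYPE
    tdfs4ds.DISPLAY_LOGS = force_display_logs  # mirrors roll_out's default-off behavior
    tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
    tdfs4ds.RUN_ID = str(uuid.uuid4())
    try:
        yield
    finally:
        tdfs4ds.DISPLAY_LOGS = saved_logs
        tdfs4ds.PROCESS_TYPE = saved_type

# Usage sketch:
# with rollout_settings(force_display_logs=True):
#     ...  # iterate time steps and run processes
```
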
{tdfs4ds-0.2.4.35.dist-info → tdfs4ds-0.2.4.37.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: tdfs4ds
- Version: 0.2.4.35
+ Version: 0.2.4.37
  Summary: A python package to simplify the usage of feature store using Teradata Vantage ...
  Author: Denis Molin
  Requires-Python: >=3.6
{tdfs4ds-0.2.4.35.dist-info → tdfs4ds-0.2.4.37.dist-info}/RECORD RENAMED
@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
  tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
  tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
  tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
- tdfs4ds/__init__.py,sha256=n3eGxALMqT_UmwvP_VZ8K0bdKSFAtPhe9bi7Kg0TQtA,55698
+ tdfs4ds/__init__.py,sha256=mrvk5jKmcdYg4waC7bpCHsDGlPb8h1JmSgx3dBlo1Ow,60776
  tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
  tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
  tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576
@@ -32,7 +32,7 @@ tdfs4ds/utils/lineage.py,sha256=gy5M42qy5fvdWmlohAY3WPYoqAyp5VakeEmeT1YjrJQ,3783
  tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
  tdfs4ds/utils/time_management.py,sha256=asIWvK5K81NNwAGqC-9Tv4Timscxyv0vyuPFs01whu0,31461
  tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
- tdfs4ds-0.2.4.35.dist-info/METADATA,sha256=1gmDbv0lpgEcRd0ucWdSSyfGUTyb0-nCxVoMy9Y8JKk,14326
- tdfs4ds-0.2.4.35.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
- tdfs4ds-0.2.4.35.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
- tdfs4ds-0.2.4.35.dist-info/RECORD,,
+ tdfs4ds-0.2.4.37.dist-info/METADATA,sha256=kXimo1unejodaf9W0tOBPLaKisjougkurjt5C9IpA6g,14326
+ tdfs4ds-0.2.4.37.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+ tdfs4ds-0.2.4.37.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
+ tdfs4ds-0.2.4.37.dist-info/RECORD,,