tdfs4ds 0.2.4.35__py3-none-any.whl → 0.2.4.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +141 -26
- {tdfs4ds-0.2.4.35.dist-info → tdfs4ds-0.2.4.37.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.35.dist-info → tdfs4ds-0.2.4.37.dist-info}/RECORD +5 -5
- {tdfs4ds-0.2.4.35.dist-info → tdfs4ds-0.2.4.37.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.35.dist-info → tdfs4ds-0.2.4.37.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
@@ -1,5 +1,6 @@
-__version__ = '0.2.4.35'
+__version__ = '0.2.4.37'
 import logging
+import json
 
 # Setup the logger
 logging.basicConfig(

@@ -562,6 +563,81 @@ def _upload_features(
         process_id=None, force_compute=False,
         force_varchar_length=None
 ):
+    """
+    Uploads a set of features into the Feature Store for a given entity.
+
+    This function registers an entity and its associated features in the feature catalog
+    if they are not already defined, prepares the data for ingestion, and stores it in the
+    feature store. It also supports incremental feature computation and conditional execution
+    depending on prior runs.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        Input dataframe containing entity keys and feature columns to upload.
+    entity_id : str, list, or dict
+        Identifier(s) for the entity. Can be:
+        - A string (single entity key)
+        - A list of key column names
+        - A dict mapping column names to data types
+        If not a dict, entity metadata is inferred automatically.
+    feature_names : list of str
+        List of feature column names to upload from `df`.
+    feature_versions : dict or int, optional
+        Feature version(s). If a single integer is provided, it is applied to all features.
+        If a dict is provided, it maps each feature name to its version.
+        Default is FEATURE_VERSION_DEFAULT.
+    primary_index : str or list, optional
+        Primary index to use when storing features in Teradata.
+    partitioning : str, optional
+        Partitioning clause for feature store tables. Default is ''.
+    filtermanager : FilterManager, optional
+        If provided, features are built iteratively per filter step.
+    entity_null_substitute : dict, optional
+        Replacement values for nulls in entity keys.
+        Example: {'customer_id': -1}
+    process_id : str, optional
+        Identifier for the process execution, used for follow-up logging.
+    force_compute : bool, optional
+        If True, forces recomputation even if the same process_id and timestamp were
+        already computed earlier. If False, the computation is skipped when existing
+        results are detected. Default is False.
+    force_varchar_length : int, optional
+        If provided, all VARCHAR feature columns are resized to this length
+        before ingestion.
+
+    Returns
+    -------
+    pandas.DataFrame or None
+        If BUILD_DATASET_AT_UPLOAD is enabled, returns a dataset built from the
+        ingested features for validation. Otherwise, returns None.
+
+    Notes
+    -----
+    - Uses global tdfs4ds context such as FEATURE_STORE_TIME, RUN_ID, and PROCESS_TYPE.
+    - Logs ingestion status in process follow-up tables.
+    - Skips ingestion when existing completed results are found unless
+      `force_compute=True`.
+    - Applies Teradata-optimized storage and statistics collection.
+
+    Raises
+    ------
+    ValueError
+        If unsupported data types are found (CLOB/BLOB/JSON).
+    Exception
+        For ingestion failure or storage errors.
+
+    Example
+    -------
+    >>> _upload_features(
+    ...     df=dataframe,
+    ...     entity_id="customer_id",
+    ...     feature_names=["age", "credit_score"],
+    ...     process_id="customer_features_v1",
+    ...     force_compute=False
+    ... )
+    """
+
     from tdfs4ds.feature_store.entity_management import register_entity
     from tdfs4ds.feature_store.feature_store_management import Gettdtypes
     from tdfs4ds.feature_store.feature_store_management import register_features

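The new docstring documents three accepted shapes for entity_id. As a rough, hypothetical call sketch based only on that docstring (the dataframe and column names are invented, and a configured Teradata feature store is required for this to actually run):

    import pandas as pd

    # Invented sample data for illustration only.
    df = pd.DataFrame({
        "customer_id": [1, 2, 3],
        "age": [34, 51, 27],
        "credit_score": [710, 640, 590],
    })

    # entity_id may be a string, a list of key columns, or a dict mapping
    # key columns to data types; per the docstring, the dict form bypasses
    # automatic inference of entity metadata.
    _upload_features(
        df=df,
        entity_id={"customer_id": "BIGINT"},
        feature_names=["age", "credit_score"],
        process_id="customer_features_v1",
        force_compute=False,
    )
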
@@ -633,6 +709,12 @@ def _upload_features(
 
     if filtermanager is None:
         do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
+        if not do_compute and not force_compute:
+            logger_safe(
+                "info",
+                "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
+                process_id, tdfs4ds.FEATURE_STORE_TIME
+            )
         if do_compute or force_compute:
             logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
             tdfs4ds.process_store.process_followup.followup_open(

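The guard added here only makes the skip visible in the logs; the skip itself still follows from do_compute being False (the same guard appears again below in the FilterManager branch). A standalone sketch of the idempotency pattern, with plain logging and a pandas DataFrame standing in for the process follow-up records — all names here are illustrative, not the package's API:

    import logging
    import pandas as pd

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("sketch")

    def should_compute(process_id, follow_up, force_compute=False):
        # A prior completed run exists if the follow-up table has rows.
        already_done = bool(process_id) and follow_up is not None and follow_up.shape[0] > 0
        if already_done and not force_compute:
            logger.info(
                "Skipping computation for process_id=%s (already exists, force_compute=False)",
                process_id,
            )
            return False
        return True

    prior_runs = pd.DataFrame({"run_id": ["7f3a"]})                    # one recorded run
    assert should_compute("p1", prior_runs) is False                   # skipped, logged
    assert should_compute("p1", prior_runs, force_compute=True) is True  # forced through
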
@@ -670,26 +752,44 @@ def _upload_features(
             raise
 
     else:
+
         logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
         something_computed = False
-
+        pbar = tqdm(
             range(filtermanager.nb_filters),
             total=filtermanager.nb_filters,
             desc="Applying filters",
             unit="filter",
             leave=False
-        )
+        )
+
+        for i in pbar:
             filter_id = i + 1
             filtermanager.update(filter_id)
 
-            # show which filter is being applied in the bar
             try:
-
-
-
-
-
-
+                pbar.set_description(f"Applying filter {filter_id}/{filtermanager.nb_filters}")
+
+                # Convert datetime columns to string
+                df_bar = filtermanager.display().to_pandas().astype(object)  # avoid conversion issues
+                for col in df_bar.select_dtypes(include=["datetime", "datetimetz"]).columns:
+                    df_bar[col] = df_bar[col].dt.strftime("%Y-%m-%d %H:%M:%S")
+
+                # Convert to JSON object (dict)
+                bar_info = df_bar.iloc[0].to_dict()
+
+                # ---- ADD THIS: handle python date objects ----
+                from datetime import date, datetime
+                for key, value in bar_info.items():
+                    if isinstance(value, (date, datetime)):  # convert date/datetime to string
+                        bar_info[key] = value.strftime("%Y-%m-%d %H:%M:%S")
+                # ----------------------------------------------
+
+                bar_info = str(bar_info)
+                if len(bar_info) > 120:
+                    bar_info = bar_info[:117] + "..."
+                pbar.set_postfix_str(bar_info)
+
             except Exception:
                 # postfix is optional; ignore errors from display() here
                 pass

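The rewritten loop body surfaces the active filter in the tqdm bar. Dates and datetimes are stringified first because the postfix is rendered as plain text and raw date objects print awkwardly, and the whole block sits in a try/except since the postfix is purely cosmetic. A self-contained sketch of the technique with invented sample data (not the package's code):

    from datetime import date, datetime

    import pandas as pd
    from tqdm import tqdm

    filters = pd.DataFrame({
        "as_of_date": pd.to_datetime(["2024-01-01", "2024-02-01"]),
        "segment": ["retail", "corporate"],
    })

    pbar = tqdm(range(len(filters)), desc="Applying filters", unit="filter", leave=False)
    for i in pbar:
        pbar.set_description(f"Applying filter {i + 1}/{len(filters)}")
        row = filters.iloc[[i]].astype(object)        # object dtype avoids conversion issues
        bar_info = row.iloc[0].to_dict()
        for key, value in bar_info.items():
            if isinstance(value, (date, datetime)):   # date/datetime -> plain string
                bar_info[key] = value.strftime("%Y-%m-%d %H:%M:%S")
        text = str(bar_info)
        if len(text) > 120:                           # keep the bar on one line
            text = text[:117] + "..."
        pbar.set_postfix_str(text)
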
@@ -704,6 +804,12 @@ def _upload_features(
             if follow_up.shape[0] > 0:
                 do_compute = False
 
+            if not do_compute and not force_compute:
+                logger_safe(
+                    "info",
+                    "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
+                    process_id, tdfs4ds.FEATURE_STORE_TIME
+                )
             if do_compute or force_compute:
                 tdfs4ds.process_store.process_followup.followup_open(
                     run_id = tdfs4ds.RUN_ID,

@@ -1179,41 +1285,50 @@ def upload_tdstone2_scores(model):
     return dataset
 
 
-def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
+def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None, force_compute = False, force_display_logs = False):
     """
-    Executes a series of processes for each date in a given list, managing
+    Executes a series of processes for each date in a given list, managing time, computation settings, and logging.
 
     This function iterates over a range of time steps, updating a TimeManager object with each step, and then
-    executes a list of processes for that time step. It also manages
-    and
+    executes a list of processes for that time step. It also manages synchronization of time for the feature store
+    and optionally controls forced computation and log display behavior.
 
     Parameters:
     - process_list (list): A list of process IDs that need to be executed for each time step.
-    - time_manager (TimeManager
+    - time_manager (TimeManager): An object that manages time-related operations, like updating or retrieving time.
     - time_id_start (int, optional): The starting time step ID. Default is 1.
-    - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
+    - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
+      time manager.
+    - force_compute (bool, optional): If True, forces each process to recompute even if previous results exist.
+      Default is False.
+    - force_display_logs (bool, optional): If True, forces log display during the rollout even if global log display
+      is disabled. Default is False.
 
     Side Effects:
-    -
+    - Temporarily modifies global variables DISPLAY_LOGS, PROCESS_TYPE, RUN_ID, and FEATURE_STORE_TIME.
+    - Restores DISPLAY_LOGS setting after execution.
     - Catches and prints exceptions along with the time step on which they occurred.
 
-
-    1. Disables display logs
-    2.
-    3.
-    4.
-    5.
-    6.
+    Steps performed:
+    1. Disables display logs by default unless `force_display_logs` is True.
+    2. Sets process type to 'ROLL_OUT' and initializes a unique run ID.
+    3. Iterates over the specified range of time steps.
+    4. Updates the time manager with the current time step.
+    5. Synchronizes the feature store time with the current time step.
+    6. Executes each process in the process list with optional forced computation.
+    7. Restores original display log settings after completion.
 
     Example:
     >>> process_list = ['process_1', 'process_2']
    >>> time_manager = TimeManager(...)
-    >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)
+    >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10, force_compute=True, force_display_logs=True)
     """
 
     # Disable display logs
     temp_DISPLAY_LOGS = tdfs4ds.DISPLAY_LOGS
     tdfs4ds.DISPLAY_LOGS = False
+    if force_display_logs:
+        tdfs4ds.DISPLAY_LOGS = True
     PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
     tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
     tdfs4ds.RUN_ID = str(uuid.uuid4())

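roll_out saves DISPLAY_LOGS, overrides it, and restores it at the end of the run. The same pattern could be expressed as a context manager so the restore is explicit even if an unexpected error escapes the loop; a hypothetical refactoring sketch, not part of tdfs4ds:

    from contextlib import contextmanager

    import tdfs4ds

    @contextmanager
    def display_logs(enabled):
        previous = tdfs4ds.DISPLAY_LOGS
        tdfs4ds.DISPLAY_LOGS = enabled
        try:
            yield
        finally:
            tdfs4ds.DISPLAY_LOGS = previous   # restored even if a process raises

    # Usage sketch: logs shown only when force_display_logs is True.
    # with display_logs(enabled=force_display_logs):
    #     ...rollout loop...
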
@@ -1246,7 +1361,7 @@ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
         # Execute all processes for this time step
         for proc_id in process_list:
             pbar.set_description(f"Processing {date_} | proc {proc_id}")
-            run(process_id=proc_id, force_compute=
+            run(process_id=proc_id, force_compute=force_compute)
 
     # Restore settings
     tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS

{tdfs4ds-0.2.4.35.dist-info → tdfs4ds-0.2.4.37.dist-info}/RECORD
CHANGED

@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
 tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
 tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
 tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
-tdfs4ds/__init__.py,sha256=
+tdfs4ds/__init__.py,sha256=mrvk5jKmcdYg4waC7bpCHsDGlPb8h1JmSgx3dBlo1Ow,60776
 tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
 tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
 tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576

@@ -32,7 +32,7 @@ tdfs4ds/utils/lineage.py,sha256=gy5M42qy5fvdWmlohAY3WPYoqAyp5VakeEmeT1YjrJQ,3783
 tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
 tdfs4ds/utils/time_management.py,sha256=asIWvK5K81NNwAGqC-9Tv4Timscxyv0vyuPFs01whu0,31461
 tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
-tdfs4ds-0.2.4.
-tdfs4ds-0.2.4.
-tdfs4ds-0.2.4.
-tdfs4ds-0.2.4.
+tdfs4ds-0.2.4.37.dist-info/METADATA,sha256=kXimo1unejodaf9W0tOBPLaKisjougkurjt5C9IpA6g,14326
+tdfs4ds-0.2.4.37.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+tdfs4ds-0.2.4.37.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
+tdfs4ds-0.2.4.37.dist-info/RECORD,,

{tdfs4ds-0.2.4.35.dist-info → tdfs4ds-0.2.4.37.dist-info}/WHEEL
File without changes

{tdfs4ds-0.2.4.35.dist-info → tdfs4ds-0.2.4.37.dist-info}/top_level.txt
File without changes