tdfs4ds 0.2.4.34__py3-none-any.whl → 0.2.4.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +114 -17
- tdfs4ds/feature_store/feature_data_processing.py +1 -1
- {tdfs4ds-0.2.4.34.dist-info → tdfs4ds-0.2.4.36.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.34.dist-info → tdfs4ds-0.2.4.36.dist-info}/RECORD +6 -6
- {tdfs4ds-0.2.4.34.dist-info → tdfs4ds-0.2.4.36.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.34.dist-info → tdfs4ds-0.2.4.36.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
__version__ = '0.2.4.34'
|
|
1
|
+
__version__ = '0.2.4.36'
|
|
2
2
|
import logging
|
|
3
3
|
|
|
4
4
|
# Setup the logger
|
|
@@ -562,6 +562,81 @@ def _upload_features(
|
|
|
562
562
|
process_id=None, force_compute=False,
|
|
563
563
|
force_varchar_length=None
|
|
564
564
|
):
|
|
565
|
+
"""
|
|
566
|
+
Uploads a set of features into the Feature Store for a given entity.
|
|
567
|
+
|
|
568
|
+
This function registers an entity and its associated features in the feature catalog
|
|
569
|
+
if they are not already defined, prepares the data for ingestion, and stores it in the
|
|
570
|
+
feature store. It also supports incremental feature computation and conditional execution
|
|
571
|
+
depending on prior runs.
|
|
572
|
+
|
|
573
|
+
Parameters
|
|
574
|
+
----------
|
|
575
|
+
df : pandas.DataFrame
|
|
576
|
+
Input dataframe containing entity keys and feature columns to upload.
|
|
577
|
+
entity_id : str, list, or dict
|
|
578
|
+
Identifier(s) for the entity. Can be:
|
|
579
|
+
- A string (single entity key)
|
|
580
|
+
- A list of key column names
|
|
581
|
+
- A dict mapping column names to data types
|
|
582
|
+
If not a dict, entity metadata is inferred automatically.
|
|
583
|
+
feature_names : list of str
|
|
584
|
+
List of feature column names to upload from `df`.
|
|
585
|
+
feature_versions : dict or int, optional
|
|
586
|
+
Feature version(s). If a single integer is provided, it is applied to all features.
|
|
587
|
+
If a dict is provided, it maps each feature name to its version.
|
|
588
|
+
Default is FEATURE_VERSION_DEFAULT.
|
|
589
|
+
primary_index : str or list, optional
|
|
590
|
+
Primary index to use when storing features in Teradata.
|
|
591
|
+
partitioning : str, optional
|
|
592
|
+
Partitioning clause for feature store tables. Default is ''.
|
|
593
|
+
filtermanager : FilterManager, optional
|
|
594
|
+
If provided, features are built iteratively per filter step.
|
|
595
|
+
entity_null_substitute : dict, optional
|
|
596
|
+
Replacement values for nulls in entity keys.
|
|
597
|
+
Example: {'customer_id': -1}
|
|
598
|
+
process_id : str, optional
|
|
599
|
+
Identifier for the process execution, used for follow-up logging.
|
|
600
|
+
force_compute : bool, optional
|
|
601
|
+
If True, forces recomputation even if the same process_id and timestamp were
|
|
602
|
+
already computed earlier. If False, the computation is skipped when existing
|
|
603
|
+
results are detected. Default is False.
|
|
604
|
+
force_varchar_length : int, optional
|
|
605
|
+
If provided, all VARCHAR feature columns are resized to this length
|
|
606
|
+
before ingestion.
|
|
607
|
+
|
|
608
|
+
Returns
|
|
609
|
+
-------
|
|
610
|
+
pandas.DataFrame or None
|
|
611
|
+
If BUILD_DATASET_AT_UPLOAD is enabled, returns a dataset built from the
|
|
612
|
+
ingested features for validation. Otherwise, returns None.
|
|
613
|
+
|
|
614
|
+
Notes
|
|
615
|
+
-----
|
|
616
|
+
- Uses global tdfs4ds context such as FEATURE_STORE_TIME, RUN_ID, and PROCESS_TYPE.
|
|
617
|
+
- Logs ingestion status in process follow-up tables.
|
|
618
|
+
- Skips ingestion when existing completed results are found unless
|
|
619
|
+
`force_compute=True`.
|
|
620
|
+
- Applies Teradata-optimized storage and statistics collection.
|
|
621
|
+
|
|
622
|
+
Raises
|
|
623
|
+
------
|
|
624
|
+
ValueError
|
|
625
|
+
If unsupported data types are found (CLOB/BLOB/JSON).
|
|
626
|
+
Exception
|
|
627
|
+
For ingestion failure or storage errors.
|
|
628
|
+
|
|
629
|
+
Example
|
|
630
|
+
-------
|
|
631
|
+
>>> _upload_features(
|
|
632
|
+
... df=dataframe,
|
|
633
|
+
... entity_id="customer_id",
|
|
634
|
+
... feature_names=["age", "credit_score"],
|
|
635
|
+
... process_id="customer_features_v1",
|
|
636
|
+
... force_compute=False
|
|
637
|
+
... )
|
|
638
|
+
"""
|
|
639
|
+
|
|
565
640
|
from tdfs4ds.feature_store.entity_management import register_entity
|
|
566
641
|
from tdfs4ds.feature_store.feature_store_management import Gettdtypes
|
|
567
642
|
from tdfs4ds.feature_store.feature_store_management import register_features
|
|
@@ -633,6 +708,12 @@ def _upload_features(
|
|
|
633
708
|
|
|
634
709
|
if filtermanager is None:
|
|
635
710
|
do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
|
|
711
|
+
if not do_compute and not force_compute:
|
|
712
|
+
logger_safe(
|
|
713
|
+
"info",
|
|
714
|
+
"Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
|
|
715
|
+
process_id, tdfs4ds.FEATURE_STORE_TIME
|
|
716
|
+
)
|
|
636
717
|
if do_compute or force_compute:
|
|
637
718
|
logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
|
|
638
719
|
tdfs4ds.process_store.process_followup.followup_open(
|
|
@@ -670,6 +751,7 @@ def _upload_features(
|
|
|
670
751
|
raise
|
|
671
752
|
|
|
672
753
|
else:
|
|
754
|
+
|
|
673
755
|
logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
|
|
674
756
|
something_computed = False
|
|
675
757
|
for i in tqdm(
|
|
@@ -704,6 +786,12 @@ def _upload_features(
|
|
|
704
786
|
if follow_up.shape[0] > 0:
|
|
705
787
|
do_compute = False
|
|
706
788
|
|
|
789
|
+
if not do_compute and not force_compute:
|
|
790
|
+
logger_safe(
|
|
791
|
+
"info",
|
|
792
|
+
"Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
|
|
793
|
+
process_id, tdfs4ds.FEATURE_STORE_TIME
|
|
794
|
+
)
|
|
707
795
|
if do_compute or force_compute:
|
|
708
796
|
tdfs4ds.process_store.process_followup.followup_open(
|
|
709
797
|
run_id = tdfs4ds.RUN_ID,
|
|
@@ -1179,41 +1267,50 @@ def upload_tdstone2_scores(model):
|
|
|
1179
1267
|
return dataset
|
|
1180
1268
|
|
|
1181
1269
|
|
|
1182
|
-
def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
|
|
1270
|
+
def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None, force_compute = False, force_display_logs = False):
|
|
1183
1271
|
"""
|
|
1184
|
-
Executes a series of processes for each date in a given list, managing
|
|
1272
|
+
Executes a series of processes for each date in a given list, managing time, computation settings, and logging.
|
|
1185
1273
|
|
|
1186
1274
|
This function iterates over a range of time steps, updating a TimeManager object with each step, and then
|
|
1187
|
-
executes a list of processes for that time step. It also manages
|
|
1188
|
-
and
|
|
1275
|
+
executes a list of processes for that time step. It also manages synchronization of time for the feature store
|
|
1276
|
+
and optionally controls forced computation and log display behavior.
|
|
1189
1277
|
|
|
1190
1278
|
Parameters:
|
|
1191
1279
|
- process_list (list): A list of process IDs that need to be executed for each time step.
|
|
1192
|
-
- time_manager (TimeManager
|
|
1280
|
+
- time_manager (TimeManager): An object that manages time-related operations, like updating or retrieving time.
|
|
1193
1281
|
- time_id_start (int, optional): The starting time step ID. Default is 1.
|
|
1194
|
-
- time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
|
|
1282
|
+
- time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
|
|
1283
|
+
time manager.
|
|
1284
|
+
- force_compute (bool, optional): If True, forces each process to recompute even if previous results exist.
|
|
1285
|
+
Default is False.
|
|
1286
|
+
- force_display_logs (bool, optional): If True, forces log display during the rollout even if global log display
|
|
1287
|
+
is disabled. Default is False.
|
|
1195
1288
|
|
|
1196
1289
|
Side Effects:
|
|
1197
|
-
-
|
|
1290
|
+
- Temporarily modifies global variables DISPLAY_LOGS, PROCESS_TYPE, RUN_ID, and FEATURE_STORE_TIME.
|
|
1291
|
+
- Restores DISPLAY_LOGS setting after execution.
|
|
1198
1292
|
- Catches and prints exceptions along with the time step on which they occurred.
|
|
1199
1293
|
|
|
1200
|
-
|
|
1201
|
-
1. Disables display logs
|
|
1202
|
-
2.
|
|
1203
|
-
3.
|
|
1204
|
-
4.
|
|
1205
|
-
5.
|
|
1206
|
-
6.
|
|
1294
|
+
Steps performed:
|
|
1295
|
+
1. Disables display logs by default unless `force_display_logs` is True.
|
|
1296
|
+
2. Sets process type to 'ROLL_OUT' and initializes a unique run ID.
|
|
1297
|
+
3. Iterates over the specified range of time steps.
|
|
1298
|
+
4. Updates the time manager with the current time step.
|
|
1299
|
+
5. Synchronizes the feature store time with the current time step.
|
|
1300
|
+
6. Executes each process in the process list with optional forced computation.
|
|
1301
|
+
7. Restores original display log settings after completion.
|
|
1207
1302
|
|
|
1208
1303
|
Example:
|
|
1209
1304
|
>>> process_list = ['process_1', 'process_2']
|
|
1210
1305
|
>>> time_manager = TimeManager(...)
|
|
1211
|
-
>>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)
|
|
1306
|
+
>>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10, force_compute=True, force_display_logs=True)
|
|
1212
1307
|
"""
|
|
1213
1308
|
|
|
1214
1309
|
# Disable display logs
|
|
1215
1310
|
temp_DISPLAY_LOGS = tdfs4ds.DISPLAY_LOGS
|
|
1216
1311
|
tdfs4ds.DISPLAY_LOGS = False
|
|
1312
|
+
if force_display_logs:
|
|
1313
|
+
tdfs4ds.DISPLAY_LOGS = True
|
|
1217
1314
|
PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
|
|
1218
1315
|
tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
|
|
1219
1316
|
tdfs4ds.RUN_ID = str(uuid.uuid4())
|
|
@@ -1246,7 +1343,7 @@ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
|
|
|
1246
1343
|
# Execute all processes for this time step
|
|
1247
1344
|
for proc_id in process_list:
|
|
1248
1345
|
pbar.set_description(f"Processing {date_} | proc {proc_id}")
|
|
1249
|
-
run(process_id=proc_id, force_compute=
|
|
1346
|
+
run(process_id=proc_id, force_compute=force_compute)
|
|
1250
1347
|
|
|
1251
1348
|
# Restore settings
|
|
1252
1349
|
tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
|
|
@@ -446,7 +446,7 @@ def apply_collect_stats(
|
|
|
446
446
|
retried_flag = False
|
|
447
447
|
|
|
448
448
|
try:
|
|
449
|
-
|
|
449
|
+
tdml.execute_sql(f"COLLECT STATS ON {table_fqn}")
|
|
450
450
|
ok += 1
|
|
451
451
|
except Exception as e:
|
|
452
452
|
# First attempt failed; try generated statement(s)
|
|
@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
|
|
|
2
2
|
tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
|
|
3
3
|
tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
|
|
4
4
|
tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
|
|
5
|
-
tdfs4ds/__init__.py,sha256=
|
|
5
|
+
tdfs4ds/__init__.py,sha256=Smyg37GXuUsUkaV1RqG7fhMy6-h6bRkpq7bDG_WGYTs,60077
|
|
6
6
|
tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
|
|
7
7
|
tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
|
|
8
8
|
tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576
|
|
@@ -17,7 +17,7 @@ tdfs4ds/dataset/dataset.py,sha256=J_fgfsVdR9zSOXrUOqyotqsUD-GlQMGyuld6ueov45w,76
|
|
|
17
17
|
tdfs4ds/dataset/dataset_catalog.py,sha256=qxS2thDW2MvsRouSFaX1M0sX2J7IzBAYD8Yf22Tsd5k,16638
|
|
18
18
|
tdfs4ds/feature_store/__init__.py,sha256=a7NPCkpTx40UR5LRErwnskpABG2Vuib7F5wUjaUGCnI,209
|
|
19
19
|
tdfs4ds/feature_store/entity_management.py,sha256=9ltytv3yCTG84NZXBpb1Tlkf9pOxvrNb0MVidU4pwvE,10157
|
|
20
|
-
tdfs4ds/feature_store/feature_data_processing.py,sha256=
|
|
20
|
+
tdfs4ds/feature_store/feature_data_processing.py,sha256=gjwypiTfwTyGyrP20v35Vu2uGIrCY80OBBeMVBsdjuk,45020
|
|
21
21
|
tdfs4ds/feature_store/feature_query_retrieval.py,sha256=51c6ZNlLFiBIxNPinS8ot8bjWEIb1QV2eVg69yzVF80,35381
|
|
22
22
|
tdfs4ds/feature_store/feature_store_management.py,sha256=yXLbINYLA-lzd0t_6TzEe9a8Anlum4x8TRoxZU3FIr8,54276
|
|
23
23
|
tdfs4ds/process_store/__init__.py,sha256=npHR_xju5ecGmWfYHDyteLwiU3x-cL4HD3sFK_th7xY,229
|
|
@@ -32,7 +32,7 @@ tdfs4ds/utils/lineage.py,sha256=gy5M42qy5fvdWmlohAY3WPYoqAyp5VakeEmeT1YjrJQ,3783
|
|
|
32
32
|
tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
|
|
33
33
|
tdfs4ds/utils/time_management.py,sha256=asIWvK5K81NNwAGqC-9Tv4Timscxyv0vyuPFs01whu0,31461
|
|
34
34
|
tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
|
|
35
|
-
tdfs4ds-0.2.4.
|
|
36
|
-
tdfs4ds-0.2.4.
|
|
37
|
-
tdfs4ds-0.2.4.
|
|
38
|
-
tdfs4ds-0.2.4.
|
|
35
|
+
tdfs4ds-0.2.4.36.dist-info/METADATA,sha256=wKUqiK1ohnWfe1ZSrgLq_4OXnyEh4K9I5Jm5PfRkaHs,14326
|
|
36
|
+
tdfs4ds-0.2.4.36.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
|
|
37
|
+
tdfs4ds-0.2.4.36.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
|
|
38
|
+
tdfs4ds-0.2.4.36.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|