tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.4.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +586 -564
- tdfs4ds/feature_store/feature_data_processing.py +367 -299
- tdfs4ds/feature_store/feature_query_retrieval.py +105 -52
- tdfs4ds/feature_store/feature_store_management.py +226 -231
- tdfs4ds/process_store/process_followup.py +113 -2
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/process_store/process_store_catalog_management.py +2 -2
- tdfs4ds/utils/filter_management.py +521 -138
- tdfs4ds/utils/query_management.py +18 -40
- tdfs4ds/utils/time_management.py +547 -97
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.41.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.41.dist-info}/RECORD +15 -15
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.41.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.41.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
@@ -1,5 +1,7 @@
-__version__ = '0.2.4.26'
+__version__ = '0.2.4.41'
 import logging
+import json
+
 # Setup the logger
 logging.basicConfig(
     level=logging.INFO,
@@ -7,6 +9,15 @@ logging.basicConfig(
     datefmt='%Y-%m-%d %H:%M:%S'  # Set the date/time format
 )
 
+# Helper: central logging gate controlled by tdfs4ds.DISPLAY_LOGS
+def logger_safe(level, message, *args, **kwargs):
+    """
+    Wrapper around the global `logger` that only emits logs when
+    tdfs4ds.DISPLAY_LOGS is True. `level` is a string like "info", "error", etc.
+    """
+    if getattr(tdfs4ds, "DISPLAY_LOGS", True):
+        getattr(logger, level)(message, *args, **kwargs)
+
 logger = logging.getLogger(__name__)
 
 from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
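The new `logger_safe` helper gates every log call behind the module-level `DISPLAY_LOGS` flag. A minimal, self-contained sketch of the same pattern; the `SimpleNamespace` stand-in replaces the real `tdfs4ds` module and is hypothetical:

import logging
from types import SimpleNamespace

# Hypothetical stand-in for the tdfs4ds module globals.
tdfs4ds = SimpleNamespace(DISPLAY_LOGS=True)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")

def logger_safe(level, message, *args, **kwargs):
    # Same gate as the helper in the diff: emit only when DISPLAY_LOGS is truthy.
    if getattr(tdfs4ds, "DISPLAY_LOGS", True):
        getattr(logger, level)(message, *args, **kwargs)

logger_safe("info", "visible: %s", 1)   # emitted
tdfs4ds.DISPLAY_LOGS = False
logger_safe("info", "silenced: %s", 2)  # suppressed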
@@ -57,7 +68,7 @@ import tdfs4ds.datasets
 import time
 
 import inspect
-import tqdm
+from tqdm.auto import tqdm  # auto picks the right frontend (notebook/terminal)
 
 from tdfs4ds.feature_store.feature_data_processing import generate_on_clause
 
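The import switch above moves from the bare `tqdm` package to `tqdm.auto`, which resolves to the notebook widget under Jupyter and the plain terminal bar otherwise. A small sketch of the call pattern used later in this diff:

import time
from tqdm.auto import tqdm  # picks notebook or terminal frontend automatically

for _ in tqdm(range(3), desc="demo", unit="step", leave=False):
    time.sleep(0.1)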
@@ -70,92 +81,80 @@ PROCESS_TYPE = 'RUN PROCESS'
 try:
     SCHEMA = tdml.context.context._get_current_databasename()
     if SCHEMA is None:
-
-
+        logger.warning("No default database detected for feature store.")
+        logger.warning('Please set it explicitly: tdfs4ds.feature_store.schema = "<feature store database>"')
     else:
-
-
+        logger.info("Default database detected for feature store: %s", SCHEMA)
+        logger.info('tdfs4ds.feature_store.schema = "%s"', SCHEMA)
+
     if DATA_DOMAIN is None:
         DATA_DOMAIN = SCHEMA
-
-
+        logger.info("DATA_DOMAIN not set. Defaulting to SCHEMA: %s", DATA_DOMAIN)
+        logger.info('You can override it using: tdfs4ds.DATA_DOMAIN = "<your data domain>"')
 
 except Exception as e:
-
-
+    logger.error("Could not determine current database: %s", str(e).split('\n')[0])
+    logger.warning("Please specify the feature store database manually:")
+    logger.warning('tdfs4ds.feature_store.schema = "<feature store database>"')
 
 
 def setup(database, if_exists='fail'):
     """
-
-
-    This function sets the database schema for feature and process catalogs. If specified, it also handles
-    the replacement of existing catalog tables. It reports the status of these operations, including any
-    encountered exceptions.
-
-    Parameters:
-    database (str): The name of the database schema to be used.
-    if_exists (str, optional): Determines the behavior if catalog tables already exist in the database.
-        'fail' (default) - Do nothing if the tables exist.
-        'replace' - Drop the tables if they exist before creating new ones.
-
-    Steps performed:
-    1. Sets the schema to the provided database name.
-    2. If 'if_exists' is 'replace', attempts to drop 'FS_FEATURE_CATALOG' and 'FS_PROCESS_CATALOG' tables.
-    3. Creates new feature and process catalog tables and sets their names in the tdfs4ds module.
-    4. Prints the names of the newly created tables along with the database name.
-    5. Captures and prints the first line of any exceptions that occur during these operations.
-
-    Returns:
-    None
+    Initialize the feature store environment by creating catalog tables and views.
     """
 
     from tdfs4ds.feature_store.feature_store_management import feature_store_catalog_creation
     from tdfs4ds.process_store.process_store_catalog_management import process_store_catalog_creation
 
     tdfs4ds.SCHEMA = database
+    logger_safe("info", "Setting up feature store in database: %s", database)
+
     if if_exists == 'replace':
-
-
-
-
-
-
-
-            print(str(e).split('\n')[0])
-        try:
-            tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
-        except Exception as e:
-            print(str(e).split('\n')[0])
+        logger_safe("info", "Replacing existing catalog tables if they exist.")
+        for table in [tdfs4ds.FEATURE_CATALOG_NAME, tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME]:
+            try:
+                tdml.db_drop_table(table_name=table, schema_name=database)
+                logger_safe("info", "Dropped table %s.%s", database, table)
+            except Exception as e:
+                logger_safe("warning", "Could not drop table %s.%s: %s", database, table, str(e).split('\n')[0])
 
         DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
+
     try:
         tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
-
+        logger_safe("info", "Feature catalog table created: %s in database %s", tdfs4ds.FEATURE_CATALOG_NAME, database)
     except Exception as e:
-
+        logger_safe("error", "Feature catalog creation failed: %s", str(e).split('\n')[0])
 
     try:
-        tdfs4ds.PROCESS_CATALOG_NAME,
-
-
-
+        (tdfs4ds.PROCESS_CATALOG_NAME,
+         tdfs4ds.DATA_DISTRIBUTION_NAME,
+         tdfs4ds.FILTER_MANAGER_NAME) = process_store_catalog_creation()
+
+        logger_safe("info", "Process catalog table created: %s", tdfs4ds.PROCESS_CATALOG_NAME)
+        logger_safe("info", "Data distribution table created: %s", tdfs4ds.DATA_DISTRIBUTION_NAME)
+        logger_safe("info", "Filter manager table created: %s", tdfs4ds.FILTER_MANAGER_NAME)
     except Exception as e:
-
+        logger_safe("error", "Process catalog creation failed: %s", str(e).split('\n')[0])
 
     try:
         tdfs4ds.process_store.process_followup.follow_up_table_creation()
+        logger_safe("info", "Follow-up table created successfully.")
     except Exception as e:
-
+        logger_safe("error", "Follow-up table creation failed: %s", str(e).split('\n')[0])
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
+
     dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
     if not dataset_catalog._exists():
         dataset_catalog.create_catalog()
+        logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
 
+    logger_safe("info", "Setup complete.")
     return
 
+
 def connect(
     database = tdfs4ds.SCHEMA,
     feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
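For context, a hedged usage sketch of the reworked `setup()`; the host, credentials, and database name are placeholders, and a live teradataml connection is assumed:

import teradataml as tdml
import tdfs4ds

tdml.create_context(host="...", username="...", password="...")

# 'replace' drops and recreates the feature, process, and distribution catalogs.
tdfs4ds.setup(database="MY_FEATURE_DB", if_exists="replace")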
@@ -166,15 +165,15 @@ def connect(
     feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
     process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
     dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
-    create_if_missing = False
+    create_if_missing = False
 ):
-    if database is not None:
-        tdfs4ds.SCHEMA = database
-    else:
+    if database is None:
         raise ValueError("database parameter is None.")
+    tdfs4ds.SCHEMA = database
+    logger_safe("info", "Connecting to feature store in database: %s", database)
 
     tables = [x.lower() for x in list(tdml.db_list_tables(schema_name=tdfs4ds.SCHEMA, object_type='table').TableName.values)]
-
+
     feature_exists = feature_catalog_name.lower() in tables
     process_exists = process_catalog_name.lower() in tables
     distrib_exists = data_distribution_name.lower() in tables
@@ -183,20 +182,20 @@ def connect(
 
     if not (feature_exists and process_exists and distrib_exists and filter_manager_exists):
         if not create_if_missing:
-
-
-
-
-
-
-
-
-
-
-
-
-            # Follow-up table handling
+            logger_safe("warning", "Feature store components missing and create_if_missing=False")
+            return False
+        logger_safe("info", "Missing components detected; creating missing parts...")
+        if not feature_exists:
+            tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
+        if not process_exists:
+            tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
+        if not distrib_exists:
+            tdfs4ds.data_distribution.data_distribution_catalog_creation()
+        if not filter_manager_exists:
+            tdfs4ds.filter_manager.filter_manager_catalog_creation()
+
     if not followup_name_exists:
+        logger_safe("info", "Creating follow-up table: %s", followup_name)
         tdfs4ds.process_store.process_followup.follow_up_table_creation()
         tdfs4ds.FOLLOW_UP_NAME = followup_name
 
@@ -210,30 +209,31 @@ def connect(
 
     process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
     if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
-
-        print('upgrade to the latest DDL')
+        logger_safe("warning", "ENTITY_NULL_SUBSTITUTE column missing. Upgrading catalog.")
         tdfs4ds.process_store.process_store_catalog_management.upgrade_process_catalog()
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
 
-    # Dataset
+    # Dataset Catalog
     tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
-    dataset_catalog = DatasetCatalog(schema_name=database, name=
+    dataset_catalog = DatasetCatalog(schema_name=database, name=dataset_catalog_name)
     if not dataset_catalog._exists():
         dataset_catalog.create_catalog()
+        logger_safe("info", "Dataset catalog created: %s", dataset_catalog_name)
 
-    #
+    # Detect temporal distribution
    def is_data_distribution_temporal():
         return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(
             view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
             schema_name=tdfs4ds.SCHEMA,
             object_type='table'
         )
-
+
     tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
-
-    return True
+    logger_safe("info", "Connected to feature store successfully.")
+    return True
+
 
 
 
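A corresponding sketch for the rewritten `connect()`; with `create_if_missing=True` the missing catalog objects are created instead of returning False (the database name is a placeholder):

import tdfs4ds

ok = tdfs4ds.connect(database="MY_FEATURE_DB", create_if_missing=True)
if not ok:
    raise RuntimeError("feature store components are missing")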
@@ -287,50 +287,22 @@ def get_dataset_entity(dataset_id = None):
 def get_dataset_features(dataset_id = None):
     return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
 
-def run(process_id, return_dataset
+def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None):
     """
     Executes a specific process from the feature store identified by the process ID.
-
-
-    Parameters:
-    - process_id (str): The unique identifier of the process to run.
-    - return_dataset (bool, optional): A flag indicating whether to return the dataset created during the process.
-      Default is False.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is False.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is None.
-
-    Returns:
-    DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
-
-    This function performs the following steps:
-    1. Determines the process type and initializes necessary variables.
-    2. Constructs and executes a SQL query to retrieve process details by process ID.
-    3. Fetches the filter manager, process type, primary index, partitioning, and data domain from the query result.
-    4. Handles different process types, such as 'denormalized view' and 'tdstone2 view'.
-    5. For 'denormalized view' process type, extracts necessary details, fetches data, and uploads features to the feature store.
-    6. Optionally returns the dataset created during the process if return_dataset is True.
-
-    Note:
-    - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
-      data retrieval to feature uploading.
-    - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
-      a Teradata database and the appropriate schema for feature storage.
+    Uses global `logger` for diagnostics (gated by tdfs4ds.DISPLAY_LOGS).
     """
 
     if tdfs4ds.PROCESS_TYPE is None:
         PROCESS_TYPE_ = 'RUN PROCESS'
-        tdfs4ds.RUN_ID
+        tdfs4ds.RUN_ID = str(uuid.uuid4())
     else:
         PROCESS_TYPE_ = tdfs4ds.PROCESS_TYPE
 
-    if tdfs4ds
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "def run | tdfs4ds.FEATURE_STORE_TIME=%s", tdfs4ds.FEATURE_STORE_TIME)
 
-    if tdfs4ds.FEATURE_STORE_TIME
+    if tdfs4ds.FEATURE_STORE_TIME is None:
         validtime_statement = 'CURRENT VALIDTIME'
     else:
         validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
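The removed docstring above describes the `force_varchar_length` rule: a VARCHAR feature of original length L is cast to VARCHAR(k x force_varchar_length), where k is the smallest integer with L <= k x force_varchar_length. A worked sketch of that rounding; the helper name is hypothetical:

import math

def bucketed_varchar_length(original_length: int, bucket: int) -> int:
    # k = ceil(L / bucket) is the smallest k with L <= k * bucket.
    k = math.ceil(original_length / bucket)
    return k * bucket

assert bucketed_varchar_length(300, 1024) == 1024   # k = 1
assert bucketed_varchar_length(1500, 1024) == 2048  # k = 2
assert bucketed_varchar_length(1024, 1024) == 1024  # exact multiple stays put

Bucketing lengths this way keeps features with slightly different VARCHAR sizes in the same feature table instead of spawning one table per length.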
@@ -342,148 +314,110 @@ def run(process_id, return_dataset = False, force_compute = False, force_varchar
     WHERE A.PROCESS_ID = '{process_id}'
     """
 
+    logger_safe(
+        "info",
+        "Starting run | run_id=%s | process_type=%s | process_id=%s | return_dataset=%s | force_compute=%s | force_varchar_length=%s",
+        tdfs4ds.RUN_ID, PROCESS_TYPE_, process_id, return_dataset, force_compute, force_varchar_length
+    )
+
     # Executing the query and converting the result to Pandas DataFrame
     df = tdml.DataFrame.from_query(query).to_pandas()
 
-    # Check if exactly one record is returned, else
+    # Check if exactly one record is returned, else log an error and return
     if df.shape[0] != 1:
-
-
-
+        logger_safe(
+            "error",
+            "Process catalog lookup returned %s record(s); expected 1. Check table %s.%s. Query: %s",
+            df.shape[0], tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW, query.strip()
+        )
         return
 
-
     # Fetching the filter manager
     filter_schema_name = df['FILTER_DATABASE_NAME'].values[0]
     if filter_schema_name is None:
         filtermanager = None
     else:
         filter_view_name = df['FILTER_VIEW_NAME'].values[0]
-        filter_table_name = df['FILTER_TABLE_NAME'].values[0]
+        filter_table_name = df['FILTER_TABLE_NAME'].values[0]  # kept for parity; not used directly here
         filtermanager = FilterManager(table_name=filter_view_name, schema_name=filter_schema_name)
 
-    # Fetching
-    process_type
-
-    # Fetching the primary index from the query result
-    primary_index = df['FOR_PRIMARY_INDEX'].values[0]
+    # Fetching process metadata
+    process_type = df['PROCESS_TYPE'].values[0]
+    primary_index = df['FOR_PRIMARY_INDEX'].values[0]
     if primary_index is not None:
-        primary_index = primary_index.split(',')
-
-
-
-
-
-
+        primary_index = [x.strip() for x in primary_index.split(',') if x.strip()]
+    partitioning = df['FOR_DATA_PARTITIONING'].values[0]
+    DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
+
+    logger_safe(
+        "info",
+        "Process metadata | process_id=%s | process_type=%s | primary_index=%s | partitioning=%s | data_domain=%s | validtime=%s",
+        process_id, process_type, primary_index, partitioning, DATA_DOMAIN, validtime_statement
+    )
 
     # Handling 'denormalized view' process type
     if process_type == 'denormalized view':
-
-
-        entity_id = df['ENTITY_ID'].values[0].split(',')
+        view_name = df['VIEW_NAME'].values[0]
+        entity_id = [x.strip() for x in df['ENTITY_ID'].values[0].split(',') if x.strip()]
         entity_null_substitute = eval(df['ENTITY_NULL_SUBSTITUTE'].values[0])
-        feature_names
+        feature_names = [x.strip() for x in df['FEATURE_NAMES'].values[0].split(',') if x.strip()]
 
-        # Fetching data and uploading features to the feature store
         df_data = tdml.DataFrame(tdml.in_schema(view_name.split('.')[0], view_name.split('.')[1]))
 
-        if tdfs4ds
-
-
-
-
-
-
+        if getattr(tdfs4ds, "DEBUG_MODE", False):
+            logger_safe("debug", "run | entity_id=%s", entity_id)
+            logger_safe("debug", "run | entity_null_substitute=%s", entity_null_substitute)
+            logger_safe("debug", "run | feature_names=%s", feature_names)
+            logger_safe("debug", "run | process_id=%s", process_id)
+            logger_safe("debug", "run | primary_index=%s", primary_index)
+            logger_safe("debug", "run | partitioning=%s", partitioning)
+
         dataset = _upload_features(
             df_data,
             entity_id,
             feature_names,
-            feature_versions
-            primary_index
-            partitioning
-            filtermanager
-            entity_null_substitute
-            process_id
-            force_compute=
-            force_varchar_length
+            feature_versions=process_id,
+            primary_index=primary_index,
+            partitioning=partitioning,
+            filtermanager=filtermanager,
+            entity_null_substitute=entity_null_substitute,
+            process_id=process_id,
+            force_compute=force_compute,
+            force_varchar_length=force_varchar_length
         )
 
     # Handling 'tdstone2 view' process type
     elif process_type == 'tdstone2 view':
-
-
+        logger_safe("warning", "Process type 'tdstone2 view' not implemented yet for process_id=%s", process_id)
+        dataset = None
 
+    else:
+        logger_safe("error", "Unknown process type '%s' for process_id=%s", process_type, process_id)
+        dataset = None
 
     if return_dataset:
+        logger_safe("info", "Run finished with dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
         return dataset
     else:
+        logger_safe("info", "Run finished without dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
         return
 
-def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
-    """
-    Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
-    process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
-    for further use or inspection.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
-    - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
-      Default is an empty dictionary.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is True.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is 1024.
-    Returns:
-    DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
-      or further processing.
-
-    The process involves several steps, including entity ID type conversion if necessary, feature name normalization,
-    process registration in the feature store, and the execution of SQL queries to insert the data. The function concludes
-    by returning a dataset derived from the uploaded data, offering immediate access to the newly stored information.
-
-    Example:
-    >>> df = tdml.DataFrame(...)
-    >>> entity_id = ['customer_id']
-    >>> feature_names = ['age', 'income']
-    >>> dataset = upload_features(df, entity_id, feature_names)
-    >>> # Another example with list-based entity_id, custom primary_index, and partitioning
-    >>> tddf = tdml.DataFrame(...)  # Assuming tddf is predefined with appropriate columns
-    >>> entity_id = ['tx_type', 'txn_id']
-    >>> primary_index = ['txn_id']
-    >>> partitioning = '''
-    ... PARTITION BY CASE_N (
-    ...     tx_type LIKE 'DEBIT',
-    ...     tx_type LIKE 'PAYMENT',
-    ...     tx_type LIKE 'CASH_OUT',
-    ...     tx_type LIKE 'CASH_IN',
-    ...     tx_type LIKE 'TRANSFER',
-    ...     NO CASE,
-    ...     UNKNOWN)'''
-    >>> features = [x for x in tddf.columns if x not in entity_id]
-    >>> dataset = upload_features(
-    ...     df = tddf,
-    ...     entity_id = entity_id,
-    ...     feature_names = features,
-    ...     metadata = {'project': 'test'},
-    ...     primary_index = primary_index,
-    ...     partitioning = partitioning
-    ... )
+def upload_features(
+        df,
+        entity_id,
+        feature_names,
+        metadata={},
+        primary_index=None,
+        partitioning='',
+        filtermanager=None,
+        entity_null_substitute={},
+        force_compute=True,
+        force_varchar_length=1024
+):
+    """
+    Uploads feature data from a DataFrame to the feature store for a specified entity.
+    All diagnostics go through `logger_safe()` which respects `tdfs4ds.DISPLAY_LOGS`.
     """
 
     from tdfs4ds.utils.info import get_column_types
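The removed docstring's usage example still illustrates the call shape of the rewritten `upload_features`; the source table name below is a placeholder, while the column names and partitioning clause come from that example:

import teradataml as tdml
import tdfs4ds

tddf = tdml.DataFrame("my_transactions")  # placeholder source table
entity_id = ['tx_type', 'txn_id']
features = [x for x in tddf.columns if x not in entity_id]

partitioning = '''
PARTITION BY CASE_N (
    tx_type LIKE 'DEBIT',
    tx_type LIKE 'PAYMENT',
    tx_type LIKE 'CASH_OUT',
    tx_type LIKE 'CASH_IN',
    tx_type LIKE 'TRANSFER',
    NO CASE,
    UNKNOWN)'''

dataset = tdfs4ds.upload_features(
    df=tddf,
    entity_id=entity_id,
    feature_names=features,
    metadata={'project': 'test'},
    primary_index=['txn_id'],
    partitioning=partitioning
)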
@@ -491,45 +425,42 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
     from tdfs4ds.process_store.process_registration_management import register_process_view
 
     # Convert entity_id to a dictionary if it's not already one
-    if
+    if isinstance(entity_id, list):
         entity_id.sort()
         entity_id = get_column_types(df, entity_id)
-
-
-    elif
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    elif isinstance(entity_id, str):
         entity_id = [entity_id]
         entity_id = get_column_types(df, entity_id)
-
-
-
-    if
-
-
-
-        feature_names = feature_names.split(',')
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    # Normalize feature_names
+    if not isinstance(feature_names, list):
+        logger_safe("debug", "feature_names is not a list: %s", feature_names)
+        if isinstance(feature_names, str) and ',' in feature_names:
+            feature_names = [x.strip() for x in feature_names.split(',')]
         else:
             feature_names = [feature_names]
-
-
-
-
-    if primary_index is not None and
-
-
-
-        primary_index = primary_index.split(',')
+        logger_safe("debug", "feature_names converted to list: %s", feature_names)
+        logger_safe("debug", "Check the conversion is as expected.")
+
+    # Normalize primary_index
+    if primary_index is not None and not isinstance(primary_index, list):
+        logger_safe("debug", "primary_index is not a list: %s", primary_index)
+        if isinstance(primary_index, str) and ',' in primary_index:
+            primary_index = [x.strip() for x in primary_index.split(',')]
         else:
             primary_index = [primary_index]
-
-
-        print('check it is a expected.')
+        logger_safe("debug", "primary_index converted to list: %s", primary_index)
+        logger_safe("debug", "Check the conversion is as expected.")
 
+    # Partitioning
     partitioning = tdfs4ds.utils.info.generate_partitioning_clause(partitioning=partitioning)
 
-
-    print("filtermanager", filtermanager)
+    logger_safe("debug", "filtermanager: %s", filtermanager)
 
-    # Register
+    # Register process -> get SQL(s) + process_id
     query_insert, process_id, query_insert_dist, query_insert_filtermanager = register_process_view.__wrapped__(
         view_name = df,
         entity_id = entity_id,
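The normalization above accepts comma-separated strings as well as lists for `feature_names` and `primary_index`. The rule in isolation, as a hypothetical helper combining the checks:

def to_list(value):
    # Split comma-separated strings, strip blanks, wrap a plain scalar in a list.
    if isinstance(value, list):
        return value
    if isinstance(value, str) and ',' in value:
        return [x.strip() for x in value.split(',') if x.strip()]
    return [value]

assert to_list("age, income") == ["age", "income"]
assert to_list("age") == ["age"]
assert to_list(["age"]) == ["age"]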
@@ -542,104 +473,171 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
         entity_null_substitute = entity_null_substitute
     )
 
-
-    execute_query(query_insert)
-    execute_query(query_insert_dist)
-    if tdfs4ds.DEBUG_MODE:
-        print("query_insert_filtermanager",query_insert_filtermanager)
-    if query_insert_filtermanager is not None:
-        execute_query(query_insert_filtermanager)
+    logger_safe("info", "Registered process (process_id=%s) for upload_features", process_id)
 
-    #
-
-
-
-
-
-
+    # Execute queries
+    try:
+        execute_query(query_insert)
+        logger_safe("info", "Executed main insert query for process_id=%s", process_id)
+    except Exception as e:
+        logger_safe("exception", "Main insert query failed for process_id=%s", process_id)
+        raise
 
-
+    try:
+        execute_query(query_insert_dist)
+        logger_safe("info", "Executed distribution insert query for process_id=%s", process_id)
+    except Exception as e:
+        logger_safe("exception", "Distribution insert query failed for process_id=%s", process_id)
+        raise
 
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        # Avoid dumping entire SQL in normal logs; keep it debug-only.
+        logger_safe("debug", "query_insert_filtermanager: %s", query_insert_filtermanager)
 
+    if query_insert_filtermanager is not None:
+        try:
+            execute_query(query_insert_filtermanager)
+            logger_safe("info", "Executed filtermanager insert query for process_id=%s", process_id)
         except Exception as e:
-
-                run_id = tdfs4ds.RUN_ID,
-                process_type = tdfs4ds.PROCESS_TYPE,
-                process_id = process_id,
-                status = 'FAILED,' + str(e).split('\n')[0]
-            )
+            logger_safe("exception", "Filtermanager insert query failed for process_id=%s", process_id)
             raise
 
+    # Run the registered process (with/without dataset)
+    PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
+    tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
+    if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
+        tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
+    tdfs4ds.RUN_ID = str(uuid.uuid4())
 
-
-
+    logger_safe(
+        "info",
+        "Starting run (run_id=%s, process_type=%s, process_id=%s, force_compute=%s, force_varchar_length=%s)",
+        tdfs4ds.RUN_ID, tdfs4ds.PROCESS_TYPE, process_id, force_compute, force_varchar_length
+    )
+
+    try:
+        if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
+            dataset = run(
+                process_id=process_id,
+                return_dataset=True,
+                force_compute=force_compute,
+                force_varchar_length=force_varchar_length
+            )
+            logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
+            return dataset
+        else:
+            run(
+                process_id=process_id,
+                return_dataset=False,
+                force_compute=force_compute,
+                force_varchar_length=force_varchar_length
+            )
+            logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
+            return
 
+    except Exception as e:
+        # Keep your existing follow-up close behavior, but ensure the error is logged.
         try:
-            run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
-        except Exception as e:
             tdfs4ds.process_store.process_followup.followup_close(
                 run_id = tdfs4ds.RUN_ID,
                 process_type = tdfs4ds.PROCESS_TYPE,
                 process_id = process_id,
                 status = 'FAILED,' + str(e).split('\n')[0]
             )
-
-
+        finally:
+            logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
+                        tdfs4ds.RUN_ID, process_id, str(e).split('\n')[0]
+            )
+            raise
+    finally:
+        # Restore previous process type just in case the caller relies on it.
+        tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
 
-    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
 
-def _upload_features(df, entity_id, feature_names,
-                     feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False,force_varchar_length = None):
-    """
-    Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
-    feature registration, preparation for ingestion, and storage in the designated feature tables.
+def _upload_features(
+        df, entity_id, feature_names,
+        feature_versions=FEATURE_VERSION_DEFAULT,
+        primary_index=None, partitioning='',
+        filtermanager=None, entity_null_substitute={},
+        process_id=None, force_compute=False,
+        force_varchar_length=None
+):
+    """
+    Uploads a set of features into the Feature Store for a given entity.
 
+    This function registers an entity and its associated features in the feature catalog
+    if they are not already defined, prepares the data for ingestion, and stores it in the
+    feature store. It also supports incremental feature computation and conditional execution
+    depending on prior runs.
 
-
-
-
-
-
-
-
-
-
-
-
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        Input dataframe containing entity keys and feature columns to upload.
+    entity_id : str, list, or dict
+        Identifier(s) for the entity. Can be:
+        - A string (single entity key)
+        - A list of key column names
+        - A dict mapping column names to data types
+        If not a dict, entity metadata is inferred automatically.
+    feature_names : list of str
+        List of feature column names to upload from `df`.
+    feature_versions : dict or int, optional
+        Feature version(s). If a single integer is provided, it is applied to all features.
+        If a dict is provided, it maps each feature name to its version.
+        Default is FEATURE_VERSION_DEFAULT.
+    primary_index : str or list, optional
+        Primary index to use when storing features in Teradata.
+    partitioning : str, optional
+        Partitioning clause for feature store tables. Default is ''.
+    filtermanager : FilterManager, optional
+        If provided, features are built iteratively per filter step.
+    entity_null_substitute : dict, optional
+        Replacement values for nulls in entity keys.
+        Example: {'customer_id': -1}
+    process_id : str, optional
+        Identifier for the process execution, used for follow-up logging.
+    force_compute : bool, optional
+        If True, forces recomputation even if the same process_id and timestamp were
+        already computed earlier. If False, the computation is skipped when existing
+        results are detected. Default is False.
+    force_varchar_length : int, optional
+        If provided, all VARCHAR feature columns are resized to this length
+        before ingestion.
+
+    Returns
+    -------
+    pandas.DataFrame or None
+        If BUILD_DATASET_AT_UPLOAD is enabled, returns a dataset built from the
+        ingested features for validation. Otherwise, returns None.
+
+    Notes
+    -----
+    - Uses global tdfs4ds context such as FEATURE_STORE_TIME, RUN_ID, and PROCESS_TYPE.
+    - Logs ingestion status in process follow-up tables.
+    - Skips ingestion when existing completed results are found unless
+      `force_compute=True`.
+    - Applies Teradata-optimized storage and statistics collection.
+
+    Raises
+    ------
+    ValueError
+        If unsupported data types are found (CLOB/BLOB/JSON).
+    Exception
+        For ingestion failure or storage errors.
 
-    - partitioning (str, optional): A string indicating the partitioning strategy for the feature store tables, which can
-      enhance query performance based on the access patterns.
-    - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
-    - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
-      Default is an empty dictionary.
-    - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is False.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is None.
-
-
-
-
-
-
-
+    Example
+    -------
+    >>> _upload_features(
+    ...     df=dataframe,
+    ...     entity_id="customer_id",
+    ...     feature_names=["age", "credit_score"],
+    ...     process_id="customer_features_v1",
+    ...     force_compute=False
+    ... )
     """
-
+
     from tdfs4ds.feature_store.entity_management import register_entity
     from tdfs4ds.feature_store.feature_store_management import Gettdtypes
     from tdfs4ds.feature_store.feature_store_management import register_features
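The rewritten control flow above temporarily overrides the global `PROCESS_TYPE` and restores it in a `finally` block so callers see their original value even on failure. The pattern reduced to its core; all names below are stand-ins:

from types import SimpleNamespace

ctx = SimpleNamespace(PROCESS_TYPE='RUN PROCESS')  # stand-in for tdfs4ds globals

def with_temporary_process_type(new_type, action):
    previous = ctx.PROCESS_TYPE
    ctx.PROCESS_TYPE = new_type
    try:
        return action()
    finally:
        # Restored on success *and* on exception, as in the diff.
        ctx.PROCESS_TYPE = previous

result = with_temporary_process_type('UPLOAD_FEATURES', lambda: ctx.PROCESS_TYPE)
assert result == 'UPLOAD_FEATURES'
assert ctx.PROCESS_TYPE == 'RUN PROCESS'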
@@ -647,194 +645,180 @@ def _upload_features(df, entity_id, feature_names,
     from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
     from tdfs4ds.utils.info import get_column_types, update_varchar_length
 
-    # Convert entity_id to a dictionary if
-    if
+    # Convert entity_id to a dictionary if not already
+    if isinstance(entity_id, list):
         entity_id.sort()
         entity_id = get_column_types(df, entity_id)
-
-
-
-        entity_id
-
-
-
-
-    #register_entity(entity_id, primary_index=primary_index, partitioning=partitioning)
-
-    # If feature_versions is a list, create a dictionary mapping each feature name to its corresponding version.
-    # If feature_versions is a string, create a dictionary mapping each feature name to this string.
-    if type(feature_versions) == list:
-        selected_features = {k: v for k, v in zip(feature_names, feature_versions)}
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+    elif isinstance(entity_id, str):
+        entity_id = get_column_types(df, [entity_id])
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    # Map feature versions
+    if isinstance(feature_versions, list):
+        selected_features = dict(zip(feature_names, feature_versions))
     else:
         selected_features = {k: feature_versions for k in feature_names}
 
-    # Get
-    feature_names_types = Gettdtypes(
-        df,
-        features_columns=feature_names,
-        entity_id=entity_id
-    )
+    # Get Teradata types for features
+    feature_names_types = Gettdtypes(df, features_columns=feature_names, entity_id=entity_id)
 
     if force_varchar_length is not None:
-
-        feature_names_types = update_varchar_length(
+        logger_safe("debug", "Updating VARCHAR lengths with force_varchar_length=%s", force_varchar_length)
+        feature_names_types = update_varchar_length(
+            feature_names_types,
+            new_varchar_length=force_varchar_length
+        )
 
     def validate_feature_types(feature_names_types):
-
-
-
-
-
-        feature_names_types (dict): A dictionary where keys are feature names and values are their data types.
-
-        Raises:
-        ValueError: If any feature type contains 'clob', 'blob', or 'json'.
-        """
-        invalid_types = {key: value['type'] for key, value in feature_names_types.items()
-                         if any(term in value['type'].lower() for term in ['clob', 'blob', 'json'])}
-
-        if invalid_types:
+        invalid = {
+            k: v['type'] for k, v in feature_names_types.items()
+            if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
+        }
+        if invalid:
             raise ValueError(
-                f"
-                "
+                f"Unsupported data types found: {invalid}. "
+                "CLOB/BLOB/JSON are not supported."
             )
-
-    validate_feature_types(feature_names_types)
-
+
+    validate_feature_types(feature_names_types)
+
+    logger_safe("info", "Registering entity %s in feature store", entity_id)
     register_entity(entity_id, feature_names_types, primary_index=primary_index, partitioning=partitioning)
 
-    if tdfs4ds
-
-
-
-
-
-
-
-
-
-
-
-
-        primary_index,
-        partitioning
-    )
-
-    if tdfs4ds.DEBUG_MODE:
-        print("---------_upload_features")
-        print("filtermanager : ", filtermanager)
-        print("feature names : ", feature_names)
-        print("selected features : ", selected_features)
-
-    if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe(
+            "debug",
+            "_upload_features entity_id=%s null_substitute=%s features=%s primary_index=%s partitioning=%s",
+            entity_id, entity_null_substitute, feature_names, primary_index, partitioning
+        )
+        logger_safe("debug", "selected_features=%s df.columns=%s", selected_features, df.columns)
+
+    register_features(entity_id, feature_names_types, primary_index, partitioning)
+    logger_safe("info", "Features registered in catalog: %s", feature_names)
+
+    follow_up = None
+    if process_id and tdfs4ds.FEATURE_STORE_TIME:
         follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
-        follow_up = follow_up[
-
-
-
-
-
-        do_compute = False
+        follow_up = follow_up[
+            (follow_up.STATUS == 'COMPLETED') &
+            (follow_up.VALIDTIME_DATE.isna() == False) &
+            (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) &
+            (follow_up.PROCESS_ID == process_id)
+        ]
 
-
+    if filtermanager is None:
+        do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
+        if not do_compute and not force_compute:
+            logger_safe(
+                "info",
+                "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
+                process_id, tdfs4ds.FEATURE_STORE_TIME
+            )
         if do_compute or force_compute:
-
+            logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
             tdfs4ds.process_store.process_followup.followup_open(
-                run_id
-                process_type
-                process_id
+                run_id=tdfs4ds.RUN_ID,
+                process_type=tdfs4ds.PROCESS_TYPE,
+                process_id=process_id
            )
-
            try:
-                prepared_features,
-                    df,
-                    entity_id,
-                    feature_names,
+                prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
+                    df, entity_id, feature_names,
                     feature_versions=selected_features,
                     primary_index=primary_index,
                     entity_null_substitute=entity_null_substitute,
                     partitioning=partitioning
                 )
-
-
-
-                    volatile_table_name,
-                    entity_null_substitute=entity_null_substitute,
-                    primary_index=primary_index,
-                    partitioning=partitioning,
-                    features_infos = features_infos
-                )
-
-                # Collect statistics
-                apply_collect_stats(
-                    entity_id,
-                    primary_index = primary_index,
-                    partitioning = partitioning,
-                    feature_infos = features_infos
-                )
+                store_feature(entity_id, volatile_table, entity_null_substitute,
+                              primary_index, partitioning, features_infos)
+                apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
                 tdfs4ds.process_store.process_followup.followup_close(
-                    run_id
-                    process_type
-                    process_id
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id
                 )
+                logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
 
             except Exception as e:
+                logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
                 tdfs4ds.process_store.process_followup.followup_close(
-                    run_id
-                    process_type
-                    process_id
-                    status
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id,
+                    status='FAILED,' + str(e).split('\n')[0]
                 )
                 raise
+
     else:
-        # get the total number of filter condition in the filter manager
-        nb_filters = filtermanager.nb_filters
 
-
+        logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
         something_computed = False
+        pbar = tqdm(
+            range(filtermanager.nb_filters),
+            total=filtermanager.nb_filters,
+            desc="Applying filters",
+            unit="filter",
+            leave=False
+        )
 
-        for i in
+        for i in pbar:
+            filter_id = i + 1
+            filtermanager.update(filter_id)
 
-
-
+            try:
+                pbar.set_description(f"Applying filter {filter_id}/{filtermanager.nb_filters}")
 
-
-
-
+                # Convert datetime columns to string
+                df_bar = filtermanager.display().to_pandas().astype(object)  # avoid conversion issues
+                for col in df_bar.select_dtypes(include=["datetime", "datetimetz"]).columns:
+                    df_bar[col] = df_bar[col].dt.strftime("%Y-%m-%d %H:%M:%S")
 
-            #
-
-            follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
-                follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
+                # Convert to JSON object (dict)
+                bar_info = df_bar.iloc[0].to_dict()
 
-
-
+                # ---- ADD THIS: handle python date objects ----
+                from datetime import date, datetime
+                for key, value in bar_info.items():
+                    if isinstance(value, (date, datetime)):  # convert date/datetime to string
+                        bar_info[key] = value.strftime("%Y-%m-%d %H:%M:%S")
+                # ----------------------------------------------
 
-
-
-
-
-
-
-
-
-                FROM {filtermanager.schema_name}.{filtermanager.view_name}
-                """
-                ),
-                on = 'APPLIED_FILTER',
-                how = 'inner',
-                lprefix = 'l',
-                rprefix = 'r'
-            )
-            # if already computed and completed, then do_compute is set to False
-            if follow_up_.shape[0] > 0:
-                do_compute = False
+                bar_info = str(bar_info)
+                if len(bar_info) > 120:
+                    bar_info = bar_info[:117] + "..."
+                pbar.set_postfix_str(bar_info)
+
+            except Exception:
+                # postfix is optional; ignore errors from display() here
+                pass
 
-
-
+            logger_safe("debug", "Applying filter %s/%s:\n%s",
+                        i + 1, filtermanager.nb_filters, filtermanager.display())
 
+            do_compute = True
+            if process_id and tdfs4ds.FEATURE_STORE_TIME:
+                # see if already computed
+                follow_up = tdfs4ds.process_store.process_followup.follow_up_report(process_id=process_id, filtermanager=filtermanager)
+                follow_up = follow_up[
+                    (follow_up.STATUS == 'COMPLETED') &
+                    (follow_up.VALIDTIME_DATE.isna() == False) &
+                    (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME)
+                ]
+
+                if follow_up.shape[0] > 0:
+                    do_compute = False
+
+            if not do_compute and not force_compute:
+                logger_safe(
+                    "info",
+                    "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
+                    process_id, tdfs4ds.FEATURE_STORE_TIME
+                )
+                pbar.colour = "green"
             if do_compute or force_compute:
+                pbar.colour = "blue"
                 tdfs4ds.process_store.process_followup.followup_open(
                     run_id = tdfs4ds.RUN_ID,
                     process_type = tdfs4ds.PROCESS_TYPE,
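The tightened `validate_feature_types` above rejects LOB-style columns up front. Standalone, the same check behaves like this (the type strings are illustrative):

def validate_feature_types(feature_names_types):
    # Reject any feature whose Teradata type mentions CLOB, BLOB, or JSON.
    invalid = {
        k: v['type'] for k, v in feature_names_types.items()
        if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
    }
    if invalid:
        raise ValueError(
            f"Unsupported data types found: {invalid}. "
            "CLOB/BLOB/JSON are not supported."
        )

validate_feature_types({'age': {'type': 'BIGINT'}})   # passes silently
try:
    validate_feature_types({'doc': {'type': 'CLOB'}})
except ValueError as e:
    print(e)  # Unsupported data types found: {'doc': 'CLOB'}. ...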
@@ -842,83 +826,58 @@ def _upload_features(df, entity_id, feature_names,
                     filtermanager = filtermanager
                 )
                 try:
-
-
-                        df,
-                        entity_id,
-                        feature_names,
+                    prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
+                        df, entity_id, feature_names,
                         feature_versions = selected_features,
                         primary_index = primary_index,
                         entity_null_substitute = entity_null_substitute,
                         partitioning = partitioning
                     )
 
-
-
-
-                        volatile_table_name,
-                        entity_null_substitute=entity_null_substitute,
-                        primary_index = primary_index,
-                        partitioning = partitioning,
-                        features_infos=features_infos
-
-                    )
-
-                    # indicate that something has been processed:
+                    store_feature(entity_id, volatile_table, entity_null_substitute,
+                                  primary_index, partitioning, features_infos)
+
                     something_computed = True
 
                     tdfs4ds.process_store.process_followup.followup_close(
-                        run_id=tdfs4ds.RUN_ID,
-                        process_type=tdfs4ds.PROCESS_TYPE,
-                        process_id=process_id,
+                        run_id = tdfs4ds.RUN_ID,
+                        process_type = tdfs4ds.PROCESS_TYPE,
+                        process_id = process_id,
                         filtermanager = filtermanager
                     )
 
                 except Exception as e:
-
+                    logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
                     tdfs4ds.process_store.process_followup.followup_close(
-                        run_id=tdfs4ds.RUN_ID,
-                        process_type=tdfs4ds.PROCESS_TYPE,
-                        process_id=process_id,
-                        status='FAILED,' + str(e).split('\n')[0],
-                        filtermanager=filtermanager
+                        run_id = tdfs4ds.RUN_ID,
+                        process_type = tdfs4ds.PROCESS_TYPE,
+                        process_id = process_id,
+                        status = 'FAILED,' + str(e).split('\n')[0],
+                        filtermanager = filtermanager
                    )
                     raise
-            # Clean up by dropping the temporary volatile table.
-            # tdml.execute_sql(f'DROP TABLE {volatile_table_name}')
 
-        # Collect statistics only if something has been computed
         if something_computed:
-            apply_collect_stats(
-                entity_id,
-                primary_index = primary_index,
-                partitioning = partitioning,
-                feature_infos = features_infos
-            )
+            apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
-    # Build a dataset view in the feature store.
     if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
-
+        logger_safe("info", "Building dataset for validation...")
         try:
-
-                entity_id,
-                selected_features,
+            return build_dataset(
+                entity_id, selected_features,
                 view_name=None,
-                entity_null_substitute
+                entity_null_substitute=entity_null_substitute
             )
         except Exception as e:
-
-
-            print('entity :', entity_id)
-            print('selected features :', selected_features)
-
-            # Return the dataset view.
-            return dataset
+            logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+            logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
     else:
-
+        logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False)")
     return
 
 
+
+
 def build_dataset(entity_id, selected_features, view_name, schema_name=None, comment=None, return_query=False,
                   feature_store_time=False, join_type='INNER'):
     """
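The filter loop above decorates the progress bar with the current filter row, truncating it so the postfix stays readable. The same trick in isolation, on synthetic rows:

from tqdm.auto import tqdm

rows = [{"APPLIED_FILTER": i, "NOTE": "x" * 200} for i in range(3)]
pbar = tqdm(rows, desc="Applying filters", unit="filter", leave=False)
for row in pbar:
    info = str(row)
    if len(info) > 120:
        info = info[:117] + "..."  # same 120-character cap as the diff
    pbar.set_postfix_str(info)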
@@ -935,6 +894,10 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
     selected_features : dict
         A dictionary where the keys are feature table names, and the values are lists of tuples
         (feature_id, feature_version, feature_name) specifying the features to retrieve.
+        NOTE: feature_version may be either:
+          - a single UUID string, or
+          - a list of dicts like:
+            {"process_id": <UUID>, "process_view_name": <str>}

     view_name : str
         The name of the view to be created in the database.
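
For illustration only, a `selected_features` value mixing both accepted forms might look like the sketch below; the table name, feature names, IDs, and UUID placeholders are hypothetical, not taken from the package:

    selected_features = {
        "FS_DB.FS_T_CUSTOMER_FEATURES": [
            # feature_version as a single UUID string
            (101, "<feature-version-uuid>", "tx_count"),
            # feature_version as a list of process dicts
            (102, [{"process_id": "<process-uuid>", "process_view_name": "MONTHLY_AGG"}], "avg_amount"),
        ],
    }
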
@@ -1004,6 +967,24 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
     # Sort the entity ID list for consistent query generation
     list_entity_id.sort()

+    # Helpers
+    import re
+    def _sanitize_identifier(name: str) -> str:
+        # Keep letters, numbers, and underscores; replace others with '_'
+        return re.sub(r'[^0-9A-Za-z_]', '_', name)
+
+    used_alias_counts = {}  # base_alias -> count
+
+    def _unique_alias(base: str) -> str:
+        """
+        Ensure alias uniqueness: if base already used, append _2, _3, ...
+        """
+        if base not in used_alias_counts:
+            used_alias_counts[base] = 1
+            return base
+        used_alias_counts[base] += 1
+        return f"{base}_{used_alias_counts[base]}"
+
     # Initialize sub-query construction
     tdfs4ds.logger.info("Generating the sub-queries for feature retrieval.")
     sub_queries = []
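
As a standalone sketch of how these two alias helpers behave (runnable outside the package; the feature names below are made up):

    import re

    used_alias_counts = {}

    def _sanitize_identifier(name: str) -> str:
        # Replace anything that is not a letter, digit, or underscore
        return re.sub(r'[^0-9A-Za-z_]', '_', name)

    def _unique_alias(base: str) -> str:
        # First use returns the base; later collisions get _2, _3, ...
        if base not in used_alias_counts:
            used_alias_counts[base] = 1
            return base
        used_alias_counts[base] += 1
        return f"{base}_{used_alias_counts[base]}"

    print(_unique_alias(_sanitize_identifier("sales amount")))  # sales_amount
    print(_unique_alias(_sanitize_identifier("sales-amount")))  # sales_amount_2

This is why two features whose sanitized names collide still receive distinct column aliases in the dataset view.
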
@@ -1014,21 +995,52 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
     # Construct sub-queries for each feature
     for k, v in list_features.items():
         for feature_id, feature_version, feature_name in v:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            # Multiple processes: list of dicts
+            if isinstance(feature_version, list):
+                for item in feature_version:
+                    process_id = item.get("process_id")
+                    process_view_name = item.get("process_view_name") or "PROCESS"
+                    base_alias = _sanitize_identifier(f"{feature_name}_{process_view_name}")
+                    alias = _unique_alias(base_alias)
+
+                    txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{process_id}')"
+                    feature_str = ',B1.FEATURE_VALUE AS ' + alias
+
+                    sub_queries.append(
+                        {
+                            'feature_name': alias,
+                            'query': f"""
+                            SEQUENCED VALIDTIME
+                            SELECT
+                            {txt_entity}
+                            {feature_str}
+                            FROM {k} B1
+                            WHERE {txt_where}
+                            """
+                        }
+                    )
+
+            # Single UUID
+            else:
+                base_alias = _sanitize_identifier(feature_name)
+                alias = _unique_alias(base_alias)
+
+                txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{feature_version}')"
+                feature_str = ',B1.FEATURE_VALUE AS ' + alias
+                sub_queries.append(
+                    {
+                        'feature_name': alias,
+                        'query': f"""
+                        SEQUENCED VALIDTIME
+                        SELECT
+                        {txt_entity}
+                        {feature_str}
+                        FROM {k} B1
+                        WHERE {txt_where}
+                        """
+                    }
+                )

     # Handle case where no features are available
     if len(sub_queries) == 0:
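
To make the construction concrete: for a single-UUID feature_version, one such sub-query renders to roughly the text below, assuming `txt_entity` expands to the entity-ID column selection (here a hypothetical `B1.customer_id`) and using placeholder table name, feature ID, and version:

    SEQUENCED VALIDTIME
    SELECT
    B1.customer_id
    ,B1.FEATURE_VALUE AS tx_count
    FROM FS_DB.FS_T_CUSTOMER_FEATURES B1
    WHERE (FEATURE_ID = 101 AND FEATURE_VERSION='<feature-version-uuid>')

Each sub-query contributes one aliased feature column; the sub-queries are then joined on the entity ID columns (per `join_type`) to assemble the final dataset view.
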
@@ -1102,6 +1114,7 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
     return tdml.DataFrame.from_table(tdml.in_schema(schema_name, view_name))


+
 def build_dataset_opt(entity_id, selected_features, view_name = None, schema_name=tdfs4ds.SCHEMA,
                       comment='dataset', no_temporal=False, time_manager=None, query_only=False, entity_null_substitute={},
                       other=None, time_column=None, filtermanager = None, filter_conditions = None
@@ -1280,82 +1293,91 @@ def upload_tdstone2_scores(model):
     return dataset


-def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
+def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None, force_compute = False, force_display_logs = False):
     """
-    Executes a series of processes for each date in a given list, managing
+    Executes a series of processes for each date in a given list, managing time, computation settings, and logging.

     This function iterates over a range of time steps, updating a TimeManager object with each step, and then
-    executes a list of processes for that time step. It also manages
-    and
+    executes a list of processes for that time step. It also manages synchronization of time for the feature store
+    and optionally controls forced computation and log display behavior.

     Parameters:
     - process_list (list): A list of process IDs that need to be executed for each time step.
-    - time_manager (TimeManager
+    - time_manager (TimeManager): An object that manages time-related operations, like updating or retrieving time.
     - time_id_start (int, optional): The starting time step ID. Default is 1.
-    - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
+    - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
+      time manager.
+    - force_compute (bool, optional): If True, forces each process to recompute even if previous results exist.
+      Default is False.
+    - force_display_logs (bool, optional): If True, forces log display during the rollout even if global log display
+      is disabled. Default is False.

     Side Effects:
-    -
+    - Temporarily modifies global variables DISPLAY_LOGS, PROCESS_TYPE, RUN_ID, and FEATURE_STORE_TIME.
+    - Restores DISPLAY_LOGS setting after execution.
     - Catches and prints exceptions along with the time step on which they occurred.

-
-    1. Disables display logs
-    2.
-    3.
-    4.
-    5.
-    6.
+    Steps performed:
+    1. Disables display logs by default unless `force_display_logs` is True.
+    2. Sets process type to 'ROLL_OUT' and initializes a unique run ID.
+    3. Iterates over the specified range of time steps.
+    4. Updates the time manager with the current time step.
+    5. Synchronizes the feature store time with the current time step.
+    6. Executes each process in the process list with optional forced computation.
+    7. Restores original display log settings after completion.

     Example:
     >>> process_list = ['process_1', 'process_2']
     >>> time_manager = TimeManager(...)
-    >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)
+    >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10, force_compute=True, force_display_logs=True)
     """

-    #global DISPLAY_LOGS
-    #global FEATURE_STORE_TIME
-
     # Disable display logs
     temp_DISPLAY_LOGS = tdfs4ds.DISPLAY_LOGS
     tdfs4ds.DISPLAY_LOGS = False
+    if force_display_logs:
+        tdfs4ds.DISPLAY_LOGS = True
     PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
     tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
     tdfs4ds.RUN_ID = str(uuid.uuid4())

-
-
     try:
+        # Define range of time steps
         if time_id_end is None:
-
+            time_range = range(time_id_start, time_manager.nb_time_steps + 1)
         else:
-
-
+            time_range = range(time_id_start, min(time_manager.nb_time_steps + 1, time_id_end + 1))
+
+        # Progress bar
+        pbar = tqdm(time_range, desc="Starting rollout", unit="step")
+
         for i in pbar:
-            # Update
-            time_manager.update(time_id
+            # Update time manager
+            time_manager.update(time_id=i)
             date_ = str(time_manager.display()['BUSINESS_DATE'].values[0])
-
-            #
+
+            # Sync feature store time
             tdfs4ds.FEATURE_STORE_TIME = time_manager.get_date_in_the_past()
-
+
+            # Display current progress in tqdm
+            pbar.set_postfix(time=date_, feature_time=tdfs4ds.FEATURE_STORE_TIME)
+
             if tdfs4ds.DEBUG_MODE:
-                print(
-                print(
-
-            # Execute
+                print("roll_out | date_:", date_)
+                print("roll_out | feature_store_time:", tdfs4ds.FEATURE_STORE_TIME)
+
+            # Execute all processes for this time step
             for proc_id in process_list:
-                pbar.set_description(f"Processing {date_}
-                run(process_id=proc_id, force_compute=
+                pbar.set_description(f"Processing {date_} | proc {proc_id}")
+                run(process_id=proc_id, force_compute=force_compute)

+        # Restore settings
         tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
+
     except Exception as e:
         tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
-        # If an exception occurs, print the date and the first line of the exception message
-        #print(date_)
         print(str(e).split('\n')[0])
         tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
         raise

-    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
-
-
+    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE