tdfs4ds 0.2.4.32__py3-none-any.whl → 0.2.4.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +387 -542
- tdfs4ds/feature_store/feature_data_processing.py +367 -299
- tdfs4ds/feature_store/feature_store_management.py +189 -167
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/utils/filter_management.py +87 -53
- tdfs4ds/utils/time_management.py +67 -24
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.34.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.34.dist-info}/RECORD +11 -11
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.34.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.34.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
@@ -1,5 +1,6 @@
-__version__ = '0.2.4.32'
+__version__ = '0.2.4.34'
 import logging
+
 # Setup the logger
 logging.basicConfig(
     level=logging.INFO,
@@ -7,6 +8,15 @@ logging.basicConfig(
     datefmt='%Y-%m-%d %H:%M:%S' # Set the date/time format
 )
 
+# Helper: central logging gate controlled by tdfs4ds.DISPLAY_LOGS
+def logger_safe(level, message, *args, **kwargs):
+    """
+    Wrapper around the global `logger` that only emits logs when
+    tdfs4ds.DISPLAY_LOGS is True. `level` is a string like "info", "error", etc.
+    """
+    if getattr(tdfs4ds, "DISPLAY_LOGS", True):
+        getattr(logger, level)(message, *args, **kwargs)
+
 logger = logging.getLogger(__name__)
 
 from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
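For context, the new `logger_safe` helper is just a gate placed in front of the standard `logging` calls. The following is a minimal, self-contained sketch of that behaviour; the `_Tdfs4dsStub` class is a stand-in for the real `tdfs4ds` module and is purely illustrative.

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("tdfs4ds")

    class _Tdfs4dsStub:            # illustrative stand-in for the real module
        DISPLAY_LOGS = True

    tdfs4ds = _Tdfs4dsStub()

    def logger_safe(level, message, *args, **kwargs):
        # Emit only when the package-level flag is on; `level` names a logger method.
        if getattr(tdfs4ds, "DISPLAY_LOGS", True):
            getattr(logger, level)(message, *args, **kwargs)

    logger_safe("info", "schema set to %s", "MY_DB")   # emitted
    tdfs4ds.DISPLAY_LOGS = False
    logger_safe("info", "this message is suppressed")  # silent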
@@ -57,7 +67,7 @@ import tdfs4ds.datasets
 import time
 
 import inspect
-import tqdm
+from tqdm.auto import tqdm  # auto picks the right frontend (notebook/terminal)
 
 from tdfs4ds.feature_store.feature_data_processing import generate_on_clause
 
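The switch to `tqdm.auto` only changes which progress-bar frontend is chosen (notebook widget vs. plain terminal bar); the call sites stay the same. A tiny standalone usage sketch, unrelated to the package's internals:

    from tqdm.auto import tqdm  # picks the notebook or terminal frontend automatically
    import time

    for _ in tqdm(range(3), desc="demo", unit="step"):
        time.sleep(0.1)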
@@ -70,92 +80,80 @@ PROCESS_TYPE = 'RUN PROCESS'
 try:
     SCHEMA = tdml.context.context._get_current_databasename()
     if SCHEMA is None:
-
-
+        logger.warning("No default database detected for feature store.")
+        logger.warning('Please set it explicitly: tdfs4ds.feature_store.schema = "<feature store database>"')
     else:
-
-
+        logger.info("Default database detected for feature store: %s", SCHEMA)
+        logger.info('tdfs4ds.feature_store.schema = "%s"', SCHEMA)
+
     if DATA_DOMAIN is None:
         DATA_DOMAIN = SCHEMA
-
-
+        logger.info("DATA_DOMAIN not set. Defaulting to SCHEMA: %s", DATA_DOMAIN)
+        logger.info('You can override it using: tdfs4ds.DATA_DOMAIN = "<your data domain>"')
 
 except Exception as e:
-
-
+    logger.error("Could not determine current database: %s", str(e).split('\n')[0])
+    logger.warning("Please specify the feature store database manually:")
+    logger.warning('tdfs4ds.feature_store.schema = "<feature store database>"')
 
 
 def setup(database, if_exists='fail'):
     """
-
-
-    This function sets the database schema for feature and process catalogs. If specified, it also handles
-    the replacement of existing catalog tables. It reports the status of these operations, including any
-    encountered exceptions.
-
-    Parameters:
-    database (str): The name of the database schema to be used.
-    if_exists (str, optional): Determines the behavior if catalog tables already exist in the database.
-        'fail' (default) - Do nothing if the tables exist.
-        'replace' - Drop the tables if they exist before creating new ones.
-
-    Steps performed:
-    1. Sets the schema to the provided database name.
-    2. If 'if_exists' is 'replace', attempts to drop 'FS_FEATURE_CATALOG' and 'FS_PROCESS_CATALOG' tables.
-    3. Creates new feature and process catalog tables and sets their names in the tdfs4ds module.
-    4. Prints the names of the newly created tables along with the database name.
-    5. Captures and prints the first line of any exceptions that occur during these operations.
-
-    Returns:
-    None
+    Initialize the feature store environment by creating catalog tables and views.
     """
 
     from tdfs4ds.feature_store.feature_store_management import feature_store_catalog_creation
     from tdfs4ds.process_store.process_store_catalog_management import process_store_catalog_creation
 
     tdfs4ds.SCHEMA = database
+    logger_safe("info", "Setting up feature store in database: %s", database)
+
     if if_exists == 'replace':
-
-
-
-
-
-
-
-            print(str(e).split('\n')[0])
-        try:
-            tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
-        except Exception as e:
-            print(str(e).split('\n')[0])
+        logger_safe("info", "Replacing existing catalog tables if they exist.")
+        for table in [tdfs4ds.FEATURE_CATALOG_NAME, tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME]:
+            try:
+                tdml.db_drop_table(table_name=table, schema_name=database)
+                logger_safe("info", "Dropped table %s.%s", database, table)
+            except Exception as e:
+                logger_safe("warning", "Could not drop table %s.%s: %s", database, table, str(e).split('\n')[0])
 
         DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
+
     try:
         tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
-
+        logger_safe("info", "Feature catalog table created: %s in database %s", tdfs4ds.FEATURE_CATALOG_NAME, database)
     except Exception as e:
-
+        logger_safe("error", "Feature catalog creation failed: %s", str(e).split('\n')[0])
 
     try:
-        tdfs4ds.PROCESS_CATALOG_NAME,
-
-
-
+        (tdfs4ds.PROCESS_CATALOG_NAME,
+         tdfs4ds.DATA_DISTRIBUTION_NAME,
+         tdfs4ds.FILTER_MANAGER_NAME) = process_store_catalog_creation()
+
+        logger_safe("info", "Process catalog table created: %s", tdfs4ds.PROCESS_CATALOG_NAME)
+        logger_safe("info", "Data distribution table created: %s", tdfs4ds.DATA_DISTRIBUTION_NAME)
+        logger_safe("info", "Filter manager table created: %s", tdfs4ds.FILTER_MANAGER_NAME)
     except Exception as e:
-
+        logger_safe("error", "Process catalog creation failed: %s", str(e).split('\n')[0])
 
     try:
         tdfs4ds.process_store.process_followup.follow_up_table_creation()
+        logger_safe("info", "Follow-up table created successfully.")
     except Exception as e:
-
+        logger_safe("error", "Follow-up table creation failed: %s", str(e).split('\n')[0])
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
+
     dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
     if not dataset_catalog._exists():
         dataset_catalog.create_catalog()
+        logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
 
+    logger_safe("info", "Setup complete.")
     return
 
+
 def connect(
     database = tdfs4ds.SCHEMA,
     feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
@@ -166,15 +164,15 @@ def connect(
     feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
     process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
     dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
-    create_if_missing = False
+    create_if_missing = False
 ):
-    if database is not None:
-        tdfs4ds.SCHEMA = database
-    else:
+    if database is None:
         raise ValueError("database parameter is None.")
+    tdfs4ds.SCHEMA = database
+    logger_safe("info", "Connecting to feature store in database: %s", database)
 
     tables = [x.lower() for x in list(tdml.db_list_tables(schema_name=tdfs4ds.SCHEMA, object_type='table').TableName.values)]
-
+
     feature_exists = feature_catalog_name.lower() in tables
     process_exists = process_catalog_name.lower() in tables
     distrib_exists = data_distribution_name.lower() in tables
@@ -183,20 +181,20 @@ def connect(
 
     if not (feature_exists and process_exists and distrib_exists and filter_manager_exists):
         if not create_if_missing:
-
-
-
-
-
-
-
-
-
-
-
-
-            # Follow-up table handling
+            logger_safe("warning", "Feature store components missing and create_if_missing=False")
+            return False
+        logger_safe("info", "Missing components detected; creating missing parts...")
+        if not feature_exists:
+            tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
+        if not process_exists:
+            tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
+        if not distrib_exists:
+            tdfs4ds.data_distribution.data_distribution_catalog_creation()
+        if not filter_manager_exists:
+            tdfs4ds.filter_manager.filter_manager_catalog_creation()
+
     if not followup_name_exists:
+        logger_safe("info", "Creating follow-up table: %s", followup_name)
         tdfs4ds.process_store.process_followup.follow_up_table_creation()
     tdfs4ds.FOLLOW_UP_NAME = followup_name
 
@@ -210,30 +208,31 @@ def connect(
 
     process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
     if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
-
-        print('upgrade to the latest DDL')
+        logger_safe("warning", "ENTITY_NULL_SUBSTITUTE column missing. Upgrading catalog.")
         tdfs4ds.process_store.process_store_catalog_management.upgrade_process_catalog()
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
 
-    # Dataset
+    # Dataset Catalog
     tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
-    dataset_catalog = DatasetCatalog(schema_name=database, name=
+    dataset_catalog = DatasetCatalog(schema_name=database, name=dataset_catalog_name)
     if not dataset_catalog._exists():
         dataset_catalog.create_catalog()
+        logger_safe("info", "Dataset catalog created: %s", dataset_catalog_name)
 
-    #
+    # Detect temporal distribution
    def is_data_distribution_temporal():
         return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(
             view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
             schema_name=tdfs4ds.SCHEMA,
             object_type='table'
         )
-
+
     tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
-
-    return True
+    logger_safe("info", "Connected to feature store successfully.")
+    return True
+
 
 
 
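Taken together, the `setup`/`connect` changes mean a missing catalog no longer ends in an opaque failure: `connect` now returns False when components are absent and `create_if_missing` is False, and creates the missing pieces otherwise. A hypothetical call sequence (the database name is illustrative, not from the package):

    import tdfs4ds

    # One-time initialisation; 'replace' drops and recreates the catalog tables.
    tdfs4ds.setup(database="MY_FEATURE_STORE_DB", if_exists="replace")

    # Later sessions: attach to the existing store, creating any missing pieces.
    ok = tdfs4ds.connect(database="MY_FEATURE_STORE_DB", create_if_missing=True)
    if not ok:
        raise RuntimeError("feature store components are missing")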
@@ -287,50 +286,22 @@ def get_dataset_entity(dataset_id = None):
 def get_dataset_features(dataset_id = None):
     return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
 
-def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
+def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None):
     """
     Executes a specific process from the feature store identified by the process ID.
-
-
-    Parameters:
-    - process_id (str): The unique identifier of the process to run.
-    - return_dataset (bool, optional): A flag indicating whether to return the dataset created during the process.
-      Default is False.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is False.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is None.
-
-    Returns:
-    DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
-
-    This function performs the following steps:
-    1. Determines the process type and initializes necessary variables.
-    2. Constructs and executes a SQL query to retrieve process details by process ID.
-    3. Fetches the filter manager, process type, primary index, partitioning, and data domain from the query result.
-    4. Handles different process types, such as 'denormalized view' and 'tdstone2 view'.
-    5. For 'denormalized view' process type, extracts necessary details, fetches data, and uploads features to the feature store.
-    6. Optionally returns the dataset created during the process if return_dataset is True.
-
-    Note:
-    - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
-      data retrieval to feature uploading.
-    - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
-      a Teradata database and the appropriate schema for feature storage.
+    Uses global `logger` for diagnostics (gated by tdfs4ds.DISPLAY_LOGS).
     """
 
     if tdfs4ds.PROCESS_TYPE is None:
         PROCESS_TYPE_ = 'RUN PROCESS'
-        tdfs4ds.RUN_ID
+        tdfs4ds.RUN_ID = str(uuid.uuid4())
     else:
         PROCESS_TYPE_ = tdfs4ds.PROCESS_TYPE
 
-    if tdfs4ds.DEBUG_MODE:
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "def run | tdfs4ds.FEATURE_STORE_TIME=%s", tdfs4ds.FEATURE_STORE_TIME)
 
-    if tdfs4ds.FEATURE_STORE_TIME
+    if tdfs4ds.FEATURE_STORE_TIME is None:
         validtime_statement = 'CURRENT VALIDTIME'
     else:
         validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
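The `force_varchar_length` rule quoted in the removed docstring (cast VARCHAR features to VARCHAR(k x force_varchar_length), with k the smallest integer such that the original length is at most k x force_varchar_length) is simply rounding a length up to the next multiple of the bucket size. A small worked illustration, using `math.ceil` as a stand-in for whatever the package does internally:

    import math

    def bucketed_varchar_length(original_length: int, force_varchar_length: int) -> int:
        # k is the smallest integer with original_length <= k * force_varchar_length
        k = math.ceil(original_length / force_varchar_length)
        return k * force_varchar_length

    assert bucketed_varchar_length(37, 1024) == 1024    # k = 1
    assert bucketed_varchar_length(1500, 1024) == 2048  # k = 2
    assert bucketed_varchar_length(2048, 1024) == 2048  # exact multiple is unchanged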
@@ -342,148 +313,110 @@ def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
         WHERE A.PROCESS_ID = '{process_id}'
     """
 
+    logger_safe(
+        "info",
+        "Starting run | run_id=%s | process_type=%s | process_id=%s | return_dataset=%s | force_compute=%s | force_varchar_length=%s",
+        tdfs4ds.RUN_ID, PROCESS_TYPE_, process_id, return_dataset, force_compute, force_varchar_length
+    )
+
     # Executing the query and converting the result to Pandas DataFrame
     df = tdml.DataFrame.from_query(query).to_pandas()
 
-    # Check if exactly one record is returned, else
+    # Check if exactly one record is returned, else log an error and return
     if df.shape[0] != 1:
-
-
-
+        logger_safe(
+            "error",
+            "Process catalog lookup returned %s record(s); expected 1. Check table %s.%s. Query: %s",
+            df.shape[0], tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW, query.strip()
+        )
         return
 
-
     # Fetching the filter manager
     filter_schema_name = df['FILTER_DATABASE_NAME'].values[0]
     if filter_schema_name is None:
         filtermanager = None
     else:
         filter_view_name = df['FILTER_VIEW_NAME'].values[0]
-        filter_table_name = df['FILTER_TABLE_NAME'].values[0]
+        filter_table_name = df['FILTER_TABLE_NAME'].values[0]  # kept for parity; not used directly here
         filtermanager = FilterManager(table_name=filter_view_name, schema_name=filter_schema_name)
 
-    # Fetching
-    process_type
-
-    # Fetching the primary index from the query result
-    primary_index = df['FOR_PRIMARY_INDEX'].values[0]
+    # Fetching process metadata
+    process_type = df['PROCESS_TYPE'].values[0]
+    primary_index = df['FOR_PRIMARY_INDEX'].values[0]
     if primary_index is not None:
-        primary_index = primary_index.split(',')
-
-
-
-
-
-
+        primary_index = [x.strip() for x in primary_index.split(',') if x.strip()]
+    partitioning = df['FOR_DATA_PARTITIONING'].values[0]
+    DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
+
+    logger_safe(
+        "info",
+        "Process metadata | process_id=%s | process_type=%s | primary_index=%s | partitioning=%s | data_domain=%s | validtime=%s",
+        process_id, process_type, primary_index, partitioning, DATA_DOMAIN, validtime_statement
+    )
 
     # Handling 'denormalized view' process type
     if process_type == 'denormalized view':
-
-
-        entity_id = df['ENTITY_ID'].values[0].split(',')
+        view_name = df['VIEW_NAME'].values[0]
+        entity_id = [x.strip() for x in df['ENTITY_ID'].values[0].split(',') if x.strip()]
         entity_null_substitute = eval(df['ENTITY_NULL_SUBSTITUTE'].values[0])
-        feature_names
+        feature_names = [x.strip() for x in df['FEATURE_NAMES'].values[0].split(',') if x.strip()]
 
-        # Fetching data and uploading features to the feature store
         df_data = tdml.DataFrame(tdml.in_schema(view_name.split('.')[0], view_name.split('.')[1]))
 
-        if tdfs4ds.DEBUG_MODE:
-
-
-
-
-
-
+        if getattr(tdfs4ds, "DEBUG_MODE", False):
+            logger_safe("debug", "run | entity_id=%s", entity_id)
+            logger_safe("debug", "run | entity_null_substitute=%s", entity_null_substitute)
+            logger_safe("debug", "run | feature_names=%s", feature_names)
+            logger_safe("debug", "run | process_id=%s", process_id)
+            logger_safe("debug", "run | primary_index=%s", primary_index)
+            logger_safe("debug", "run | partitioning=%s", partitioning)
+
         dataset = _upload_features(
             df_data,
             entity_id,
             feature_names,
-            feature_versions
-            primary_index
-            partitioning
-            filtermanager
-            entity_null_substitute
-            process_id
-            force_compute=
-            force_varchar_length
+            feature_versions=process_id,
+            primary_index=primary_index,
+            partitioning=partitioning,
+            filtermanager=filtermanager,
+            entity_null_substitute=entity_null_substitute,
+            process_id=process_id,
+            force_compute=force_compute,
+            force_varchar_length=force_varchar_length
         )
 
     # Handling 'tdstone2 view' process type
     elif process_type == 'tdstone2 view':
-
-
+        logger_safe("warning", "Process type 'tdstone2 view' not implemented yet for process_id=%s", process_id)
+        dataset = None
 
+    else:
+        logger_safe("error", "Unknown process type '%s' for process_id=%s", process_type, process_id)
+        dataset = None
 
     if return_dataset:
+        logger_safe("info", "Run finished with dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
         return dataset
     else:
+        logger_safe("info", "Run finished without dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
         return
 
-def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
-    """
-    Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
-    process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
-    for further use or inspection.
-
-    The function supports dynamic entity ID interpretation and flexible feature name handling, ensuring compatibility
-    with various data schemas. It automatically registers the data upload process and applies additional metadata,
-    if provided.
-
-    Parameters:
-    - df (DataFrame): The DataFrame containing the feature data to be uploaded.
-    - entity_id (dict, list, or str): The identifier of the entity to which the features belong. This can be:
-        - a dictionary mapping column names to their data types,
-        - a list of column names, which will be automatically converted to a dictionary with types inferred from `df`,
-        - a string representing a single column name, which will be converted into a list and then to a dictionary as above.
-    - feature_names (list or str): The names of the features to be uploaded. If a string is provided, it will be
-      split into a list based on commas or treated as a single feature name.
-    - metadata (dict, optional): Additional metadata to associate with the upload process. Defaults to an empty dictionary.
-    - primary_index (list, optional): Specifies the primary index columns for optimizing data storage and retrieval.
-    - partitioning (str, optional): Defines how the data should be partitioned in the store for performance optimization.
-    - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
-    - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
-      Default is an empty dictionary.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is True.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is 1024.
-    Returns:
-    DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
-      or further processing.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    ... tx_type LIKE 'DEBIT',
-    ... tx_type LIKE 'PAYMENT',
-    ... tx_type LIKE 'CASH_OUT',
-    ... tx_type LIKE 'CASH_IN',
-    ... tx_type LIKE 'TRANSFER',
-    ... NO CASE,
-    ... UNKNOWN)'''
-    >>> features = [x for x in tddf.columns if x not in entity_id]
-    >>> dataset = upload_features(
-    ...     df = tddf,
-    ...     entity_id = entity_id,
-    ...     feature_names = features,
-    ...     metadata = {'project': 'test'},
-    ...     primary_index = primary_index,
-    ...     partitioning = partitioning
-    ...     )
+def upload_features(
+    df,
+    entity_id,
+    feature_names,
+    metadata={},
+    primary_index=None,
+    partitioning='',
+    filtermanager=None,
+    entity_null_substitute={},
+    force_compute=True,
+    force_varchar_length=1024
+):
+    """
+    Uploads feature data from a DataFrame to the feature store for a specified entity.
+    All diagnostics go through `logger_safe()` which respects `tdfs4ds.DISPLAY_LOGS`.
     """
 
     from tdfs4ds.utils.info import get_column_types
@@ -491,45 +424,42 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
     from tdfs4ds.process_store.process_registration_management import register_process_view
 
     # Convert entity_id to a dictionary if it's not already one
-    if
+    if isinstance(entity_id, list):
         entity_id.sort()
         entity_id = get_column_types(df, entity_id)
-
-
-    elif
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    elif isinstance(entity_id, str):
         entity_id = [entity_id]
         entity_id = get_column_types(df, entity_id)
-
-
-
-    if
-
-
-
-        feature_names = feature_names.split(',')
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    # Normalize feature_names
+    if not isinstance(feature_names, list):
+        logger_safe("debug", "feature_names is not a list: %s", feature_names)
+        if isinstance(feature_names, str) and ',' in feature_names:
+            feature_names = [x.strip() for x in feature_names.split(',')]
         else:
             feature_names = [feature_names]
-
-
-
-
-    if primary_index is not None and
-
-
-
-        primary_index = primary_index.split(',')
+        logger_safe("debug", "feature_names converted to list: %s", feature_names)
+        logger_safe("debug", "Check the conversion is as expected.")
+
+    # Normalize primary_index
+    if primary_index is not None and not isinstance(primary_index, list):
+        logger_safe("debug", "primary_index is not a list: %s", primary_index)
+        if isinstance(primary_index, str) and ',' in primary_index:
+            primary_index = [x.strip() for x in primary_index.split(',')]
         else:
             primary_index = [primary_index]
-
-
-        print('check it is a expected.')
+        logger_safe("debug", "primary_index converted to list: %s", primary_index)
+        logger_safe("debug", "Check the conversion is as expected.")
 
+    # Partitioning
     partitioning = tdfs4ds.utils.info.generate_partitioning_clause(partitioning=partitioning)
 
-
-    print("filtermanager", filtermanager)
+    logger_safe("debug", "filtermanager: %s", filtermanager)
 
-    # Register
+    # Register process -> get SQL(s) + process_id
     query_insert, process_id, query_insert_dist, query_insert_filtermanager = register_process_view.__wrapped__(
         view_name = df,
         entity_id = entity_id,
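The normalisation block above replaces the old ad-hoc `split(',')` handling with `isinstance` checks plus whitespace stripping. An isolated sketch of the same idea (a hypothetical helper, not the package's own function):

    def normalize_names(names):
        """Accept a list, a single name, or a comma-separated string; return a clean list."""
        if isinstance(names, list):
            return names
        if isinstance(names, str) and ',' in names:
            return [x.strip() for x in names.split(',') if x.strip()]
        return [names]

    assert normalize_names("a, b ,c") == ["a", "b", "c"]
    assert normalize_names("amount") == ["amount"]
    assert normalize_names(["a", "b"]) == ["a", "b"]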
@@ -542,104 +472,96 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
         entity_null_substitute = entity_null_substitute
     )
 
-
-    execute_query(query_insert)
-    execute_query(query_insert_dist)
-    if tdfs4ds.DEBUG_MODE:
-        print("query_insert_filtermanager",query_insert_filtermanager)
-    if query_insert_filtermanager is not None:
-        execute_query(query_insert_filtermanager)
+    logger_safe("info", "Registered process (process_id=%s) for upload_features", process_id)
 
-    #
-
-
-
-
-
-
+    # Execute queries
+    try:
+        execute_query(query_insert)
+        logger_safe("info", "Executed main insert query for process_id=%s", process_id)
+    except Exception as e:
+        logger_safe("exception", "Main insert query failed for process_id=%s", process_id)
+        raise
 
-
+    try:
+        execute_query(query_insert_dist)
+        logger_safe("info", "Executed distribution insert query for process_id=%s", process_id)
+    except Exception as e:
+        logger_safe("exception", "Distribution insert query failed for process_id=%s", process_id)
+        raise
 
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        # Avoid dumping entire SQL in normal logs; keep it debug-only.
+        logger_safe("debug", "query_insert_filtermanager: %s", query_insert_filtermanager)
 
+    if query_insert_filtermanager is not None:
+        try:
+            execute_query(query_insert_filtermanager)
+            logger_safe("info", "Executed filtermanager insert query for process_id=%s", process_id)
         except Exception as e:
-
-            run_id = tdfs4ds.RUN_ID,
-            process_type = tdfs4ds.PROCESS_TYPE,
-            process_id = process_id,
-            status = 'FAILED,' + str(e).split('\n')[0]
-        )
+            logger_safe("exception", "Filtermanager insert query failed for process_id=%s", process_id)
             raise
 
+    # Run the registered process (with/without dataset)
+    PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
+    tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
+    if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
+        tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
+    tdfs4ds.RUN_ID = str(uuid.uuid4())
 
-
-
+    logger_safe(
+        "info",
+        "Starting run (run_id=%s, process_type=%s, process_id=%s, force_compute=%s, force_varchar_length=%s)",
+        tdfs4ds.RUN_ID, tdfs4ds.PROCESS_TYPE, process_id, force_compute, force_varchar_length
+    )
+
+    try:
+        if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
+            dataset = run(
+                process_id=process_id,
+                return_dataset=True,
+                force_compute=force_compute,
+                force_varchar_length=force_varchar_length
+            )
+            logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
+            return dataset
+        else:
+            run(
+                process_id=process_id,
+                return_dataset=False,
+                force_compute=force_compute,
+                force_varchar_length=force_varchar_length
+            )
+            logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
+            return
 
+    except Exception as e:
+        # Keep your existing follow-up close behavior, but ensure the error is logged.
         try:
-            run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
-        except Exception as e:
             tdfs4ds.process_store.process_followup.followup_close(
                 run_id = tdfs4ds.RUN_ID,
                 process_type = tdfs4ds.PROCESS_TYPE,
                 process_id = process_id,
                 status = 'FAILED,' + str(e).split('\n')[0]
             )
-
-
-
-
-
-
-
-
-    Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
-    feature registration, preparation for ingestion, and storage in the designated feature tables.
+        finally:
+            logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
+                tdfs4ds.RUN_ID, process_id, str(e).split('\n')[0]
+            )
+            raise
+    finally:
+        # Restore previous process type just in case the caller relies on it.
+        tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
 
-    Parameters:
-    - df (DataFrame): The input DataFrame containing the feature data.
-    - entity_id (str or dict): The identifier for the entity to which these features belong. This can be a single ID
-      (str) or a dictionary of attribute names and values uniquely identifying the entity.
-    - feature_names (list): A list of strings specifying the names of the features to be uploaded.
-    - feature_versions (str or list, optional): Specifies the versions of the features to be uploaded. Can be a single
-      string applied to all features or a list of strings specifying the version
-      for each feature respectively. Default is 'dev.0.0'.
-    - primary_index (list, optional): Specifies the columns to be used as the primary index in the feature store tables.
-      This can significantly impact the performance of data retrieval operations.
-    - partitioning (str, optional): A string indicating the partitioning strategy for the feature store tables, which can
-      enhance query performance based on the access patterns.
-    - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
-    - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
-      Default is an empty dictionary.
-    - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is False.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is None.
 
 
-
-
-
-
-
-
-
-
-    4. Prepares the feature data for ingestion, including any necessary transformations.
-    5. Stores the prepared feature data in the feature store.
-    6. Optionally, cleans up temporary resources used during the process.
-    7. Builds and returns a view of the dataset representing the uploaded features for easy access.
-
-    Note:
-    - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
-      entity and feature registration to data preparation and storage.
-    - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
-      a Teradata database and the appropriate schema for feature storage.
-    - The function assumes that the feature_versions, if provided as a list, matches the length of feature_names.
-    """
-
+def _upload_features(
+    df, entity_id, feature_names,
+    feature_versions=FEATURE_VERSION_DEFAULT,
+    primary_index=None, partitioning='',
+    filtermanager=None, entity_null_substitute={},
+    process_id=None, force_compute=False,
+    force_varchar_length=None
+):
     from tdfs4ds.feature_store.entity_management import register_entity
     from tdfs4ds.feature_store.feature_store_management import Gettdtypes
     from tdfs4ds.feature_store.feature_store_management import register_features
@@ -647,193 +569,141 @@ def _upload_features(df, entity_id, feature_names,
     from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
     from tdfs4ds.utils.info import get_column_types, update_varchar_length
 
-    # Convert entity_id to a dictionary if
-    if
+    # Convert entity_id to a dictionary if not already
+    if isinstance(entity_id, list):
         entity_id.sort()
         entity_id = get_column_types(df, entity_id)
-
-
-
-    entity_id
-
-
-
-
-    #register_entity(entity_id, primary_index=primary_index, partitioning=partitioning)
-
-    # If feature_versions is a list, create a dictionary mapping each feature name to its corresponding version.
-    # If feature_versions is a string, create a dictionary mapping each feature name to this string.
-    if type(feature_versions) == list:
-        selected_features = {k: v for k, v in zip(feature_names, feature_versions)}
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+    elif isinstance(entity_id, str):
+        entity_id = get_column_types(df, [entity_id])
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    # Map feature versions
+    if isinstance(feature_versions, list):
+        selected_features = dict(zip(feature_names, feature_versions))
     else:
         selected_features = {k: feature_versions for k in feature_names}
 
-    # Get
-    feature_names_types = Gettdtypes(
-        df,
-        features_columns=feature_names,
-        entity_id=entity_id
-    )
+    # Get Teradata types for features
+    feature_names_types = Gettdtypes(df, features_columns=feature_names, entity_id=entity_id)
 
     if force_varchar_length is not None:
-
-        feature_names_types = update_varchar_length(
+        logger_safe("debug", "Updating VARCHAR lengths with force_varchar_length=%s", force_varchar_length)
+        feature_names_types = update_varchar_length(
+            feature_names_types,
+            new_varchar_length=force_varchar_length
+        )
 
     def validate_feature_types(feature_names_types):
-
-
-
-
-
-        feature_names_types (dict): A dictionary where keys are feature names and values are their data types.
-
-        Raises:
-            ValueError: If any feature type contains 'clob', 'blob', or 'json'.
-        """
-        invalid_types = {key: value['type'] for key, value in feature_names_types.items()
-                         if any(term in value['type'].lower() for term in ['clob', 'blob', 'json'])}
-
-        if invalid_types:
+        invalid = {
+            k: v['type'] for k, v in feature_names_types.items()
+            if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
+        }
+        if invalid:
             raise ValueError(
-                f"
-                "
+                f"Unsupported data types found: {invalid}. "
+                "CLOB/BLOB/JSON are not supported."
             )
-
-    validate_feature_types(feature_names_types)
-
+
+    validate_feature_types(feature_names_types)
+
+    logger_safe("info", "Registering entity %s in feature store", entity_id)
     register_entity(entity_id, feature_names_types, primary_index=primary_index, partitioning=partitioning)
 
-    if tdfs4ds
-
-
-
-
-
-
-
-
-
-
-
-
-        primary_index,
-        partitioning
-    )
-
-    if tdfs4ds.DEBUG_MODE:
-        print("---------_upload_features")
-        print("filtermanager : ", filtermanager)
-        print("feature names : ", feature_names)
-        print("selected features : ", selected_features)
-
-    if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe(
+            "debug",
+            "_upload_features entity_id=%s null_substitute=%s features=%s primary_index=%s partitioning=%s",
+            entity_id, entity_null_substitute, feature_names, primary_index, partitioning
+        )
+        logger_safe("debug", "selected_features=%s df.columns=%s", selected_features, df.columns)
+
+    register_features(entity_id, feature_names_types, primary_index, partitioning)
+    logger_safe("info", "Features registered in catalog: %s", feature_names)
+
+    follow_up = None
+    if process_id and tdfs4ds.FEATURE_STORE_TIME:
         follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
-        follow_up = follow_up[
-
-
-
-
-
-    do_compute = False
+        follow_up = follow_up[
+            (follow_up.STATUS == 'COMPLETED') &
+            (follow_up.VALIDTIME_DATE.isna() == False) &
+            (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) &
+            (follow_up.PROCESS_ID == process_id)
+        ]
 
-
+    if filtermanager is None:
+        do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
         if do_compute or force_compute:
-
+            logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
             tdfs4ds.process_store.process_followup.followup_open(
-                run_id
-                process_type
-                process_id
+                run_id=tdfs4ds.RUN_ID,
+                process_type=tdfs4ds.PROCESS_TYPE,
+                process_id=process_id
             )
-
             try:
-                prepared_features,
-                    df,
-                    entity_id,
-                    feature_names,
+                prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
+                    df, entity_id, feature_names,
                     feature_versions=selected_features,
                     primary_index=primary_index,
                     entity_null_substitute=entity_null_substitute,
                     partitioning=partitioning
                 )
-
-
-
-                volatile_table_name,
-                entity_null_substitute=entity_null_substitute,
-                primary_index=primary_index,
-                partitioning=partitioning,
-                features_infos = features_infos
-                )
-
-                # Collect statistics
-                apply_collect_stats(
-                    entity_id,
-                    primary_index = primary_index,
-                    partitioning = partitioning,
-                    feature_infos = features_infos
-                )
+                store_feature(entity_id, volatile_table, entity_null_substitute,
+                              primary_index, partitioning, features_infos)
+                apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
                 tdfs4ds.process_store.process_followup.followup_close(
-                    run_id
-                    process_type
-                    process_id
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id
                 )
+                logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
 
             except Exception as e:
+                logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
                 tdfs4ds.process_store.process_followup.followup_close(
-                    run_id
-                    process_type
-                    process_id
-                    status
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id,
+                    status='FAILED,' + str(e).split('\n')[0]
                 )
                 raise
-    else:
-        # get the total number of filter condition in the filter manager
-        nb_filters = filtermanager.nb_filters
 
-
+    else:
+        logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
         something_computed = False
+        for i in tqdm(
+            range(filtermanager.nb_filters),
+            total=filtermanager.nb_filters,
+            desc="Applying filters",
+            unit="filter",
+            leave=False
+        ):
+            filter_id = i + 1
+            filtermanager.update(filter_id)
+
+            # show which filter is being applied in the bar
+            try:
+                tqdm.write(f"Applying filter {filter_id}/{filtermanager.nb_filters}")
+                # If display() returns a long string, you can shorten it:
+                bar_info = str(filtermanager.display())
+                if len(bar_info) > 80:
+                    bar_info = bar_info[:77] + "..."
+                tqdm.tqdm._instances and next(iter(tqdm.tqdm._instances)).set_postfix_str(bar_info)
+            except Exception:
+                # postfix is optional; ignore errors from display() here
+                pass
+
+            logger_safe("debug", "Applying filter %s/%s:\n%s",
+                        i + 1, filtermanager.nb_filters, filtermanager.display())
 
-        for i in range(nb_filters):
-
-            # place the cursor on the next filter
-            filtermanager.update(i+1)
-
-            if filtermanager.time_filtering:
-                # if the filter manager is hybrid, then synchronize the time with tdfs4ds
-                tdfs4ds.FEATURE_STORE_TIME = filtermanager.get_date_in_the_past()
-
-            # overwrite the follow up table to tilter on the VALIDTIME_DATE too
-            follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
-            follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
-                follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
-
-            # initialize do_compute, the flag that something has to be computed
             do_compute = True
-
-
-
-
-            follow_up_ = follow_up.assign(APPLIED_FILTER=follow_up.APPLIED_FILTER.cast(tdml.VARCHAR(20000))).join(
-                tdml.DataFrame.from_query(
-                    f"""
-                    SELECT
-                    CAST(JSON_AGG({','.join(filtermanager.col_names)}) AS VARCHAR(20000)) AS APPLIED_FILTER
-                    FROM {filtermanager.schema_name}.{filtermanager.view_name}
-                    """
-                ),
-                on = 'APPLIED_FILTER',
-                how = 'inner',
-                lprefix = 'l',
-                rprefix = 'r'
-            )
-            # if already computed and completed, then do_compute is set to False
-            if follow_up_.shape[0] > 0:
+            if process_id and tdfs4ds.FEATURE_STORE_TIME:
+                # see if already computed
+                follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
+                if follow_up.shape[0] > 0:
                     do_compute = False
 
-            if tdfs4ds.DISPLAY_LOGS:
-                print(filtermanager.display())
-
             if do_compute or force_compute:
                 tdfs4ds.process_store.process_followup.followup_open(
                     run_id = tdfs4ds.RUN_ID,
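The rewritten `validate_feature_types` keeps the same contract as the old version (reject features whose Teradata type contains CLOB, BLOB, or JSON). The check can be exercised in isolation like this; the type mapping below is made up for illustration:

    def validate_feature_types(feature_names_types):
        # Collect offending features and raise once, listing them all.
        invalid = {
            k: v['type'] for k, v in feature_names_types.items()
            if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
        }
        if invalid:
            raise ValueError(f"Unsupported data types found: {invalid}. CLOB/BLOB/JSON are not supported.")

    validate_feature_types({'amount': {'type': 'FLOAT'}, 'label': {'type': 'VARCHAR(100)'}})  # passes
    try:
        validate_feature_types({'payload': {'type': 'JSON'}})
    except ValueError as e:
        print(e)  # Unsupported data types found: {'payload': 'JSON'} ...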
@@ -842,83 +712,58 @@ def _upload_features(df, entity_id, feature_names,
                     filtermanager = filtermanager
                 )
                 try:
-
-
-                    df,
-                    entity_id,
-                    feature_names,
+                    prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
+                        df, entity_id, feature_names,
                         feature_versions = selected_features,
                         primary_index = primary_index,
                         entity_null_substitute = entity_null_substitute,
                         partitioning = partitioning
                     )
 
-
-
-
-                    volatile_table_name,
-                    entity_null_substitute=entity_null_substitute,
-                    primary_index = primary_index,
-                    partitioning = partitioning,
-                    features_infos=features_infos
-
-                    )
-
-                    # indicate that something has been processed:
+                    store_feature(entity_id, volatile_table, entity_null_substitute,
+                                  primary_index, partitioning, features_infos)
+
                     something_computed = True
 
                     tdfs4ds.process_store.process_followup.followup_close(
-                        run_id=tdfs4ds.RUN_ID,
-                        process_type=tdfs4ds.PROCESS_TYPE,
-                        process_id=process_id,
+                        run_id = tdfs4ds.RUN_ID,
+                        process_type = tdfs4ds.PROCESS_TYPE,
+                        process_id = process_id,
                         filtermanager = filtermanager
                     )
 
                 except Exception as e:
-
+                    logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
                     tdfs4ds.process_store.process_followup.followup_close(
-                        run_id=tdfs4ds.RUN_ID,
-                        process_type=tdfs4ds.PROCESS_TYPE,
-                        process_id=process_id,
-                        status='FAILED,' + str(e).split('\n')[0],
-                        filtermanager=filtermanager
+                        run_id = tdfs4ds.RUN_ID,
+                        process_type = tdfs4ds.PROCESS_TYPE,
+                        process_id = process_id,
+                        status = 'FAILED,' + str(e).split('\n')[0],
+                        filtermanager = filtermanager
                     )
                     raise
-        # Clean up by dropping the temporary volatile table.
-        # tdml.execute_sql(f'DROP TABLE {volatile_table_name}')
 
-        # Collect statistics only if something has been computed
         if something_computed:
-            apply_collect_stats(
-                entity_id,
-                primary_index = primary_index,
-                partitioning = partitioning,
-                feature_infos = features_infos
-            )
+            apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
-    # Build a dataset view in the feature store.
     if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
-
+        logger_safe("info", "Building dataset for validation...")
         try:
-
-            entity_id,
-            selected_features,
+            return build_dataset(
+                entity_id, selected_features,
                 view_name=None,
-                entity_null_substitute
+                entity_null_substitute=entity_null_substitute
             )
         except Exception as e:
-
-
-            print('entity :', entity_id)
-            print('selected features :', selected_features)
-
-            # Return the dataset view.
-            return dataset
+            logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+            logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
     else:
-
+        logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False)")
         return
 
 
+
+
 def build_dataset(entity_id, selected_features, view_name, schema_name=None, comment=None, return_query=False,
                   feature_store_time=False, join_type='INNER'):
     """
@@ -1366,9 +1211,6 @@ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
     >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)
     """
 
-    #global DISPLAY_LOGS
-    #global FEATURE_STORE_TIME
-
     # Disable display logs
     temp_DISPLAY_LOGS = tdfs4ds.DISPLAY_LOGS
     tdfs4ds.DISPLAY_LOGS = False
@@ -1376,40 +1218,43 @@ def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
     tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
     tdfs4ds.RUN_ID = str(uuid.uuid4())
 
-
-
     try:
+        # Define range of time steps
         if time_id_end is None:
-
+            time_range = range(time_id_start, time_manager.nb_time_steps + 1)
         else:
-
-
+            time_range = range(time_id_start, min(time_manager.nb_time_steps + 1, time_id_end + 1))
+
+        # Progress bar
+        pbar = tqdm(time_range, desc="Starting rollout", unit="step")
+
         for i in pbar:
-            # Update
-            time_manager.update(time_id
+            # Update time manager
+            time_manager.update(time_id=i)
             date_ = str(time_manager.display()['BUSINESS_DATE'].values[0])
-
-            #
+
+            # Sync feature store time
             tdfs4ds.FEATURE_STORE_TIME = time_manager.get_date_in_the_past()
-
+
+            # Display current progress in tqdm
+            pbar.set_postfix(time=date_, feature_time=tdfs4ds.FEATURE_STORE_TIME)
+
             if tdfs4ds.DEBUG_MODE:
-                print(
-                print(
-
-            # Execute
+                print("roll_out | date_:", date_)
+                print("roll_out | feature_store_time:", tdfs4ds.FEATURE_STORE_TIME)
+
+            # Execute all processes for this time step
             for proc_id in process_list:
-                pbar.set_description(f"Processing {date_}
+                pbar.set_description(f"Processing {date_} | proc {proc_id}")
                 run(process_id=proc_id, force_compute=False)
 
+        # Restore settings
         tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
+
     except Exception as e:
         tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
-        # If an exception occurs, print the date and the first line of the exception message
-        #print(date_)
         print(str(e).split('\n')[0])
         tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
         raise
 
-    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
-
-
+    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE