tdfs4ds 0.2.4.32__py3-none-any.whl → 0.2.4.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +341 -519
- tdfs4ds/feature_store/feature_data_processing.py +236 -268
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/utils/filter_management.py +87 -53
- tdfs4ds/utils/time_management.py +67 -24
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.33.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.33.dist-info}/RECORD +10 -10
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.33.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.33.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-__version__ = '0.2.4.32'
+__version__ = '0.2.4.33'
 import logging
 # Setup the logger
 logging.basicConfig(
@@ -7,6 +7,15 @@ logging.basicConfig(
     datefmt='%Y-%m-%d %H:%M:%S'  # Set the date/time format
 )
 
+# Helper: central logging gate controlled by tdfs4ds.DISPLAY_LOGS
+def logger_safe(level, message, *args, **kwargs):
+    """
+    Wrapper around the global `logger` that only emits logs when
+    tdfs4ds.DISPLAY_LOGS is True. `level` is a string like "info", "error", etc.
+    """
+    if getattr(tdfs4ds, "DISPLAY_LOGS", True):
+        getattr(logger, level)(message, *args, **kwargs)
+
 logger = logging.getLogger(__name__)
 
 from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
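Illustrative note (not part of the package diff): the new logger_safe helper funnels every diagnostic through one gate, so the whole package can be silenced with a single flag. A minimal usage sketch, assuming DISPLAY_LOGS keeps its default of True:

    import tdfs4ds
    tdfs4ds.DISPLAY_LOGS = True
    tdfs4ds.logger_safe("info", "feature store schema: %s", "MY_DB")  # emitted via logger.info
    tdfs4ds.DISPLAY_LOGS = False
    tdfs4ds.logger_safe("error", "suppressed message")                # skipped entirely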
@@ -70,92 +79,80 @@ PROCESS_TYPE = 'RUN PROCESS'
 try:
     SCHEMA = tdml.context.context._get_current_databasename()
     if SCHEMA is None:
+        logger.warning("No default database detected for feature store.")
+        logger.warning('Please set it explicitly: tdfs4ds.feature_store.schema = "<feature store database>"')
     else:
+        logger.info("Default database detected for feature store: %s", SCHEMA)
+        logger.info('tdfs4ds.feature_store.schema = "%s"', SCHEMA)
+
     if DATA_DOMAIN is None:
         DATA_DOMAIN = SCHEMA
+        logger.info("DATA_DOMAIN not set. Defaulting to SCHEMA: %s", DATA_DOMAIN)
+        logger.info('You can override it using: tdfs4ds.DATA_DOMAIN = "<your data domain>"')
 
 except Exception as e:
+    logger.error("Could not determine current database: %s", str(e).split('\n')[0])
+    logger.warning("Please specify the feature store database manually:")
+    logger.warning('tdfs4ds.feature_store.schema = "<feature store database>"')
 
 
 def setup(database, if_exists='fail'):
     """
-    This function sets the database schema for feature and process catalogs. If specified, it also handles
-    the replacement of existing catalog tables. It reports the status of these operations, including any
-    encountered exceptions.
-
-    Parameters:
-    database (str): The name of the database schema to be used.
-    if_exists (str, optional): Determines the behavior if catalog tables already exist in the database.
-        'fail' (default) - Do nothing if the tables exist.
-        'replace' - Drop the tables if they exist before creating new ones.
-
-    Steps performed:
-    1. Sets the schema to the provided database name.
-    2. If 'if_exists' is 'replace', attempts to drop 'FS_FEATURE_CATALOG' and 'FS_PROCESS_CATALOG' tables.
-    3. Creates new feature and process catalog tables and sets their names in the tdfs4ds module.
-    4. Prints the names of the newly created tables along with the database name.
-    5. Captures and prints the first line of any exceptions that occur during these operations.
-
-    Returns:
-    None
+    Initialize the feature store environment by creating catalog tables and views.
     """
 
     from tdfs4ds.feature_store.feature_store_management import feature_store_catalog_creation
     from tdfs4ds.process_store.process_store_catalog_management import process_store_catalog_creation
 
     tdfs4ds.SCHEMA = database
+    logger_safe("info", "Setting up feature store in database: %s", database)
+
     if if_exists == 'replace':
-        print(str(e).split('\n')[0])
-        try:
-            tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
-        except Exception as e:
-            print(str(e).split('\n')[0])
+        logger_safe("info", "Replacing existing catalog tables if they exist.")
+        for table in [tdfs4ds.FEATURE_CATALOG_NAME, tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME]:
+            try:
+                tdml.db_drop_table(table_name=table, schema_name=database)
+                logger_safe("info", "Dropped table %s.%s", database, table)
+            except Exception as e:
+                logger_safe("warning", "Could not drop table %s.%s: %s", database, table, str(e).split('\n')[0])
 
         DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
+
     try:
         tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
+        logger_safe("info", "Feature catalog table created: %s in database %s", tdfs4ds.FEATURE_CATALOG_NAME, database)
     except Exception as e:
+        logger_safe("error", "Feature catalog creation failed: %s", str(e).split('\n')[0])
 
     try:
-        tdfs4ds.PROCESS_CATALOG_NAME,
+        (tdfs4ds.PROCESS_CATALOG_NAME,
+         tdfs4ds.DATA_DISTRIBUTION_NAME,
+         tdfs4ds.FILTER_MANAGER_NAME) = process_store_catalog_creation()
+
+        logger_safe("info", "Process catalog table created: %s", tdfs4ds.PROCESS_CATALOG_NAME)
+        logger_safe("info", "Data distribution table created: %s", tdfs4ds.DATA_DISTRIBUTION_NAME)
+        logger_safe("info", "Filter manager table created: %s", tdfs4ds.FILTER_MANAGER_NAME)
     except Exception as e:
+        logger_safe("error", "Process catalog creation failed: %s", str(e).split('\n')[0])
 
     try:
         tdfs4ds.process_store.process_followup.follow_up_table_creation()
+        logger_safe("info", "Follow-up table created successfully.")
     except Exception as e:
+        logger_safe("error", "Follow-up table creation failed: %s", str(e).split('\n')[0])
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
+
     dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
     if not dataset_catalog._exists():
         dataset_catalog.create_catalog()
+        logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
 
+    logger_safe("info", "Setup complete.")
     return
 
+
 def connect(
     database = tdfs4ds.SCHEMA,
     feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
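Illustrative note (not part of the package diff): setup() keeps its signature, so existing callers are unaffected; only the logging and the drop-table handling change. A minimal sketch, with "MY_FEATURE_DB" as a hypothetical database name:

    import tdfs4ds
    tdfs4ds.setup(database="MY_FEATURE_DB")                       # 'fail': leave existing catalog tables in place
    tdfs4ds.setup(database="MY_FEATURE_DB", if_exists="replace")  # drop and recreate the catalog tables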
@@ -166,15 +163,15 @@ def connect(
     feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
     process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
     dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
-    create_if_missing = False
+    create_if_missing = False
 ):
-    if database is
-        tdfs4ds.SCHEMA = database
-    else:
+    if database is None:
         raise ValueError("database parameter is None.")
+    tdfs4ds.SCHEMA = database
+    logger_safe("info", "Connecting to feature store in database: %s", database)
 
     tables = [x.lower() for x in list(tdml.db_list_tables(schema_name=tdfs4ds.SCHEMA, object_type='table').TableName.values)]
-
+
     feature_exists = feature_catalog_name.lower() in tables
     process_exists = process_catalog_name.lower() in tables
     distrib_exists = data_distribution_name.lower() in tables
@@ -183,20 +180,20 @@ def connect(
 
     if not (feature_exists and process_exists and distrib_exists and filter_manager_exists):
         if not create_if_missing:
+            logger_safe("warning", "Feature store components missing and create_if_missing=False")
+            return False
+        logger_safe("info", "Missing components detected; creating missing parts...")
+        if not feature_exists:
+            tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
+        if not process_exists:
+            tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
+        if not distrib_exists:
+            tdfs4ds.data_distribution.data_distribution_catalog_creation()
+        if not filter_manager_exists:
+            tdfs4ds.filter_manager.filter_manager_catalog_creation()
+
-    # Follow-up table handling
     if not followup_name_exists:
+        logger_safe("info", "Creating follow-up table: %s", followup_name)
         tdfs4ds.process_store.process_followup.follow_up_table_creation()
         tdfs4ds.FOLLOW_UP_NAME = followup_name
 
@@ -210,30 +207,31 @@ def connect(
 
     process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
     if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
-        print('upgrade to the latest DDL')
+        logger_safe("warning", "ENTITY_NULL_SUBSTITUTE column missing. Upgrading catalog.")
         tdfs4ds.process_store.process_store_catalog_management.upgrade_process_catalog()
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
 
-    # Dataset
+    # Dataset Catalog
     tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
-    dataset_catalog = DatasetCatalog(schema_name=database, name=
+    dataset_catalog = DatasetCatalog(schema_name=database, name=dataset_catalog_name)
     if not dataset_catalog._exists():
         dataset_catalog.create_catalog()
+        logger_safe("info", "Dataset catalog created: %s", dataset_catalog_name)
 
-    #
+    # Detect temporal distribution
     def is_data_distribution_temporal():
         return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(
             view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
             schema_name=tdfs4ds.SCHEMA,
             object_type='table'
         )
+
     tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
-    return True
+    logger_safe("info", "Connected to feature store successfully.")
+    return True
+
 
 
 
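Illustrative note (not part of the package diff): connect() now raises as soon as database is None, and returns False when catalog objects are missing and create_if_missing stays at its default of False. A minimal sketch, with "MY_FEATURE_DB" as a hypothetical database name:

    import tdfs4ds
    ok = tdfs4ds.connect(database="MY_FEATURE_DB", create_if_missing=True)
    if not ok:
        raise RuntimeError("feature store catalogs could not be found or created")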
@@ -287,50 +285,22 @@ def get_dataset_entity(dataset_id = None):
 def get_dataset_features(dataset_id = None):
     return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
 
-def run(process_id, return_dataset
+def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None):
     """
     Executes a specific process from the feature store identified by the process ID.
-
-    Parameters:
-    - process_id (str): The unique identifier of the process to run.
-    - return_dataset (bool, optional): A flag indicating whether to return the dataset created during the process.
-      Default is False.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is False.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is None.
-
-    Returns:
-    DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
-
-    This function performs the following steps:
-    1. Determines the process type and initializes necessary variables.
-    2. Constructs and executes a SQL query to retrieve process details by process ID.
-    3. Fetches the filter manager, process type, primary index, partitioning, and data domain from the query result.
-    4. Handles different process types, such as 'denormalized view' and 'tdstone2 view'.
-    5. For 'denormalized view' process type, extracts necessary details, fetches data, and uploads features to the feature store.
-    6. Optionally returns the dataset created during the process if return_dataset is True.
-
-    Note:
-    - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
-      data retrieval to feature uploading.
-    - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
-      a Teradata database and the appropriate schema for feature storage.
+    Uses global `logger` for diagnostics (gated by tdfs4ds.DISPLAY_LOGS).
     """
 
     if tdfs4ds.PROCESS_TYPE is None:
         PROCESS_TYPE_ = 'RUN PROCESS'
-        tdfs4ds.RUN_ID
+        tdfs4ds.RUN_ID = str(uuid.uuid4())
     else:
         PROCESS_TYPE_ = tdfs4ds.PROCESS_TYPE
 
-    if tdfs4ds
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "def run | tdfs4ds.FEATURE_STORE_TIME=%s", tdfs4ds.FEATURE_STORE_TIME)
 
-    if tdfs4ds.FEATURE_STORE_TIME
+    if tdfs4ds.FEATURE_STORE_TIME is None:
         validtime_statement = 'CURRENT VALIDTIME'
     else:
         validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
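Illustrative note (not part of the package diff): the run() signature now spells out its defaults explicitly. A minimal sketch, with "my-process-id" as a hypothetical identifier taken from the process catalog:

    import tdfs4ds
    dataset = tdfs4ds.run(
        process_id="my-process-id",
        return_dataset=True,       # return the dataset built by the process
        force_compute=False,       # skip recomputation when a completed run already exists
        force_varchar_length=None  # keep original VARCHAR lengths
    )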
@@ -342,148 +312,110 @@ def run(process_id, return_dataset = False, force_compute = False, force_varchar
         WHERE A.PROCESS_ID = '{process_id}'
     """
 
+    logger_safe(
+        "info",
+        "Starting run | run_id=%s | process_type=%s | process_id=%s | return_dataset=%s | force_compute=%s | force_varchar_length=%s",
+        tdfs4ds.RUN_ID, PROCESS_TYPE_, process_id, return_dataset, force_compute, force_varchar_length
+    )
+
     # Executing the query and converting the result to Pandas DataFrame
     df = tdml.DataFrame.from_query(query).to_pandas()
 
-    # Check if exactly one record is returned, else
+    # Check if exactly one record is returned, else log an error and return
     if df.shape[0] != 1:
+        logger_safe(
+            "error",
+            "Process catalog lookup returned %s record(s); expected 1. Check table %s.%s. Query: %s",
+            df.shape[0], tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW, query.strip()
+        )
         return
 
     # Fetching the filter manager
     filter_schema_name = df['FILTER_DATABASE_NAME'].values[0]
     if filter_schema_name is None:
         filtermanager = None
     else:
         filter_view_name = df['FILTER_VIEW_NAME'].values[0]
-        filter_table_name = df['FILTER_TABLE_NAME'].values[0]
+        filter_table_name = df['FILTER_TABLE_NAME'].values[0]  # kept for parity; not used directly here
         filtermanager = FilterManager(table_name=filter_view_name, schema_name=filter_schema_name)
 
-    # Fetching
-    process_type
-
-    # Fetching the primary index from the query result
-    primary_index = df['FOR_PRIMARY_INDEX'].values[0]
+    # Fetching process metadata
+    process_type = df['PROCESS_TYPE'].values[0]
+    primary_index = df['FOR_PRIMARY_INDEX'].values[0]
     if primary_index is not None:
-        primary_index = primary_index.split(',')
+        primary_index = [x.strip() for x in primary_index.split(',') if x.strip()]
+    partitioning = df['FOR_DATA_PARTITIONING'].values[0]
+    DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
+
+    logger_safe(
+        "info",
+        "Process metadata | process_id=%s | process_type=%s | primary_index=%s | partitioning=%s | data_domain=%s | validtime=%s",
+        process_id, process_type, primary_index, partitioning, DATA_DOMAIN, validtime_statement
+    )
 
     # Handling 'denormalized view' process type
     if process_type == 'denormalized view':
-        entity_id = df['ENTITY_ID'].values[0].split(',')
+        view_name = df['VIEW_NAME'].values[0]
+        entity_id = [x.strip() for x in df['ENTITY_ID'].values[0].split(',') if x.strip()]
         entity_null_substitute = eval(df['ENTITY_NULL_SUBSTITUTE'].values[0])
-        feature_names
+        feature_names = [x.strip() for x in df['FEATURE_NAMES'].values[0].split(',') if x.strip()]
 
-        # Fetching data and uploading features to the feature store
         df_data = tdml.DataFrame(tdml.in_schema(view_name.split('.')[0], view_name.split('.')[1]))
 
-        if tdfs4ds
+        if getattr(tdfs4ds, "DEBUG_MODE", False):
+            logger_safe("debug", "run | entity_id=%s", entity_id)
+            logger_safe("debug", "run | entity_null_substitute=%s", entity_null_substitute)
+            logger_safe("debug", "run | feature_names=%s", feature_names)
+            logger_safe("debug", "run | process_id=%s", process_id)
+            logger_safe("debug", "run | primary_index=%s", primary_index)
+            logger_safe("debug", "run | partitioning=%s", partitioning)
+
         dataset = _upload_features(
             df_data,
             entity_id,
             feature_names,
-            feature_versions
-            primary_index
-            partitioning
-            filtermanager
-            entity_null_substitute
-            process_id
-            force_compute=
-            force_varchar_length
+            feature_versions=process_id,
+            primary_index=primary_index,
+            partitioning=partitioning,
+            filtermanager=filtermanager,
+            entity_null_substitute=entity_null_substitute,
+            process_id=process_id,
+            force_compute=force_compute,
+            force_varchar_length=force_varchar_length
         )
 
     # Handling 'tdstone2 view' process type
     elif process_type == 'tdstone2 view':
+        logger_safe("warning", "Process type 'tdstone2 view' not implemented yet for process_id=%s", process_id)
+        dataset = None
 
+    else:
+        logger_safe("error", "Unknown process type '%s' for process_id=%s", process_type, process_id)
+        dataset = None
 
     if return_dataset:
+        logger_safe("info", "Run finished with dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
         return dataset
     else:
+        logger_safe("info", "Run finished without dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
         return
 
-def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
-    """
-    Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
-    process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
-    for further use or inspection.
-
-    The function supports dynamic entity ID interpretation and flexible feature name handling, ensuring compatibility
-    with various data schemas. It automatically registers the data upload process and applies additional metadata,
-    if provided.
-
-    Parameters:
-    - df (DataFrame): The DataFrame containing the feature data to be uploaded.
-    - entity_id (dict, list, or str): The identifier of the entity to which the features belong. This can be:
-        - a dictionary mapping column names to their data types,
-        - a list of column names, which will be automatically converted to a dictionary with types inferred from `df`,
-        - a string representing a single column name, which will be converted into a list and then to a dictionary as above.
-    - feature_names (list or str): The names of the features to be uploaded. If a string is provided, it will be
-      split into a list based on commas or treated as a single feature name.
-    - metadata (dict, optional): Additional metadata to associate with the upload process. Defaults to an empty dictionary.
-    - primary_index (list, optional): Specifies the primary index columns for optimizing data storage and retrieval.
-    - partitioning (str, optional): Defines how the data should be partitioned in the store for performance optimization.
-    - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
-    - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
-      Default is an empty dictionary.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is True.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is 1024.
-    Returns:
-    DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
-      or further processing.
 
-    ... tx_type LIKE 'DEBIT',
-    ... tx_type LIKE 'PAYMENT',
-    ... tx_type LIKE 'CASH_OUT',
-    ... tx_type LIKE 'CASH_IN',
-    ... tx_type LIKE 'TRANSFER',
-    ... NO CASE,
-    ... UNKNOWN)'''
-    >>> features = [x for x in tddf.columns if x not in entity_id]
-    >>> dataset = upload_features(
-    ...     df = tddf,
-    ...     entity_id = entity_id,
-    ...     feature_names = features,
-    ...     metadata = {'project': 'test'},
-    ...     primary_index = primary_index,
-    ...     partitioning = partitioning
-    ... )
+def upload_features(
+        df,
+        entity_id,
+        feature_names,
+        metadata={},
+        primary_index=None,
+        partitioning='',
+        filtermanager=None,
+        entity_null_substitute={},
+        force_compute=True,
+        force_varchar_length=1024
+):
+    """
+    Uploads feature data from a DataFrame to the feature store for a specified entity.
+    All diagnostics go through `logger_safe()` which respects `tdfs4ds.DISPLAY_LOGS`.
     """
 
     from tdfs4ds.utils.info import get_column_types
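Illustrative note (not part of the package diff): the long docstring of upload_features(), including its usage example, was removed, and only fragments of that example survive in this rendering. A minimal sketch of the call pattern, assuming a teradataml DataFrame named tddf with an entity column customer_id (both hypothetical names):

    import tdfs4ds
    entity_id = ["customer_id"]
    features = [c for c in tddf.columns if c not in entity_id]
    dataset = tdfs4ds.upload_features(
        df=tddf,
        entity_id=entity_id,
        feature_names=features,
        metadata={"project": "test"},
    )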
@@ -491,45 +423,42 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
     from tdfs4ds.process_store.process_registration_management import register_process_view
 
     # Convert entity_id to a dictionary if it's not already one
-    if
+    if isinstance(entity_id, list):
         entity_id.sort()
         entity_id = get_column_types(df, entity_id)
-    elif
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    elif isinstance(entity_id, str):
         entity_id = [entity_id]
         entity_id = get_column_types(df, entity_id)
-        if
-            feature_names = feature_names.split(',')
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    # Normalize feature_names
+    if not isinstance(feature_names, list):
+        logger_safe("debug", "feature_names is not a list: %s", feature_names)
+        if isinstance(feature_names, str) and ',' in feature_names:
+            feature_names = [x.strip() for x in feature_names.split(',')]
         else:
             feature_names = [feature_names]
-    if primary_index is not None and
-        primary_index = primary_index.split(',')
+        logger_safe("debug", "feature_names converted to list: %s", feature_names)
+        logger_safe("debug", "Check the conversion is as expected.")
+
+    # Normalize primary_index
+    if primary_index is not None and not isinstance(primary_index, list):
+        logger_safe("debug", "primary_index is not a list: %s", primary_index)
+        if isinstance(primary_index, str) and ',' in primary_index:
+            primary_index = [x.strip() for x in primary_index.split(',')]
         else:
             primary_index = [primary_index]
-        print('check it is a expected.')
+        logger_safe("debug", "primary_index converted to list: %s", primary_index)
+        logger_safe("debug", "Check the conversion is as expected.")
 
+    # Partitioning
     partitioning = tdfs4ds.utils.info.generate_partitioning_clause(partitioning=partitioning)
 
-    print("filtermanager", filtermanager)
+    logger_safe("debug", "filtermanager: %s", filtermanager)
 
-    # Register
+    # Register process -> get SQL(s) + process_id
     query_insert, process_id, query_insert_dist, query_insert_filtermanager = register_process_view.__wrapped__(
         view_name = df,
         entity_id = entity_id,
@@ -542,104 +471,96 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
         entity_null_substitute = entity_null_substitute
     )
 
-    execute_query(query_insert)
-    execute_query(query_insert_dist)
-    if tdfs4ds.DEBUG_MODE:
-        print("query_insert_filtermanager",query_insert_filtermanager)
-    if query_insert_filtermanager is not None:
-        execute_query(query_insert_filtermanager)
-
-    # Run the registered process and return the resulting dataset
-    PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
-    tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
-    if tdfs4ds.BUILD_DATASET_AT_UPLOAD: tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
-    tdfs4ds.RUN_ID = str(uuid.uuid4())
+    logger_safe("info", "Registered process (process_id=%s) for upload_features", process_id)
 
+    # Execute queries
+    try:
+        execute_query(query_insert)
+        logger_safe("info", "Executed main insert query for process_id=%s", process_id)
+    except Exception as e:
+        logger_safe("exception", "Main insert query failed for process_id=%s", process_id)
+        raise
 
+    try:
+        execute_query(query_insert_dist)
+        logger_safe("info", "Executed distribution insert query for process_id=%s", process_id)
+    except Exception as e:
+        logger_safe("exception", "Distribution insert query failed for process_id=%s", process_id)
+        raise
 
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        # Avoid dumping entire SQL in normal logs; keep it debug-only.
+        logger_safe("debug", "query_insert_filtermanager: %s", query_insert_filtermanager)
 
+    if query_insert_filtermanager is not None:
+        try:
+            execute_query(query_insert_filtermanager)
+            logger_safe("info", "Executed filtermanager insert query for process_id=%s", process_id)
         except Exception as e:
-            run_id = tdfs4ds.RUN_ID,
-            process_type = tdfs4ds.PROCESS_TYPE,
-            process_id = process_id,
-            status = 'FAILED,' + str(e).split('\n')[0]
-            )
+            logger_safe("exception", "Filtermanager insert query failed for process_id=%s", process_id)
             raise
 
+    # Run the registered process (with/without dataset)
+    PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
+    tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
+    if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
+        tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
+    tdfs4ds.RUN_ID = str(uuid.uuid4())
 
+    logger_safe(
+        "info",
+        "Starting run (run_id=%s, process_type=%s, process_id=%s, force_compute=%s, force_varchar_length=%s)",
+        tdfs4ds.RUN_ID, tdfs4ds.PROCESS_TYPE, process_id, force_compute, force_varchar_length
+    )
 
+    try:
+        if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
+            dataset = run(
+                process_id=process_id,
+                return_dataset=True,
+                force_compute=force_compute,
+                force_varchar_length=force_varchar_length
+            )
+            logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
+            return dataset
+        else:
+            run(
+                process_id=process_id,
+                return_dataset=False,
+                force_compute=force_compute,
+                force_varchar_length=force_varchar_length
+            )
+            logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
+            return
+
+    except Exception as e:
+        # Keep your existing follow-up close behavior, but ensure the error is logged.
         try:
-            run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
-        except Exception as e:
             tdfs4ds.process_store.process_followup.followup_close(
                 run_id = tdfs4ds.RUN_ID,
                 process_type = tdfs4ds.PROCESS_TYPE,
                 process_id = process_id,
                 status = 'FAILED,' + str(e).split('\n')[0]
             )
-
-    Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
-    feature registration, preparation for ingestion, and storage in the designated feature tables.
-
-    Parameters:
-    - df (DataFrame): The input DataFrame containing the feature data.
-    - entity_id (str or dict): The identifier for the entity to which these features belong. This can be a single ID
-      (str) or a dictionary of attribute names and values uniquely identifying the entity.
-    - feature_names (list): A list of strings specifying the names of the features to be uploaded.
-    - feature_versions (str or list, optional): Specifies the versions of the features to be uploaded. Can be a single
-      string applied to all features or a list of strings specifying the version
-      for each feature respectively. Default is 'dev.0.0'.
-    - primary_index (list, optional): Specifies the columns to be used as the primary index in the feature store tables.
-      This can significantly impact the performance of data retrieval operations.
-    - partitioning (str, optional): A string indicating the partitioning strategy for the feature store tables, which can
-      enhance query performance based on the access patterns.
-    - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
-    - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
-      Default is an empty dictionary.
-    - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is False.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is None.
+        finally:
+            logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
+                        tdfs4ds.RUN_ID, process_id, str(e).split('\n')[0]
+            )
+            raise
+    finally:
+        # Restore previous process type just in case the caller relies on it.
+        tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
 
 
-    Returns:
-    DataFrame: A DataFrame representing the dataset view created in the feature store, detailing the features and their
-      metadata, including versions and storage locations.
-
-    This function orchestrates several steps involved in feature storage:
-    1. Registers the entity in the feature store if not already present.
-    2. Determines the data types of the features based on the input DataFrame.
-    3. Registers the features, including their names, types, and versions, in the feature catalog.
-    4. Prepares the feature data for ingestion, including any necessary transformations.
-    5. Stores the prepared feature data in the feature store.
-    6. Optionally, cleans up temporary resources used during the process.
-    7. Builds and returns a view of the dataset representing the uploaded features for easy access.
 
+def _upload_features(
+        df, entity_id, feature_names,
+        feature_versions=FEATURE_VERSION_DEFAULT,
+        primary_index=None, partitioning='',
+        filtermanager=None, entity_null_substitute={},
+        process_id=None, force_compute=False,
+        force_varchar_length=None
+):
     from tdfs4ds.feature_store.entity_management import register_entity
     from tdfs4ds.feature_store.feature_store_management import Gettdtypes
     from tdfs4ds.feature_store.feature_store_management import register_features
@@ -647,235 +568,149 @@ def _upload_features(df, entity_id, feature_names,
     from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
     from tdfs4ds.utils.info import get_column_types, update_varchar_length
 
-    # Convert entity_id to a dictionary if
-    if
+    # Convert entity_id to a dictionary if not already
+    if isinstance(entity_id, list):
         entity_id.sort()
         entity_id = get_column_types(df, entity_id)
-        entity_id
-    #register_entity(entity_id, primary_index=primary_index, partitioning=partitioning)
-
-    # If feature_versions is a list, create a dictionary mapping each feature name to its corresponding version.
-    # If feature_versions is a string, create a dictionary mapping each feature name to this string.
-    if type(feature_versions) == list:
-        selected_features = {k: v for k, v in zip(feature_names, feature_versions)}
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+    elif isinstance(entity_id, str):
+        entity_id = get_column_types(df, [entity_id])
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    # Map feature versions
+    if isinstance(feature_versions, list):
+        selected_features = dict(zip(feature_names, feature_versions))
     else:
         selected_features = {k: feature_versions for k in feature_names}
 
-    # Get
-    feature_names_types = Gettdtypes(
-        df,
-        features_columns=feature_names,
-        entity_id=entity_id
-    )
+    # Get Teradata types for features
+    feature_names_types = Gettdtypes(df, features_columns=feature_names, entity_id=entity_id)
 
     if force_varchar_length is not None:
-        feature_names_types = update_varchar_length(
+        logger_safe("debug", "Updating VARCHAR lengths with force_varchar_length=%s", force_varchar_length)
+        feature_names_types = update_varchar_length(
+            feature_names_types,
+            new_varchar_length=force_varchar_length
+        )
 
     def validate_feature_types(feature_names_types):
-        feature_names_types (dict): A dictionary where keys are feature names and values are their data types.
-
-        Raises:
-        ValueError: If any feature type contains 'clob', 'blob', or 'json'.
-        """
-        invalid_types = {key: value['type'] for key, value in feature_names_types.items()
-                         if any(term in value['type'].lower() for term in ['clob', 'blob', 'json'])}
-
-        if invalid_types:
+        invalid = {
+            k: v['type'] for k, v in feature_names_types.items()
+            if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
+        }
+        if invalid:
             raise ValueError(
-                f"
-                "
+                f"Unsupported data types found: {invalid}. "
+                "CLOB/BLOB/JSON are not supported."
             )
-
-    validate_feature_types(feature_names_types)
-
+
+    validate_feature_types(feature_names_types)
+
+    logger_safe("info", "Registering entity %s in feature store", entity_id)
     register_entity(entity_id, feature_names_types, primary_index=primary_index, partitioning=partitioning)
 
-    if tdfs4ds
-        primary_index,
-        partitioning
-    )
-
-    if tdfs4ds.DEBUG_MODE:
-        print("---------_upload_features")
-        print("filtermanager : ", filtermanager)
-        print("feature names : ", feature_names)
-        print("selected features : ", selected_features)
-
-    if process_id is not None and tdfs4ds.FEATURE_STORE_TIME is not None:
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe(
+            "debug",
+            "_upload_features entity_id=%s null_substitute=%s features=%s primary_index=%s partitioning=%s",
+            entity_id, entity_null_substitute, feature_names, primary_index, partitioning
+        )
+        logger_safe("debug", "selected_features=%s df.columns=%s", selected_features, df.columns)
+
+    register_features(entity_id, feature_names_types, primary_index, partitioning)
+    logger_safe("info", "Features registered in catalog: %s", feature_names)
+
+    follow_up = None
+    if process_id and tdfs4ds.FEATURE_STORE_TIME:
         follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
-        follow_up = follow_up[
-        do_compute = False
+        follow_up = follow_up[
+            (follow_up.STATUS == 'COMPLETED') &
+            (follow_up.VALIDTIME_DATE.isna() == False) &
+            (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) &
+            (follow_up.PROCESS_ID == process_id)
+        ]
 
+    if filtermanager is None:
+        do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
         if do_compute or force_compute:
+            logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
             tdfs4ds.process_store.process_followup.followup_open(
-                run_id
-                process_type
-                process_id
+                run_id=tdfs4ds.RUN_ID,
+                process_type=tdfs4ds.PROCESS_TYPE,
+                process_id=process_id
             )
             try:
-                prepared_features,
-                df,
-                entity_id,
-                feature_names,
+                prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
+                    df, entity_id, feature_names,
                     feature_versions=selected_features,
                     primary_index=primary_index,
                     entity_null_substitute=entity_null_substitute,
                     partitioning=partitioning
                 )
-                    volatile_table_name,
-                    entity_null_substitute=entity_null_substitute,
-                    primary_index=primary_index,
-                    partitioning=partitioning,
-                    features_infos = features_infos
-                )
-
-                # Collect statistics
-                apply_collect_stats(
-                    entity_id,
-                    primary_index = primary_index,
-                    partitioning = partitioning,
-                    feature_infos = features_infos
-                )
+                store_feature(entity_id, volatile_table, entity_null_substitute,
+                              primary_index, partitioning, features_infos)
+                apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
                 tdfs4ds.process_store.process_followup.followup_close(
-                    run_id
-                    process_type
-                    process_id
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id
                 )
+                logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
 
             except Exception as e:
+                logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
                 tdfs4ds.process_store.process_followup.followup_close(
-                    run_id
-                    process_type
-                    process_id
-                    status
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id,
+                    status='FAILED,' + str(e).split('\n')[0]
                 )
                 raise
-    else:
-        # get the total number of filter condition in the filter manager
-        nb_filters = filtermanager.nb_filters
 
+    else:
+        logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
        something_computed = False
+        for i in range(filtermanager.nb_filters):
+            filtermanager.update(i + 1)
+            logger_safe("debug", "Applying filter %s/%s:\n%s",
+                        i + 1, filtermanager.nb_filters, filtermanager.display())
 
-        for i in range(nb_filters):
-
-            # place the cursor on the next filter
-            filtermanager.update(i+1)
-
-            if filtermanager.time_filtering:
-                # if the filter manager is hybrid, then synchronize the time with tdfs4ds
-                tdfs4ds.FEATURE_STORE_TIME = filtermanager.get_date_in_the_past()
-
-            # overwrite the follow up table to tilter on the VALIDTIME_DATE too
-            follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
-            follow_up = follow_up[(follow_up.STATUS == 'COMPLETED') & (follow_up.VALIDTIME_DATE.isna() == False) & (
-                follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) & (follow_up.PROCESS_ID == process_id)]
-
-            # initialize do_compute, the flag that something has to be computed
             do_compute = True
-            follow_up_ = follow_up.assign(APPLIED_FILTER=follow_up.APPLIED_FILTER.cast(tdml.VARCHAR(20000))).join(
-                tdml.DataFrame.from_query(
-                    f"""
-                    SELECT
-                    CAST(JSON_AGG({','.join(filtermanager.col_names)}) AS VARCHAR(20000)) AS APPLIED_FILTER
-                    FROM {filtermanager.schema_name}.{filtermanager.view_name}
-                    """
-                ),
-                on = 'APPLIED_FILTER',
-                how = 'inner',
-                lprefix = 'l',
-                rprefix = 'r'
-            )
-            # if already computed and completed, then do_compute is set to False
-            if follow_up_.shape[0] > 0:
+            if process_id and tdfs4ds.FEATURE_STORE_TIME:
+                # see if already computed
+                follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
+                if follow_up.shape[0] > 0:
                     do_compute = False
 
-            if tdfs4ds.DISPLAY_LOGS:
-                print(filtermanager.display())
-
             if do_compute or force_compute:
                 tdfs4ds.process_store.process_followup.followup_open(
-                    run_id
-                    process_type
-                    process_id
-                    filtermanager
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id,
+                    filtermanager=filtermanager
                 )
                 try:
-                    feature_names,
-                    feature_versions = selected_features,
-                    primary_index = primary_index,
-                    entity_null_substitute = entity_null_substitute,
-                    partitioning = partitioning
-                    )
-
-                    # Store the prepared features in the feature store.
-                    store_feature(
-                        entity_id,
-                        volatile_table_name,
+                    prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
+                        df, entity_id, feature_names,
+                        feature_versions=selected_features,
+                        primary_index=primary_index,
                         entity_null_substitute=entity_null_substitute,
-                        partitioning = partitioning,
-                        features_infos=features_infos
+                        partitioning=partitioning
                     )
+                    store_feature(entity_id, volatile_table, entity_null_substitute,
+                                  primary_index, partitioning, features_infos)
                     something_computed = True
 
                     tdfs4ds.process_store.process_followup.followup_close(
                         run_id=tdfs4ds.RUN_ID,
                         process_type=tdfs4ds.PROCESS_TYPE,
                         process_id=process_id,
-                        filtermanager
+                        filtermanager=filtermanager
                     )
 
                 except Exception as e:
+                    logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
                     tdfs4ds.process_store.process_followup.followup_close(
                         run_id=tdfs4ds.RUN_ID,
                         process_type=tdfs4ds.PROCESS_TYPE,
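Illustrative note (not part of the package diff): when a FilterManager is attached, _upload_features now loops over every stored filter, moving the cursor with update(i + 1) and ingesting one slice per iteration. A minimal sketch of that control flow, using only names visible in the hunk above plus a hypothetical ingest_current_slice() stand-in for the prepare/store/collect-stats steps:

    for i in range(filtermanager.nb_filters):
        filtermanager.update(i + 1)          # point the filter view at slice i + 1
        logger_safe("debug", "Applying filter %s/%s:\n%s",
                    i + 1, filtermanager.nb_filters, filtermanager.display())
        ingest_current_slice()               # hypothetical stand-in for the actual ingestion block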
@@ -884,41 +719,28 @@ def _upload_features(df, entity_id, feature_names,
                         filtermanager=filtermanager
                     )
                     raise
-        # Clean up by dropping the temporary volatile table.
-        # tdml.execute_sql(f'DROP TABLE {volatile_table_name}')
 
-        # Collect statistics only if something has been computed
         if something_computed:
-            apply_collect_stats(
-                entity_id,
-                primary_index = primary_index,
-                partitioning = partitioning,
-                feature_infos = features_infos
-            )
+            apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
-    # Build a dataset view in the feature store.
     if tdfs4ds.BUILD_DATASET_AT_UPLOAD:
+        logger_safe("info", "Building dataset for validation...")
         try:
-                entity_id,
-                selected_features,
+            return build_dataset(
+                entity_id, selected_features,
                 view_name=None,
-                entity_null_substitute
+                entity_null_substitute=entity_null_substitute
             )
         except Exception as e:
-            print('entity :', entity_id)
-            print('selected features :', selected_features)
-
-            # Return the dataset view.
-            return dataset
+            logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+            logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
     else:
+        logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False)")
     return
 
 
+
+
 def build_dataset(entity_id, selected_features, view_name, schema_name=None, comment=None, return_query=False,
                   feature_store_time=False, join_type='INNER'):
     """