tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +769 -571
- tdfs4ds/feature_store/feature_data_processing.py +370 -300
- tdfs4ds/feature_store/feature_query_retrieval.py +105 -52
- tdfs4ds/feature_store/feature_store_management.py +226 -231
- tdfs4ds/genai/__init__.py +27 -0
- tdfs4ds/genai/documentation.py +1878 -0
- tdfs4ds/process_store/process_followup.py +113 -2
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/process_store/process_store_catalog_management.py +79 -26
- tdfs4ds/utils/filter_management.py +548 -138
- tdfs4ds/utils/query_management.py +18 -40
- tdfs4ds/utils/time_management.py +565 -98
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.5.1.dist-info}/METADATA +1 -1
- tdfs4ds-0.2.5.1.dist-info/RECORD +32 -0
- tdfs/__init__.py +0 -1
- tdfs/data/curves.csv +0 -5086
- tdfs/datasets.py +0 -27
- tdfs/feature_store.py +0 -723
- tdfs4ds/feature_engineering.py +0 -152
- tdfs4ds/feature_store.py +0 -1529
- tdfs4ds/process_store.py +0 -387
- tdfs4ds/utils.py +0 -579
- tdfs4ds-0.2.4.26.dist-info/RECORD +0 -38
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.5.1.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.5.1.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
@@ -1,5 +1,8 @@
-__version__ = '0.2.4.26'
+__version__ = '0.2.5.1'
+import difflib
 import logging
+import json
+
 # Setup the logger
 logging.basicConfig(
     level=logging.INFO,
@@ -7,11 +10,21 @@ logging.basicConfig(
     datefmt='%Y-%m-%d %H:%M:%S'  # Set the date/time format
 )
 
+# Helper: central logging gate controlled by tdfs4ds.DISPLAY_LOGS
+def logger_safe(level, message, *args, **kwargs):
+    """
+    Wrapper around the global `logger` that only emits logs when
+    tdfs4ds.DISPLAY_LOGS is True. `level` is a string like "info", "error", etc.
+    """
+    if getattr(tdfs4ds, "DISPLAY_LOGS", True):
+        getattr(logger, level)(message, *args, **kwargs)
+
 logger = logging.getLogger(__name__)
 
 from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
 from tdfs4ds.process_store.process_followup import follow_up_report
 from tdfs4ds.dataset.dataset_catalog import DatasetCatalog, Dataset
+from . import genai
 
 DATA_DOMAIN = None
 SCHEMA = None
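The new `logger_safe` helper is the central plumbing change of this release: diagnostics throughout the module now funnel through it instead of bare `print()` calls. A minimal usage sketch, assuming the released defaults (when `DISPLAY_LOGS` is unset, `getattr` falls back to True and logging stays on):

    import tdfs4ds

    tdfs4ds.DISPLAY_LOGS = False  # logger_safe("info", ...) becomes a no-op
    tdfs4ds.DISPLAY_LOGS = True   # records flow to the stdlib logger again

Because the level is resolved dynamically with `getattr(logger, level)`, any method name the stdlib logger exposes ("debug", "info", "warning", "error", "exception") is a valid first argument.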
@@ -44,6 +57,18 @@ FEATURE_PARTITION_EACH = 1
 
 VARCHAR_SIZE = 1024
 
+INSTRUCT_MODEL_URL = None
+INSTRUCT_MODEL_API_KEY = None
+INSTRUCT_MODEL_MODEL = None
+INSTRUCT_MODEL_PROVIDER = None
+
+DOCUMENTATION_PROCESS_BUSINESS_LOGIC = 'FS_PROCESS_DOCUMENTATION_BUSINESS_LOGIC'
+DOCUMENTATION_PROCESS_FEATURES = 'FS_PROCESS_DOCUMENTATION_FEATURES'
+DOCUMENTATION_PROCESS_BUSINESS_LOGIC_VIEW = 'FS_V_PROCESS_DOCUMENTATION_BUSINESS_LOGIC'
+DOCUMENTATION_PROCESS_FEATURES_VIEW = 'FS_V_PROCESS_DOCUMENTATION_FEATURES'
+DOCUMENTATION_PROCESS_EXPLAIN = 'FS_PROCESS_DOCUMENTATION_EXPLAIN'
+DOCUMENTATION_PROCESS_EXPLAIN_VIEW = 'FS_V_PROCESS_DOCUMENTATION_EXPLAIN'
+
 import warnings
 warnings.filterwarnings('ignore')
 
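Four instruct-model globals and six documentation table/view names back the new `tdfs4ds.genai` documentation module (added in this release as genai/__init__.py and genai/documentation.py). How the genai code consumes the INSTRUCT_MODEL_* values is not shown in this hunk; a hypothetical configuration sketch (every value below is a placeholder, not a package default):

    import tdfs4ds

    tdfs4ds.INSTRUCT_MODEL_PROVIDER = "my-provider"            # placeholder
    tdfs4ds.INSTRUCT_MODEL_URL = "https://llm.example.com/v1"  # placeholder endpoint
    tdfs4ds.INSTRUCT_MODEL_API_KEY = "my-api-key"              # placeholder secret
    tdfs4ds.INSTRUCT_MODEL_MODEL = "my-instruct-model"         # placeholder model name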
@@ -57,7 +82,7 @@ import tdfs4ds.datasets
 import time
 
 import inspect
-import tqdm
+from tqdm.auto import tqdm  # auto picks the right frontend (notebook/terminal)
 
 from tdfs4ds.feature_store.feature_data_processing import generate_on_clause
 
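The import swap matters because the name is later called directly (see the `tqdm(range(filtermanager.nb_filters), ...)` progress bar added further down); `tqdm.auto` also selects the widget frontend under Jupyter and the plain text bar elsewhere. A self-contained illustration:

    from tqdm.auto import tqdm

    for _ in tqdm(range(3), desc="demo", unit="step"):
        pass  # widget bar in a notebook, text bar in a terminal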
@@ -70,92 +95,85 @@ PROCESS_TYPE = 'RUN PROCESS'
 try:
     SCHEMA = tdml.context.context._get_current_databasename()
     if SCHEMA is None:
-
-
+        logger.warning("No default database detected for feature store.")
+        logger.warning('Please set it explicitly: tdfs4ds.feature_store.schema = "<feature store database>"')
     else:
-
-
+        logger.info("Default database detected for feature store: %s", SCHEMA)
+        logger.info('tdfs4ds.feature_store.schema = "%s"', SCHEMA)
+
     if DATA_DOMAIN is None:
         DATA_DOMAIN = SCHEMA
-
-
+        logger.info("DATA_DOMAIN not set. Defaulting to SCHEMA: %s", DATA_DOMAIN)
+        logger.info('You can override it using: tdfs4ds.DATA_DOMAIN = "<your data domain>"')
 
 except Exception as e:
-
-
+    logger.error("Could not determine current database: %s", str(e).split('\n')[0])
+    logger.warning("Please specify the feature store database manually:")
+    logger.warning('tdfs4ds.feature_store.schema = "<feature store database>"')
 
 
 def setup(database, if_exists='fail'):
     """
-
-
-    This function sets the database schema for feature and process catalogs. If specified, it also handles
-    the replacement of existing catalog tables. It reports the status of these operations, including any
-    encountered exceptions.
-
-    Parameters:
-    database (str): The name of the database schema to be used.
-    if_exists (str, optional): Determines the behavior if catalog tables already exist in the database.
-        'fail' (default) - Do nothing if the tables exist.
-        'replace' - Drop the tables if they exist before creating new ones.
-
-    Steps performed:
-    1. Sets the schema to the provided database name.
-    2. If 'if_exists' is 'replace', attempts to drop 'FS_FEATURE_CATALOG' and 'FS_PROCESS_CATALOG' tables.
-    3. Creates new feature and process catalog tables and sets their names in the tdfs4ds module.
-    4. Prints the names of the newly created tables along with the database name.
-    5. Captures and prints the first line of any exceptions that occur during these operations.
-
-    Returns:
-    None
+    Initialize the feature store environment by creating catalog tables and views.
     """
 
     from tdfs4ds.feature_store.feature_store_management import feature_store_catalog_creation
     from tdfs4ds.process_store.process_store_catalog_management import process_store_catalog_creation
 
     tdfs4ds.SCHEMA = database
+    logger_safe("info", "Setting up feature store in database: %s", database)
+
     if if_exists == 'replace':
-
-
-
-
-
-
-
-        print(str(e).split('\n')[0])
-        try:
-            tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
-        except Exception as e:
-            print(str(e).split('\n')[0])
+        logger_safe("info", "Replacing existing catalog tables if they exist.")
+        for table in [tdfs4ds.FEATURE_CATALOG_NAME, tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME]:
+            try:
+                tdml.db_drop_table(table_name=table, schema_name=database)
+                logger_safe("info", "Dropped table %s.%s", database, table)
+            except Exception as e:
+                logger_safe("warning", "Could not drop table %s.%s: %s", database, table, str(e).split('\n')[0])
 
         DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
+
     try:
         tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
-
+        logger_safe("info", "Feature catalog table created: %s in database %s", tdfs4ds.FEATURE_CATALOG_NAME, database)
     except Exception as e:
-
+        logger_safe("error", "Feature catalog creation failed: %s", str(e).split('\n')[0])
 
     try:
-        tdfs4ds.PROCESS_CATALOG_NAME,
-
-
-
+        (tdfs4ds.PROCESS_CATALOG_NAME,
+         tdfs4ds.DATA_DISTRIBUTION_NAME,
+         tdfs4ds.FILTER_MANAGER_NAME) = process_store_catalog_creation()
+
+        logger_safe("info", "Process catalog table created: %s", tdfs4ds.PROCESS_CATALOG_NAME)
+        logger_safe("info", "Data distribution table created: %s", tdfs4ds.DATA_DISTRIBUTION_NAME)
+        logger_safe("info", "Filter manager table created: %s", tdfs4ds.FILTER_MANAGER_NAME)
     except Exception as e:
-
+        logger_safe("error", "Process catalog creation failed: %s", str(e).split('\n')[0])
 
     try:
         tdfs4ds.process_store.process_followup.follow_up_table_creation()
+        logger_safe("info", "Follow-up table created successfully.")
     except Exception as e:
-
+        logger_safe("error", "Follow-up table creation failed: %s", str(e).split('\n')[0])
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
+
     dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
     if not dataset_catalog._exists():
         dataset_catalog.create_catalog()
+        logger_safe("info", "Dataset catalog created: %s", tdfs4ds.DATASET_CATALOG_NAME)
 
+    logger_safe("info", "Setup complete.")
+    try:
+        tdfs4ds.genai.documentations_tables_creation()
+        logger_safe("info", "Documentation tables created successfully.")
+    except Exception as e:
+        logger_safe("error", "Documentation tables creation failed: %s", str(e).split('\n')[0])
     return
 
+
 def connect(
     database = tdfs4ds.SCHEMA,
     feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
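A plausible call pattern for the reworked `setup()` (a sketch assuming an established teradataml connection; host and credentials are elided):

    import teradataml as tdml
    import tdfs4ds

    tdml.create_context(host="…", username="…", password="…")
    tdfs4ds.setup(database="MY_FEATURE_DB", if_exists="replace")

With the default `if_exists='fail'`, existing catalog tables are left alone; creation errors (including "already exists") are logged through `logger_safe` rather than raised, and the new genai documentation tables are attempted at the end of every call.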
@@ -166,37 +184,51 @@ def connect(
     feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
     process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
     dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME,
-
+    documentation_process_business_logic = tdfs4ds.DOCUMENTATION_PROCESS_BUSINESS_LOGIC,
+    documentation_process_features = tdfs4ds.DOCUMENTATION_PROCESS_FEATURES,
+    documentation_process_explain = tdfs4ds.DOCUMENTATION_PROCESS_EXPLAIN,
+    create_if_missing = False
 ):
-    if database is not None:
-        tdfs4ds.SCHEMA = database
-    else:
+    if database is None:
         raise ValueError("database parameter is None.")
+    tdfs4ds.SCHEMA = database
+    logger_safe("info", "Connecting to feature store in database: %s", database)
 
     tables = [x.lower() for x in list(tdml.db_list_tables(schema_name=tdfs4ds.SCHEMA, object_type='table').TableName.values)]
-
+
     feature_exists = feature_catalog_name.lower() in tables
     process_exists = process_catalog_name.lower() in tables
     distrib_exists = data_distribution_name.lower() in tables
     filter_manager_exists = filter_manager_name.lower() in tables
     followup_name_exists = followup_name.lower() in tables
+    documentation_process_business_logic_exist = documentation_process_business_logic.lower() in tables
+    documentation_process_features_exist = documentation_process_features.lower() in tables
+    documentation_process_explain_exist = documentation_process_explain.lower() in tables
 
-
+
+    if not (feature_exists and process_exists and distrib_exists and filter_manager_exists and documentation_process_business_logic_exist and documentation_process_features_exist):
         if not create_if_missing:
-
-
-
-
-
-
-
-
-
-
-
-
-
+            logger_safe("warning", "Feature store components missing and create_if_missing=False")
+            return False
+        logger_safe("info", "Missing components detected; creating missing parts...")
+        if not feature_exists:
+            logger_safe("info", "Creating feature catalog: %s", feature_catalog_name)
+            tdfs4ds.feature_store.feature_store_management.feature_store_catalog_creation()
+        if not process_exists:
+            logger_safe("info", "Creating process catalog: %s", process_catalog_name)
+            tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_creation()
+        if not distrib_exists:
+            logger_safe("info", "Creating data distribution table: %s", data_distribution_name)
+            tdfs4ds.data_distribution.data_distribution_catalog_creation()
+        if not filter_manager_exists:
+            logger_safe("info", "Creating filter manager table: %s", filter_manager_name)
+            tdfs4ds.filter_manager.filter_manager_catalog_creation()
+        if not documentation_process_business_logic_exist or not documentation_process_features_exist or not documentation_process_explain_exist:
+            logger_safe("info", "Creating documentation tables.")
+            tdfs4ds.genai.documentation_tables_creation()
+
     if not followup_name_exists:
+        logger_safe("info", "Creating follow-up table: %s", followup_name)
         tdfs4ds.process_store.process_followup.follow_up_table_creation()
     tdfs4ds.FOLLOW_UP_NAME = followup_name
 
@@ -210,20 +242,20 @@ def connect(
 
     process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
     if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
-
-        print('upgrade to the latest DDL')
+        logger_safe("warning", "ENTITY_NULL_SUBSTITUTE column missing. Upgrading catalog.")
        tdfs4ds.process_store.process_store_catalog_management.upgrade_process_catalog()
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
 
-    # Dataset
+    # Dataset Catalog
     tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
-    dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
+    dataset_catalog = DatasetCatalog(schema_name=database, name=dataset_catalog_name)
     if not dataset_catalog._exists():
         dataset_catalog.create_catalog()
+        logger_safe("info", "Dataset catalog created: %s", dataset_catalog_name)
 
-    #
+    # Detect temporal distribution
     def is_data_distribution_temporal():
         return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(
             view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
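A sketch of connecting to an existing store with the new documentation-table checks (a connected teradataml context is assumed, as above):

    import tdfs4ds

    ok = tdfs4ds.connect(database="MY_FEATURE_DB", create_if_missing=True)
    if not ok:
        raise RuntimeError("feature store components are missing")

`connect()` returns False only when components are missing and `create_if_missing` is False; otherwise it creates whatever is absent, upgrades the process catalog when the ENTITY_NULL_SUBSTITUTE column is missing, and returns True.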
@@ -231,10 +263,110 @@ def connect(
             object_type='table'
         )
 
+    query_data_domain = f"""
+    SELECT DISTINCT DATA_DOMAIN
+    FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+    UNION
+    SELECT DISTINCT DATA_DOMAIN
+    FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW}
+    """
+    data_domains = tdml.DataFrame.from_query(query_data_domain).to_pandas()['DATA_DOMAIN'].tolist()
+    logger_safe("info", "Data domains in feature store: %s", data_domains)
+
     tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = is_data_distribution_temporal()
-
-    return True
+    logger_safe("info", "Connected to feature store successfully.")
+    return True
+
+def get_data_domains(verbose=True):
+    """
+    Retrieve and display all data domains available in the feature store.
+    This function queries the feature store to obtain a list of all distinct data domains
+    that have been defined within the system. It combines data domains from both the process
+    catalog and the feature catalog, ensuring a comprehensive overview. The current data
+    domain in use is highlighted for easy identification.
+    Parameters:
+    - verbose (bool): If True, prints the list of data domains with the current one marked.
+    Returns:
+    - str: The current data domain in use.
+    """
+
+    query_data_domain = f"""
+    SELECT DISTINCT DATA_DOMAIN
+    FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+    UNION
+    SELECT DISTINCT DATA_DOMAIN
+    FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW}
+    """
+    data_domains = tdml.DataFrame.from_query(query_data_domain).to_pandas()['DATA_DOMAIN'].tolist()
+
+    if verbose:
+        print("Data Domains in Feature Store:")
+        for d in data_domains:
+            if d != tdfs4ds.DATA_DOMAIN:
+                print('\t'+d)
+            else:
+                print('*\t'+d)
+        if tdfs4ds.DATA_DOMAIN not in data_domains and tdfs4ds.DATA_DOMAIN is not None:
+            print("\nCurrent data domain (%s) not available yet in feature store. It may be a new one" % tdfs4ds.DATA_DOMAIN)
+        return
+    return data_domains
+
+def select_data_domain(data_domain):
+    """
+    Set the active data domain for feature store operations.
 
+    This function allows users to specify which data domain should be considered
+    as the current context for subsequent feature store operations. By setting
+    the data domain, users can ensure that all feature queries, registrations,
+    and other interactions with the feature store are scoped appropriately.
+    This is particularly useful in environments where multiple data domains
+    exist, allowing for clear separation and organization of features.
+
+    Parameters:
+    - data_domain (str): The name of the data domain to set as active.
+
+    Returns:
+    - str: The data domain that has been set as active.
+    """
+    data_domains = get_data_domains(verbose=False)
+    if data_domain not in data_domains:
+        logger_safe("error", "Data domain '%s' not found in feature store.", data_domain)
+        raise ValueError(f"Data domain '{data_domain}' not found in feature store.")
+    # suggest a data domain closest to the requested one
+    closest_domain = difflib.get_close_matches(data_domain, data_domains, n=1)
+    if data_domain in data_domains:
+        tdfs4ds.DATA_DOMAIN = data_domain
+    elif closest_domain:
+        logger_safe("info", "Did you mean '%s'?", closest_domain[0])
+        return
+    tdfs4ds.DATA_DOMAIN = data_domain
+    logger_safe("info", "Data domain set to: %s", data_domain)
+    return
+
+def create_data_domain(data_domain):
+    """
+    Create a new data domain in the feature store.
+
+    This function facilitates the creation of a new data domain within the feature store.
+    A data domain serves as a logical grouping for features, allowing for better organization
+    and management. By creating a new data domain, users can segregate features based on
+    specific criteria, such as business units, projects, or data types. This helps in
+    maintaining clarity and structure within the feature store, especially in environments
+    with diverse datasets and use cases.
+
+    Parameters:
+    - data_domain (str): The name of the new data domain to be created.
+
+    Returns:
+    - str: The name of the newly created data domain.
+    """
+    existing_domains = get_data_domains(verbose=False)
+    if data_domain in existing_domains:
+        logger_safe("warning", "Data domain '%s' already exists in feature store.", data_domain)
+        return data_domain
+    tdfs4ds.DATA_DOMAIN = data_domain
+    logger_safe("info", "Data domain '%s' created in locally.", data_domain)
+    return
 
 
 def feature_catalog():
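The three data-domain helpers are new in this release; a usage sketch (connected context assumed):

    import tdfs4ds

    tdfs4ds.get_data_domains()               # prints every domain, current one starred
    tdfs4ds.select_data_domain("SALES")      # raises ValueError when the name is unknown
    tdfs4ds.create_data_domain("MARKETING")  # sets tdfs4ds.DATA_DOMAIN locally only

Note that in `select_data_domain` as shipped, the ValueError for an unknown name is raised before the `difflib.get_close_matches` suggestion path can run, so the "Did you mean" hint is unreachable in this version.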
@@ -287,50 +419,22 @@ def get_dataset_entity(dataset_id = None):
 def get_dataset_features(dataset_id = None):
     return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
 
-def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
+def run(process_id, return_dataset=False, force_compute=False, force_varchar_length=None, dataset_view_name=None):
     """
     Executes a specific process from the feature store identified by the process ID.
-
-
-    Parameters:
-    - process_id (str): The unique identifier of the process to run.
-    - return_dataset (bool, optional): A flag indicating whether to return the dataset created during the process.
-      Default is False.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is False.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is None.
-
-    Returns:
-    DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
-
-    This function performs the following steps:
-    1. Determines the process type and initializes necessary variables.
-    2. Constructs and executes a SQL query to retrieve process details by process ID.
-    3. Fetches the filter manager, process type, primary index, partitioning, and data domain from the query result.
-    4. Handles different process types, such as 'denormalized view' and 'tdstone2 view'.
-    5. For 'denormalized view' process type, extracts necessary details, fetches data, and uploads features to the feature store.
-    6. Optionally returns the dataset created during the process if return_dataset is True.
-
-    Note:
-    - The function relies on various sub-modules within the `tdfs4ds` library for different steps of the process, from
-      data retrieval to feature uploading.
-    - It is intended to be used internally within a system that manages a Teradata feature store, assuming access to
-      a Teradata database and the appropriate schema for feature storage.
+    Uses global `logger` for diagnostics (gated by tdfs4ds.DISPLAY_LOGS).
     """
 
     if tdfs4ds.PROCESS_TYPE is None:
         PROCESS_TYPE_ = 'RUN PROCESS'
-        tdfs4ds.RUN_ID
+        tdfs4ds.RUN_ID = str(uuid.uuid4())
     else:
         PROCESS_TYPE_ = tdfs4ds.PROCESS_TYPE
 
-    if tdfs4ds.DEBUG_MODE:
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "def run | tdfs4ds.FEATURE_STORE_TIME=%s", tdfs4ds.FEATURE_STORE_TIME)
 
-    if tdfs4ds.FEATURE_STORE_TIME
+    if tdfs4ds.FEATURE_STORE_TIME is None:
         validtime_statement = 'CURRENT VALIDTIME'
     else:
         validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
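The VALIDTIME switch above is what makes `run()` temporal: a sketch of pinning the feature store clock before a run (the timestamp format mirrors the f-string above; the process id is a placeholder):

    import tdfs4ds

    tdfs4ds.FEATURE_STORE_TIME = '2024-01-31 00:00:00'  # VALIDTIME AS OF TIMESTAMP '...'
    tdfs4ds.run(process_id="my-process-id", return_dataset=False)
    tdfs4ds.FEATURE_STORE_TIME = None                   # back to CURRENT VALIDTIME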
@@ -342,148 +446,112 @@ def run(process_id, return_dataset = False, force_compute = False, force_varchar
         WHERE A.PROCESS_ID = '{process_id}'
     """
 
+    logger_safe(
+        "info",
+        "Starting run | run_id=%s | process_type=%s | process_id=%s | return_dataset=%s | force_compute=%s | force_varchar_length=%s",
+        tdfs4ds.RUN_ID, PROCESS_TYPE_, process_id, return_dataset, force_compute, force_varchar_length
+    )
+
     # Executing the query and converting the result to Pandas DataFrame
     df = tdml.DataFrame.from_query(query).to_pandas()
 
-    # Check if exactly one record is returned, else
+    # Check if exactly one record is returned, else log an error and return
     if df.shape[0] != 1:
-
-
-
+        logger_safe(
+            "error",
+            "Process catalog lookup returned %s record(s); expected 1. Check table %s.%s. Query: %s",
+            df.shape[0], tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW, query.strip()
+        )
         return
 
-
     # Fetching the filter manager
     filter_schema_name = df['FILTER_DATABASE_NAME'].values[0]
     if filter_schema_name is None:
         filtermanager = None
     else:
         filter_view_name = df['FILTER_VIEW_NAME'].values[0]
-        filter_table_name = df['FILTER_TABLE_NAME'].values[0]
+        filter_table_name = df['FILTER_TABLE_NAME'].values[0]  # kept for parity; not used directly here
         filtermanager = FilterManager(table_name=filter_view_name, schema_name=filter_schema_name)
 
-    # Fetching
-    process_type = df['PROCESS_TYPE'].values[0]
-
-    # Fetching the primary index from the query result
-    primary_index = df['FOR_PRIMARY_INDEX'].values[0]
+    # Fetching process metadata
+    process_type = df['PROCESS_TYPE'].values[0]
+    primary_index = df['FOR_PRIMARY_INDEX'].values[0]
     if primary_index is not None:
-        primary_index = primary_index.split(',')
-
-
-
-
-
-
+        primary_index = [x.strip() for x in primary_index.split(',') if x.strip()]
+    partitioning = df['FOR_DATA_PARTITIONING'].values[0]
+    DATA_DOMAIN = df['DATA_DOMAIN'].values[0]
+
+    logger_safe(
+        "info",
+        "Process metadata | process_id=%s | process_type=%s | primary_index=%s | partitioning=%s | data_domain=%s | validtime=%s",
+        process_id, process_type, primary_index, partitioning, DATA_DOMAIN, validtime_statement
+    )
 
     # Handling 'denormalized view' process type
     if process_type == 'denormalized view':
-
-
-        entity_id = df['ENTITY_ID'].values[0].split(',')
+        view_name = df['VIEW_NAME'].values[0]
+        entity_id = [x.strip() for x in df['ENTITY_ID'].values[0].split(',') if x.strip()]
         entity_null_substitute = eval(df['ENTITY_NULL_SUBSTITUTE'].values[0])
-        feature_names = df['FEATURE_NAMES'].values[0].split(',')
+        feature_names = [x.strip() for x in df['FEATURE_NAMES'].values[0].split(',') if x.strip()]
 
-        # Fetching data and uploading features to the feature store
         df_data = tdml.DataFrame(tdml.in_schema(view_name.split('.')[0], view_name.split('.')[1]))
 
-        if tdfs4ds.DEBUG_MODE:
-
-
-
-
-
-
+        if getattr(tdfs4ds, "DEBUG_MODE", False):
+            logger_safe("debug", "run | entity_id=%s", entity_id)
+            logger_safe("debug", "run | entity_null_substitute=%s", entity_null_substitute)
+            logger_safe("debug", "run | feature_names=%s", feature_names)
+            logger_safe("debug", "run | process_id=%s", process_id)
+            logger_safe("debug", "run | primary_index=%s", primary_index)
+            logger_safe("debug", "run | partitioning=%s", partitioning)
+
         dataset = _upload_features(
             df_data,
             entity_id,
             feature_names,
-            feature_versions = process_id,
-            primary_index = primary_index,
-            partitioning = partitioning,
-            filtermanager = filtermanager,
-            entity_null_substitute = entity_null_substitute,
-            process_id = process_id,
-            force_compute = force_compute,
-            force_varchar_length = force_varchar_length
+            feature_versions=process_id,
+            primary_index=primary_index,
+            partitioning=partitioning,
+            filtermanager=filtermanager,
+            entity_null_substitute=entity_null_substitute,
+            process_id=process_id,
+            force_compute=force_compute,
+            force_varchar_length=force_varchar_length,
+            dataset_view_name = dataset_view_name
         )
 
     # Handling 'tdstone2 view' process type
     elif process_type == 'tdstone2 view':
-
-
+        logger_safe("warning", "Process type 'tdstone2 view' not implemented yet for process_id=%s", process_id)
+        dataset = None
 
+    else:
+        logger_safe("error", "Unknown process type '%s' for process_id=%s", process_type, process_id)
+        dataset = None
 
     if return_dataset:
+        logger_safe("info", "Run finished with dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
         return dataset
     else:
+        logger_safe("info", "Run finished without dataset | run_id=%s | process_id=%s", tdfs4ds.RUN_ID, process_id)
         return
 
-def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
-    """
-    Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
-    process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
-    for further use or inspection.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
-      Default is an empty dictionary.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is True.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is 1024.
-    Returns:
-    DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
-      or further processing.
-
-    The process involves several steps, including entity ID type conversion if necessary, feature name normalization,
-    process registration in the feature store, and the execution of SQL queries to insert the data. The function concludes
-    by returning a dataset derived from the uploaded data, offering immediate access to the newly stored information.
-
-    Example:
-    >>> df = tdml.DataFrame(...)
-    >>> entity_id = ['customer_id']
-    >>> feature_names = ['age', 'income']
-    >>> dataset = upload_features(df, entity_id, feature_names)
-    >>> # Another example with list-based entity_id, custom primary_index, and partitioning
-    >>> tddf = tdml.DataFrame(...)  # Assuming tddf is predefined with appropriate columns
-    >>> entity_id = ['tx_type', 'txn_id']
-    >>> primary_index = ['txn_id']
-    >>> partitioning = '''
-    ... PARTITION BY CASE_N (
-    ...     tx_type LIKE 'DEBIT',
-    ...     tx_type LIKE 'PAYMENT',
-    ...     tx_type LIKE 'CASH_OUT',
-    ...     tx_type LIKE 'CASH_IN',
-    ...     tx_type LIKE 'TRANSFER',
-    ...     NO CASE,
-    ...     UNKNOWN)'''
-    >>> features = [x for x in tddf.columns if x not in entity_id]
-    >>> dataset = upload_features(
-    ...     df = tddf,
-    ...     entity_id = entity_id,
-    ...     feature_names = features,
-    ...     metadata = {'project': 'test'},
-    ...     primary_index = primary_index,
-    ...     partitioning = partitioning
-    ... )
+def upload_features(
+        df,
+        entity_id,
+        feature_names,
+        metadata={},
+        primary_index=None,
+        partitioning='',
+        filtermanager=None,
+        entity_null_substitute={},
+        force_compute=True,
+        force_varchar_length=1024,
+        dataset_view_name = None
+):
+    """
+    Uploads feature data from a DataFrame to the feature store for a specified entity.
+    All diagnostics go through `logger_safe()` which respects `tdfs4ds.DISPLAY_LOGS`.
     """
 
     from tdfs4ds.utils.info import get_column_types
@@ -491,45 +559,42 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
     from tdfs4ds.process_store.process_registration_management import register_process_view
 
     # Convert entity_id to a dictionary if it's not already one
-    if type(entity_id) == list:
+    if isinstance(entity_id, list):
         entity_id.sort()
         entity_id = get_column_types(df, entity_id)
-
-
-    elif type(entity_id) == str:
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    elif isinstance(entity_id, str):
         entity_id = [entity_id]
         entity_id = get_column_types(df, entity_id)
-
-
-
-    if type(feature_names) != list:
-
-
-
-        feature_names = feature_names.split(',')
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    # Normalize feature_names
+    if not isinstance(feature_names, list):
+        logger_safe("debug", "feature_names is not a list: %s", feature_names)
+        if isinstance(feature_names, str) and ',' in feature_names:
+            feature_names = [x.strip() for x in feature_names.split(',')]
         else:
             feature_names = [feature_names]
-
-
-
-
-    if primary_index is not None and type(primary_index) != list:
-
-
-
-        primary_index = primary_index.split(',')
+        logger_safe("debug", "feature_names converted to list: %s", feature_names)
+        logger_safe("debug", "Check the conversion is as expected.")
+
+    # Normalize primary_index
+    if primary_index is not None and not isinstance(primary_index, list):
+        logger_safe("debug", "primary_index is not a list: %s", primary_index)
+        if isinstance(primary_index, str) and ',' in primary_index:
+            primary_index = [x.strip() for x in primary_index.split(',')]
         else:
             primary_index = [primary_index]
-
-
-        print('check it is a expected.')
+        logger_safe("debug", "primary_index converted to list: %s", primary_index)
+        logger_safe("debug", "Check the conversion is as expected.")
 
+    # Partitioning
     partitioning = tdfs4ds.utils.info.generate_partitioning_clause(partitioning=partitioning)
 
-
-    print("filtermanager", filtermanager)
+    logger_safe("debug", "filtermanager: %s", filtermanager)
 
-    # Register
+    # Register process -> get SQL(s) + process_id
     query_insert, process_id, query_insert_dist, query_insert_filtermanager = register_process_view.__wrapped__(
         view_name = df,
         entity_id = entity_id,
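The normalization above lets callers pass scalars or comma-separated strings; the spellings below end up identical after conversion (a sketch assuming a connected context and a teradataml DataFrame `df` with these columns):

    dataset = tdfs4ds.upload_features(df, entity_id="customer_id", feature_names="age,income")
    dataset = tdfs4ds.upload_features(df, entity_id=["customer_id"], feature_names=["age", "income"])

Comma-separated strings are split and whitespace-stripped, and a bare string without a comma is wrapped into a one-element list; `primary_index` gets the same treatment.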
@@ -542,104 +607,174 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
         entity_null_substitute = entity_null_substitute
     )
 
-
-    execute_query(query_insert)
-    execute_query(query_insert_dist)
-    if tdfs4ds.DEBUG_MODE:
-        print("query_insert_filtermanager",query_insert_filtermanager)
-    if query_insert_filtermanager is not None:
-        execute_query(query_insert_filtermanager)
+    logger_safe("info", "Registered process (process_id=%s) for upload_features", process_id)
 
-    #
-
-
-
-
-
-
+    # Execute queries
+    try:
+        execute_query(query_insert)
+        logger_safe("info", "Executed main insert query for process_id=%s", process_id)
+    except Exception as e:
+        logger_safe("exception", "Main insert query failed for process_id=%s", process_id)
+        raise
 
-
+    try:
+        execute_query(query_insert_dist)
+        logger_safe("info", "Executed distribution insert query for process_id=%s", process_id)
+    except Exception as e:
+        logger_safe("exception", "Distribution insert query failed for process_id=%s", process_id)
+        raise
 
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        # Avoid dumping entire SQL in normal logs; keep it debug-only.
+        logger_safe("debug", "query_insert_filtermanager: %s", query_insert_filtermanager)
 
+    if query_insert_filtermanager is not None:
+        try:
+            execute_query(query_insert_filtermanager)
+            logger_safe("info", "Executed filtermanager insert query for process_id=%s", process_id)
         except Exception as e:
-
-            run_id = tdfs4ds.RUN_ID,
-            process_type = tdfs4ds.PROCESS_TYPE,
-            process_id = process_id,
-            status = 'FAILED,' + str(e).split('\n')[0]
-        )
+            logger_safe("exception", "Filtermanager insert query failed for process_id=%s", process_id)
             raise
 
+    # Run the registered process (with/without dataset)
+    PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
+    tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES'
+    if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
+        tdfs4ds.PROCESS_TYPE = 'UPLOAD_FEATURES WITH DATASET VALIDATION'
+    tdfs4ds.RUN_ID = str(uuid.uuid4())
 
-
-
+    logger_safe(
+        "info",
+        "Starting run (run_id=%s, process_type=%s, process_id=%s, force_compute=%s, force_varchar_length=%s)",
+        tdfs4ds.RUN_ID, tdfs4ds.PROCESS_TYPE, process_id, force_compute, force_varchar_length
+    )
+
+    try:
+        if getattr(tdfs4ds, "BUILD_DATASET_AT_UPLOAD", False):
+            dataset = run(
+                process_id = process_id,
+                return_dataset = True,
+                force_compute = force_compute,
+                force_varchar_length = force_varchar_length,
+                dataset_view_name = dataset_view_name
+            )
+            logger_safe("info", "Run completed with dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
+            return dataset
+        else:
+            run(
+                process_id = process_id,
+                return_dataset = False,
+                force_compute = force_compute,
+                force_varchar_length = force_varchar_length,
+                dataset_view_name = dataset_view_name
+            )
+            logger_safe("info", "Run completed without dataset (run_id=%s, process_id=%s)", tdfs4ds.RUN_ID, process_id)
+            return
 
+    except Exception as e:
+        # Keep your existing follow-up close behavior, but ensure the error is logged.
         try:
-            run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
-        except Exception as e:
             tdfs4ds.process_store.process_followup.followup_close(
-                run_id = tdfs4ds.RUN_ID,
-                process_type = tdfs4ds.PROCESS_TYPE,
-                process_id = process_id,
-                status = 'FAILED,' + str(e).split('\n')[0]
+                run_id = tdfs4ds.RUN_ID,
+                process_type = tdfs4ds.PROCESS_TYPE,
+                process_id = process_id,
+                status = 'FAILED,' + str(e).split('\n')[0]
             )
-
-
+        finally:
+            logger_safe("exception", "Run failed (run_id=%s, process_id=%s): %s",
+                        tdfs4ds.RUN_ID, process_id, str(e).split('\n')[0]
+            )
+            raise
+    finally:
+        # Restore previous process type just in case the caller relies on it.
+        tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
 
-    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
 
-def _upload_features(df, entity_id, feature_names,
-                     feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False,force_varchar_length = None):
-    """
-    Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
-    feature registration, preparation for ingestion, and storage in the designated feature tables.
 
-
-
-
-
-
-
-
-
-
-
-
-      enhance query performance based on the access patterns.
-    - filtermanager (object, optional): An object managing filter conditions for the feature data. Default is None.
-    - entity_null_substitute (dict, optional): A dictionary specifying substitute values for nulls in entity columns.
-      Default is an empty dictionary.
-    - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
-    - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
-      Default is False.
-    - force_varchar_length (int, optional): in order to avoid the multiplication of feature tables when dealing with the
-      VARCHAR type, it cast the VARCHAR features into VARCHAR(k x force_varchar_length)
-      where k is the smallest integer so that the original lengths is smaller or equal
-      to k x force_varchar_length. Default is None.
+def _upload_features(
+        df, entity_id, feature_names,
+        feature_versions = FEATURE_VERSION_DEFAULT,
+        primary_index = None, partitioning = '',
+        filtermanager = None, entity_null_substitute = {},
+        process_id = None, force_compute = False,
+        force_varchar_length = None,
+        dataset_view_name = None
+):
+    """
+    Uploads a set of features into the Feature Store for a given entity.
 
+    This function registers an entity and its associated features in the feature catalog
+    if they are not already defined, prepares the data for ingestion, and stores it in the
+    feature store. It also supports incremental feature computation and conditional execution
+    depending on prior runs.
 
-
-
-
-
-
-
-
-
-
-
-
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        Input dataframe containing entity keys and feature columns to upload.
+    entity_id : str, list, or dict
+        Identifier(s) for the entity. Can be:
+        - A string (single entity key)
+        - A list of key column names
+        - A dict mapping column names to data types
+        If not a dict, entity metadata is inferred automatically.
+    feature_names : list of str
+        List of feature column names to upload from `df`.
+    feature_versions : dict or int, optional
+        Feature version(s). If a single integer is provided, it is applied to all features.
+        If a dict is provided, it maps each feature name to its version.
+        Default is FEATURE_VERSION_DEFAULT.
+    primary_index : str or list, optional
+        Primary index to use when storing features in Teradata.
+    partitioning : str, optional
+        Partitioning clause for feature store tables. Default is ''.
+    filtermanager : FilterManager, optional
+        If provided, features are built iteratively per filter step.
+    entity_null_substitute : dict, optional
+        Replacement values for nulls in entity keys.
+        Example: {'customer_id': -1}
+    process_id : str, optional
+        Identifier for the process execution, used for follow-up logging.
+    force_compute : bool, optional
+        If True, forces recomputation even if the same process_id and timestamp were
+        already computed earlier. If False, the computation is skipped when existing
+        results are detected. Default is False.
+    force_varchar_length : int, optional
+        If provided, all VARCHAR feature columns are resized to this length
+        before ingestion.
+
+    Returns
+    -------
+    pandas.DataFrame or None
+        If BUILD_DATASET_AT_UPLOAD is enabled, returns a dataset built from the
+        ingested features for validation. Otherwise, returns None.
+
+    Notes
+    -----
+    - Uses global tdfs4ds context such as FEATURE_STORE_TIME, RUN_ID, and PROCESS_TYPE.
+    - Logs ingestion status in process follow-up tables.
+    - Skips ingestion when existing completed results are found unless
+      `force_compute=True`.
+    - Applies Teradata-optimized storage and statistics collection.
+
+    Raises
+    ------
+    ValueError
+        If unsupported data types are found (CLOB/BLOB/JSON).
+    Exception
+        For ingestion failure or storage errors.
 
-
-
-
-
-
-
+    Example
+    -------
+    >>> _upload_features(
+    ...     df=dataframe,
+    ...     entity_id="customer_id",
+    ...     feature_names=["age", "credit_score"],
+    ...     process_id="customer_features_v1",
+    ...     force_compute=False
+    ... )
     """
-
+
     from tdfs4ds.feature_store.entity_management import register_entity
     from tdfs4ds.feature_store.feature_store_management import Gettdtypes
     from tdfs4ds.feature_store.feature_store_management import register_features
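Per the new docstring, `feature_versions` accepts one value applied to every feature or an explicit per-feature mapping; a sketch of equivalent calls (the version label and dataframe are placeholders, and `_upload_features` is the internal path normally reached via `upload_features`/`run`):

    _upload_features(df, "customer_id", ["age", "income"], feature_versions="v1")
    _upload_features(df, "customer_id", ["age", "income"],
                     feature_versions={"age": "v1", "income": "v1"})

A list is also accepted and is zipped positionally against `feature_names` (the `dict(zip(...))` in the hunk below).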
@@ -647,194 +782,199 @@ def _upload_features(df, entity_id, feature_names,
     from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
     from tdfs4ds.utils.info import get_column_types, update_varchar_length
 
-    # Convert entity_id to a dictionary if
-    if type(entity_id) == list:
+    # Convert entity_id to a dictionary if not already
+    if isinstance(entity_id, list):
         entity_id.sort()
         entity_id = get_column_types(df, entity_id)
-
-
-
-        entity_id
-
-
-
-
-    #register_entity(entity_id, primary_index=primary_index, partitioning=partitioning)
-
-    # If feature_versions is a list, create a dictionary mapping each feature name to its corresponding version.
-    # If feature_versions is a string, create a dictionary mapping each feature name to this string.
-    if type(feature_versions) == list:
-        selected_features = {k: v for k, v in zip(feature_names, feature_versions)}
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+    elif isinstance(entity_id, str):
+        entity_id = get_column_types(df, [entity_id])
+        logger_safe("debug", "entity_id converted to dict: %s", entity_id)
+
+    # Map feature versions
+    if isinstance(feature_versions, list):
+        selected_features = dict(zip(feature_names, feature_versions))
     else:
         selected_features = {k: feature_versions for k in feature_names}
 
-    # Get
-    feature_names_types = Gettdtypes(
-        df,
-        features_columns=feature_names,
-        entity_id=entity_id
-    )
+    # Get Teradata types for features
+    feature_names_types = Gettdtypes(df, features_columns=feature_names, entity_id=entity_id)
 
     if force_varchar_length is not None:
-
-        feature_names_types = update_varchar_length(
+        logger_safe("debug", "Updating VARCHAR lengths with force_varchar_length=%s", force_varchar_length)
+        feature_names_types = update_varchar_length(
+            feature_names_types,
+            new_varchar_length=force_varchar_length
+        )
 
     def validate_feature_types(feature_names_types):
-
-
-
-
-
-        feature_names_types (dict): A dictionary where keys are feature names and values are their data types.
-
-        Raises:
-        ValueError: If any feature type contains 'clob', 'blob', or 'json'.
-        """
-        invalid_types = {key: value['type'] for key, value in feature_names_types.items()
-                         if any(term in value['type'].lower() for term in ['clob', 'blob', 'json'])}
-
-        if invalid_types:
+        invalid = {
+            k: v['type'] for k, v in feature_names_types.items()
+            if any(x in v['type'].lower() for x in ['clob', 'blob', 'json'])
+        }
+        if invalid:
             raise ValueError(
-                f"
-                "
+                f"Unsupported data types found: {invalid}. "
+                "CLOB/BLOB/JSON are not supported."
             )
-
-    validate_feature_types(feature_names_types)
-
+
+    validate_feature_types(feature_names_types)
+
+    logger_safe("info", "Registering entity %s in feature store", entity_id)
     register_entity(entity_id, feature_names_types, primary_index=primary_index, partitioning=partitioning)
 
-    if tdfs4ds.DEBUG_MODE:
-
-
-
-
-
-
-        print('_upload_features', 'df.columns', df.columns)
-
-    # Register the features in the feature catalog.
-    register_features(
-        entity_id,
-        feature_names_types,
-        primary_index,
-        partitioning
-    )
-
-    if tdfs4ds.DEBUG_MODE:
-        print("---------_upload_features")
-        print("filtermanager : ", filtermanager)
-        print("feature names : ", feature_names)
-        print("selected features : ", selected_features)
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe(
+            "debug",
+            "_upload_features entity_id=%s null_substitute=%s features=%s primary_index=%s partitioning=%s",
+            entity_id, entity_null_substitute, feature_names, primary_index, partitioning
+        )
+        logger_safe("debug", "selected_features=%s df.columns=%s", selected_features, df.columns)
 
-
+    register_features(entity_id, feature_names_types, primary_index, partitioning)
+    logger_safe("info", "Features registered in catalog: %s", feature_names)
+
+    follow_up = None
+    if process_id and tdfs4ds.FEATURE_STORE_TIME:
         follow_up = tdfs4ds.process_store.process_followup.follow_up_report()
-        follow_up = follow_up[
-
-
-
-
-
-        do_compute = False
+        follow_up = follow_up[
+            (follow_up.STATUS == 'COMPLETED') &
+            (follow_up.VALIDTIME_DATE.isna() == False) &
+            (follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME) &
+            (follow_up.PROCESS_ID == process_id)
+        ]
 
-
+    if filtermanager is None:
+        dataset_created = False
+        do_compute = not (process_id and follow_up is not None and follow_up.shape[0] > 0)
+        if not do_compute and not force_compute:
+            logger_safe(
+                "info",
+                "Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
+                process_id, tdfs4ds.FEATURE_STORE_TIME
+            )
        if do_compute or force_compute:
-
+            logger_safe("info", "Beginning feature ingestion for entity=%s", entity_id)
             tdfs4ds.process_store.process_followup.followup_open(
-                run_id = tdfs4ds.RUN_ID,
-                process_type = tdfs4ds.PROCESS_TYPE,
-                process_id = process_id
+                run_id=tdfs4ds.RUN_ID,
+                process_type=tdfs4ds.PROCESS_TYPE,
+                process_id=process_id
             )
-
             try:
-                prepared_features, volatile_table_name, features_infos = prepare_feature_ingestion(
-                    df,
-                    entity_id,
-                    feature_names,
+                prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
+                    df, entity_id, feature_names,
                     feature_versions=selected_features,
                     primary_index=primary_index,
                     entity_null_substitute=entity_null_substitute,
                     partitioning=partitioning
                 )
-                # Store the prepared features in the feature store.
-                store_feature(
-                    entity_id,
-                    volatile_table_name,
-                    entity_null_substitute=entity_null_substitute,
-                    primary_index=primary_index,
-                    partitioning=partitioning,
-                    features_infos = features_infos
-                )
 
-
-
-
-
-
-
-                )
+                count_rows = store_feature(entity_id, volatile_table, entity_null_substitute,
+                                           primary_index, partitioning, features_infos)
+
+
+
+                apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
                 tdfs4ds.process_store.process_followup.followup_close(
-                    run_id = tdfs4ds.RUN_ID,
-                    process_type = tdfs4ds.PROCESS_TYPE,
-                    process_id = process_id
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id
                 )
+                logger_safe("info", "Feature ingestion completed for entity=%s", entity_id)
+                # Build dataset for validation if enabled
+                if tdfs4ds.BUILD_DATASET_AT_UPLOAD or dataset_view_name is not None:
+                    logger_safe("info", "Building dataset for validation...")
+                    try:
+                        dataset = build_dataset(
+                            entity_id, selected_features,
+                            view_name = dataset_view_name
+                        )
+                        dataset_created = True
+                    except Exception as e:
+                        logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+                        logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
+                else:
+                    logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
 
             except Exception as e:
+                logger_safe("exception", "Feature ingestion failed for entity=%s", entity_id)
                 tdfs4ds.process_store.process_followup.followup_close(
-                    run_id = tdfs4ds.RUN_ID,
-                    process_type = tdfs4ds.PROCESS_TYPE,
-                    process_id = process_id,
-                    status = 'FAILED,' + str(e).split('\n')[0]
+                    run_id=tdfs4ds.RUN_ID,
+                    process_type=tdfs4ds.PROCESS_TYPE,
+                    process_id=process_id,
+                    status='FAILED,' + str(e).split('\n')[0]
                 )
                 raise
+
     else:
-        # get the total number of filter condition in the filter manager
-        nb_filters = filtermanager.nb_filters
 
-
+        logger_safe("info", "FilterManager detected: %s filters to process", filtermanager.nb_filters)
         something_computed = False
+        pbar = tqdm(
+            range(filtermanager.nb_filters),
+            total=filtermanager.nb_filters,
+            desc="Applying filters",
+            unit="filter",
+            leave=False
+        )
+        dataset_created = False
+        for i in pbar:
+            filter_id = i + 1
+            filtermanager.update(filter_id)
 
-
+            try:
+                pbar.set_description(f"Applying filter {filter_id}/{filtermanager.nb_filters}")
 
-
-
+                # Convert datetime columns to string
+                df_bar = filtermanager.display().to_pandas().astype(object)  # avoid conversion issues
+                for col in df_bar.select_dtypes(include=["datetime", "datetimetz"]).columns:
+                    df_bar[col] = df_bar[col].dt.strftime("%Y-%m-%d %H:%M:%S")
 
-
-
-            tdfs4ds.FEATURE_STORE_TIME = filtermanager.get_date_in_the_past()
+                # Convert to JSON object (dict)
+                bar_info = df_bar.iloc[0].to_dict()
 
-            #
-
-
-
+                # ---- ADD THIS: handle python date objects ----
+                from datetime import date, datetime
+                for key, value in bar_info.items():
+                    if isinstance(value, (date, datetime)):  # convert date/datetime to string
+                        bar_info[key] = value.strftime("%Y-%m-%d %H:%M:%S")
+                # ----------------------------------------------
 
-
-
+                bar_info = str(bar_info)
+                if len(bar_info) > 120:
+                    bar_info = bar_info[:117] + "..."
+                pbar.set_postfix_str(bar_info)
 
-
-
-
-            follow_up_ = follow_up.assign(APPLIED_FILTER=follow_up.APPLIED_FILTER.cast(tdml.VARCHAR(20000))).join(
-                tdml.DataFrame.from_query(
-                    f"""
-                    SELECT
-                    CAST(JSON_AGG({','.join(filtermanager.col_names)}) AS VARCHAR(20000)) AS APPLIED_FILTER
-                    FROM {filtermanager.schema_name}.{filtermanager.view_name}
-                    """
-                ),
-                on = 'APPLIED_FILTER',
-                how = 'inner',
-                lprefix = 'l',
-                rprefix = 'r'
-            )
-            # if already computed and completed, then do_compute is set to False
|
|
831
|
-
if follow_up_.shape[0] > 0:
|
|
832
|
-
do_compute = False
|
|
949
|
+
except Exception:
|
|
950
|
+
# postfix is optional; ignore errors from display() here
|
|
951
|
+
pass
|
|
833
952
|
|
|
834
|
-
|
|
835
|
-
|
|
953
|
+
logger_safe("debug", "Applying filter %s/%s:\n%s",
|
|
954
|
+
i + 1, filtermanager.nb_filters, filtermanager.display())
|
|
955
|
+
|
|
956
|
+
do_compute = True
|
|
957
|
+
if process_id and tdfs4ds.FEATURE_STORE_TIME:
|
|
958
|
+
# see if already computed
|
|
959
|
+
follow_up = tdfs4ds.process_store.process_followup.follow_up_report(process_id=process_id, filtermanager=filtermanager)
|
|
960
|
+
follow_up = follow_up[
|
|
961
|
+
(follow_up.STATUS == 'COMPLETED') &
|
|
962
|
+
(follow_up.VALIDTIME_DATE.isna() == False) &
|
|
963
|
+
(follow_up.VALIDTIME_DATE == tdfs4ds.FEATURE_STORE_TIME)
|
|
964
|
+
]
|
|
965
|
+
|
|
966
|
+
if follow_up.shape[0] > 0:
|
|
967
|
+
do_compute = False
|
|
836
968
|
|
|
969
|
+
if not do_compute and not force_compute:
|
|
970
|
+
logger_safe(
|
|
971
|
+
"info",
|
|
972
|
+
"Skipping computation for process_id=%s at time %s (already exists, force_compute=False)",
|
|
973
|
+
process_id, tdfs4ds.FEATURE_STORE_TIME
|
|
974
|
+
)
|
|
975
|
+
pbar.colour = "green"
|
|
837
976
|
if do_compute or force_compute:
|
|
977
|
+
pbar.colour = "blue"
|
|
838
978
|
tdfs4ds.process_store.process_followup.followup_open(
|
|
839
979
|
run_id = tdfs4ds.RUN_ID,
|
|
840
980
|
process_type = tdfs4ds.PROCESS_TYPE,
|
|
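To make the new skip logic above concrete: a process is recomputed only when no COMPLETED follow-up record exists for it at the current feature store time, and force_compute overrides the skip. A minimal sketch with plain pandas (column names match the diff; the real code filters a teradataml DataFrame, so this is illustrative only):

import pandas as pd

# Illustrative follow-up report; tdfs4ds obtains this via follow_up_report()
follow_up = pd.DataFrame({
    "PROCESS_ID":     ["p-123", "p-123", "p-456"],
    "STATUS":         ["COMPLETED", "FAILED", "COMPLETED"],
    "VALIDTIME_DATE": pd.to_datetime(["2024-01-01", "2024-01-01", "2024-01-01"]),
})

feature_store_time = pd.Timestamp("2024-01-01")
process_id = "p-123"
force_compute = False

# Same filter as the diff: a COMPLETED run at the current valid time for this process
already_done = follow_up[
    (follow_up.STATUS == "COMPLETED")
    & (follow_up.VALIDTIME_DATE.notna())
    & (follow_up.VALIDTIME_DATE == feature_store_time)
    & (follow_up.PROCESS_ID == process_id)
]

do_compute = not (process_id and already_done.shape[0] > 0)
print(do_compute or force_compute)  # False: ingestion is skipped for this run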
@@ -842,83 +982,78 @@ def _upload_features(df, entity_id, feature_names,
                    filtermanager = filtermanager
                )
                try:
-
-
-                        df,
-                        entity_id,
-                        feature_names,
+                    prepared_features, volatile_table, features_infos = prepare_feature_ingestion(
+                        df, entity_id, feature_names,
                        feature_versions = selected_features,
                        primary_index = primary_index,
                        entity_null_substitute = entity_null_substitute,
                        partitioning = partitioning
                    )
 
-
-
-
-                        volatile_table_name,
-                        entity_null_substitute=entity_null_substitute,
-                        primary_index = primary_index,
-                        partitioning = partitioning,
-                        features_infos=features_infos
-
-                    )
-
-                    # indicate that something has been processed:
+                    count_rows = store_feature(entity_id, volatile_table, entity_null_substitute,
+                                               primary_index, partitioning, features_infos)
+
                    something_computed = True
 
                    tdfs4ds.process_store.process_followup.followup_close(
-                        run_id=tdfs4ds.RUN_ID,
-                        process_type=tdfs4ds.PROCESS_TYPE,
-                        process_id=process_id,
+                        run_id = tdfs4ds.RUN_ID,
+                        process_type = tdfs4ds.PROCESS_TYPE,
+                        process_id = process_id,
                        filtermanager = filtermanager
                    )
 
+                    # Build dataset for validation if enabled
+                    if (tdfs4ds.BUILD_DATASET_AT_UPLOAD or dataset_view_name is not None) and dataset_created==False:
+                        logger_safe("info", "Building dataset for validation...")
+                        try:
+                            dataset = build_dataset(
+                                entity_id, selected_features,
+                                view_name = dataset_view_name
+                            )
+                            dataset_created = True
+                        except Exception as e:
+                            logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+                            logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
+                    else:
+                        logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
+
                except Exception as e:
-
+                    logger_safe("exception", "Error with filter iteration %s: %s", i + 1, str(e))
                    tdfs4ds.process_store.process_followup.followup_close(
-                        run_id=tdfs4ds.RUN_ID,
-                        process_type=tdfs4ds.PROCESS_TYPE,
-                        process_id=process_id,
-                        status='FAILED,' + str(e).split('\n')[0],
-                        filtermanager=filtermanager
+                        run_id = tdfs4ds.RUN_ID,
+                        process_type = tdfs4ds.PROCESS_TYPE,
+                        process_id = process_id,
+                        status = 'FAILED,' + str(e).split('\n')[0],
+                        filtermanager = filtermanager
                    )
                    raise
-        # Clean up by dropping the temporary volatile table.
-        # tdml.execute_sql(f'DROP TABLE {volatile_table_name}')
 
-        # Collect statistics only if something has been computed
        if something_computed:
-            apply_collect_stats(
-                entity_id,
-                primary_index = primary_index,
-                partitioning = partitioning,
-                feature_infos = features_infos
-            )
+            apply_collect_stats(entity_id, primary_index, partitioning, features_infos)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if dataset_created == False and tdfs4ds.BUILD_DATASET_AT_UPLOAD and dataset_view_name == None:
+            logger_safe("info", "Building dataset for validation...")
+            try:
+                dataset = build_dataset(
+                    entity_id, selected_features,
+                    view_name = dataset_view_name
+                )
+                return dataset
+            except Exception as e:
+                logger_safe("error", "Dataset build failed: %s", str(e).split('\n')[0])
+                logger_safe("error", "entity=%s features=%s", entity_id, selected_features)
+        else:
+            if tdfs4ds.BUILD_DATASET_AT_UPLOAD == False:
+                logger_safe("info", "Dataset build disabled (BUILD_DATASET_AT_UPLOAD=False) and no dataset view name provided.")
+            else:
+                return
+
 
-        # Return the dataset view.
-        return dataset
-    else:
-        if tdfs4ds.DISPLAY_LOGS: print('no dataset built for validation. Set tdfs4ds.BUILD_DATASET_AT_UPLOAD to True if you want it')
    return
 
 
+
+
 def build_dataset(entity_id, selected_features, view_name, schema_name=None, comment=None, return_query=False,
                  feature_store_time=False, join_type='INNER'):
    """
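The three dataset-build blocks in the hunk above share one gate: build when the global flag is set or an explicit view name was passed, and only once per upload. A condensed sketch of that decision (the function name is illustrative, not part of the package; the post-loop block in the diff additionally requires dataset_view_name to be None):

def should_build_dataset(build_at_upload, dataset_view_name, dataset_created):
    # Mirrors the in-loop gate of the diff above
    return (build_at_upload or dataset_view_name is not None) and not dataset_created

assert should_build_dataset(True, None, False)           # global flag enabled
assert should_build_dataset(False, "MY_VIEW", False)     # explicit view name passed
assert not should_build_dataset(True, "MY_VIEW", True)   # already built this upload
assert not should_build_dataset(False, None, False)      # disabled, no view name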
@@ -935,6 +1070,10 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
    selected_features : dict
        A dictionary where the keys are feature table names, and the values are lists of tuples
        (feature_id, feature_version, feature_name) specifying the features to retrieve.
+        NOTE: feature_version may be either:
+        - a single UUID string, or
+        - a list of dicts like:
+          {"process_id": <UUID>, "process_view_name": <str>}
 
    view_name : str
        The name of the view to be created in the database.
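The NOTE added to the docstring above covers both accepted shapes of feature_version. A hypothetical selected_features value showing them side by side (the table, feature, and process identifiers are made up for illustration):

selected_features = {
    "MY_SCHEMA.FS_T_FLOAT_FEATURES": [   # hypothetical feature table name
        # shape 1: feature_version as a single UUID string
        (101, "8e7f0c2a-0000-0000-0000-000000000001", "TX_AMOUNT_AVG"),
        # shape 2: feature_version as a list of dicts, one per contributing process
        (102, [
            {"process_id": "8e7f0c2a-0000-0000-0000-000000000002",
             "process_view_name": "PROC_DAILY"},
            {"process_id": "8e7f0c2a-0000-0000-0000-000000000003",
             "process_view_name": "PROC_WEEKLY"},
        ], "TX_COUNT"),
    ]
}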
@@ -1004,6 +1143,24 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
    # Sort the entity ID list for consistent query generation
    list_entity_id.sort()
 
+    # Helpers
+    import re
+    def _sanitize_identifier(name: str) -> str:
+        # Keep letters, numbers, and underscores; replace others with '_'
+        return re.sub(r'[^0-9A-Za-z_]', '_', name)
+
+    used_alias_counts = {}  # base_alias -> count
+
+    def _unique_alias(base: str) -> str:
+        """
+        Ensure alias uniqueness: if base already used, append _2, _3, ...
+        """
+        if base not in used_alias_counts:
+            used_alias_counts[base] = 1
+            return base
+        used_alias_counts[base] += 1
+        return f"{base}_{used_alias_counts[base]}"
+
    # Initialize sub-query construction
    tdfs4ds.logger.info("Generating the sub-queries for feature retrieval.")
    sub_queries = []
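The two helpers introduced above can be exercised standalone; the bodies below are copied from the diff, with sample calls showing the sanitization and the _2 suffixing on a repeated alias:

import re

def _sanitize_identifier(name: str) -> str:
    # Keep letters, numbers, and underscores; replace others with '_'
    return re.sub(r'[^0-9A-Za-z_]', '_', name)

used_alias_counts = {}  # base_alias -> count

def _unique_alias(base: str) -> str:
    if base not in used_alias_counts:
        used_alias_counts[base] = 1
        return base
    used_alias_counts[base] += 1
    return f"{base}_{used_alias_counts[base]}"

print(_sanitize_identifier("tx amount (avg)"))  # tx_amount__avg_
print(_unique_alias("TX_AMOUNT_PROC_DAILY"))    # TX_AMOUNT_PROC_DAILY
print(_unique_alias("TX_AMOUNT_PROC_DAILY"))    # TX_AMOUNT_PROC_DAILY_2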
@@ -1014,21 +1171,52 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
    # Construct sub-queries for each feature
    for k, v in list_features.items():
        for feature_id, feature_version, feature_name in v:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            # Multiple processes: list of dicts
+            if isinstance(feature_version, list):
+                for item in feature_version:
+                    process_id = item.get("process_id")
+                    process_view_name = item.get("process_view_name") or "PROCESS"
+                    base_alias = _sanitize_identifier(f"{feature_name}_{process_view_name}")
+                    alias = _unique_alias(base_alias)
+
+                    txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{process_id}')"
+                    feature_str = ',B1.FEATURE_VALUE AS ' + alias
+
+                    sub_queries.append(
+                        {
+                            'feature_name': alias,
+                            'query': f"""
+                            SEQUENCED VALIDTIME
+                            SELECT
+                            {txt_entity}
+                            {feature_str}
+                            FROM {k} B1
+                            WHERE {txt_where}
+                            """
+                        }
+                    )
+
+            # Single UUID
+            else:
+                base_alias = _sanitize_identifier(feature_name)
+                alias = _unique_alias(base_alias)
+
+                txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{feature_version}')"
+                feature_str = ',B1.FEATURE_VALUE AS ' + alias
+                sub_queries.append(
+                    {
+                        'feature_name': alias,
+                        'query': f"""
+                        SEQUENCED VALIDTIME
+                        SELECT
+                        {txt_entity}
+                        {feature_str}
+                        FROM {k} B1
+                        WHERE {txt_where}
+                        """
+                    }
+                )
 
    # Handle case where no features are available
    if len(sub_queries) == 0:
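Each (feature_id, version) pair in the loop above yields one SEQUENCED VALIDTIME sub-query. The snippet below re-runs the same string assembly with made-up values; txt_entity and the table name are placeholders, since their exact construction happens elsewhere in build_dataset:

feature_id = 101
process_id = "8e7f0c2a-0000-0000-0000-000000000002"  # used as FEATURE_VERSION
alias = "TX_AMOUNT_PROC_DAILY"
k = "MY_SCHEMA.FS_T_FLOAT_FEATURES"                  # placeholder feature table
txt_entity = "B1.CUSTOMER_ID"                        # placeholder entity column(s)

txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{process_id}')"
feature_str = ',B1.FEATURE_VALUE AS ' + alias

query = f"""
SEQUENCED VALIDTIME
SELECT
{txt_entity}
{feature_str}
FROM {k} B1
WHERE {txt_where}
"""
print(query)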
@@ -1102,6 +1290,7 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
    return tdml.DataFrame.from_table(tdml.in_schema(schema_name, view_name))
 
 
+
 def build_dataset_opt(entity_id, selected_features, view_name = None, schema_name=tdfs4ds.SCHEMA,
                      comment='dataset', no_temporal=False, time_manager=None, query_only=False, entity_null_substitute={},
                      other=None, time_column=None, filtermanager = None, filter_conditions = None
@@ -1280,82 +1469,91 @@ def upload_tdstone2_scores(model):
    return dataset
 
 
-def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None):
+def roll_out(process_list, time_manager, time_id_start = 1, time_id_end = None, force_compute = False, force_display_logs = False):
    """
-    Executes a series of processes for each date in a given list, managing
+    Executes a series of processes for each date in a given list, managing time, computation settings, and logging.
 
    This function iterates over a range of time steps, updating a TimeManager object with each step, and then
-    executes a list of processes for that time step. It also manages
-    and
+    executes a list of processes for that time step. It also manages synchronization of time for the feature store
+    and optionally controls forced computation and log display behavior.
 
    Parameters:
    - process_list (list): A list of process IDs that need to be executed for each time step.
-    - time_manager (TimeManager
+    - time_manager (TimeManager): An object that manages time-related operations, like updating or retrieving time.
    - time_id_start (int, optional): The starting time step ID. Default is 1.
-    - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
+    - time_id_end (int, optional): The ending time step ID. If None, it will run until the last time step in the
+      time manager.
+    - force_compute (bool, optional): If True, forces each process to recompute even if previous results exist.
+      Default is False.
+    - force_display_logs (bool, optional): If True, forces log display during the rollout even if global log display
+      is disabled. Default is False.
 
    Side Effects:
-    -
+    - Temporarily modifies global variables DISPLAY_LOGS, PROCESS_TYPE, RUN_ID, and FEATURE_STORE_TIME.
+    - Restores DISPLAY_LOGS setting after execution.
    - Catches and prints exceptions along with the time step on which they occurred.
 
-
-    1. Disables display logs
-    2.
-    3.
-    4.
-    5.
-    6.
+    Steps performed:
+    1. Disables display logs by default unless `force_display_logs` is True.
+    2. Sets process type to 'ROLL_OUT' and initializes a unique run ID.
+    3. Iterates over the specified range of time steps.
+    4. Updates the time manager with the current time step.
+    5. Synchronizes the feature store time with the current time step.
+    6. Executes each process in the process list with optional forced computation.
+    7. Restores original display log settings after completion.
 
    Example:
    >>> process_list = ['process_1', 'process_2']
    >>> time_manager = TimeManager(...)
-    >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10)
+    >>> roll_out(process_list, time_manager, time_id_start=1, time_id_end=10, force_compute=True, force_display_logs=True)
    """
 
-    #global DISPLAY_LOGS
-    #global FEATURE_STORE_TIME
-
    # Disable display logs
    temp_DISPLAY_LOGS = tdfs4ds.DISPLAY_LOGS
    tdfs4ds.DISPLAY_LOGS = False
+    if force_display_logs:
+        tdfs4ds.DISPLAY_LOGS = True
    PROCESS_TYPE = tdfs4ds.PROCESS_TYPE
    tdfs4ds.PROCESS_TYPE = 'ROLL_OUT'
    tdfs4ds.RUN_ID = str(uuid.uuid4())
 
-
-
    try:
+        # Define range of time steps
        if time_id_end is None:
-
+            time_range = range(time_id_start, time_manager.nb_time_steps + 1)
        else:
-
-
+            time_range = range(time_id_start, min(time_manager.nb_time_steps + 1, time_id_end + 1))
+
+        # Progress bar
+        pbar = tqdm(time_range, desc="Starting rollout", unit="step")
+
        for i in pbar:
-            # Update
-            time_manager.update(time_id
+            # Update time manager
+            time_manager.update(time_id=i)
            date_ = str(time_manager.display()['BUSINESS_DATE'].values[0])
-
-            #
+
+            # Sync feature store time
            tdfs4ds.FEATURE_STORE_TIME = time_manager.get_date_in_the_past()
-
+
+            # Display current progress in tqdm
+            pbar.set_postfix(time=date_, feature_time=tdfs4ds.FEATURE_STORE_TIME)
+
            if tdfs4ds.DEBUG_MODE:
-                print(
-                print(
-
-            # Execute
+                print("roll_out | date_:", date_)
+                print("roll_out | feature_store_time:", tdfs4ds.FEATURE_STORE_TIME)
+
+            # Execute all processes for this time step
            for proc_id in process_list:
-                pbar.set_description(f"Processing {date_}
-                run(process_id=proc_id, force_compute=
+                pbar.set_description(f"Processing {date_} | proc {proc_id}")
+                run(process_id=proc_id, force_compute=force_compute)
 
+        # Restore settings
        tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
+
    except Exception as e:
        tdfs4ds.DISPLAY_LOGS = temp_DISPLAY_LOGS
-        # If an exception occurs, print the date and the first line of the exception message
-        #print(date_)
        print(str(e).split('\n')[0])
        tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
        raise
 
-    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
-
-
+    tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
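One behavior of the rewritten roll_out worth spelling out: time_id_end is inclusive and is clamped to the number of available time steps. A small sketch of just the range computation (the function name is illustrative, not part of the package):

def rollout_range(time_id_start, time_id_end, nb_time_steps):
    # Mirrors the diff: run to the last step when time_id_end is None,
    # otherwise include time_id_end but never go past nb_time_steps.
    if time_id_end is None:
        return range(time_id_start, nb_time_steps + 1)
    return range(time_id_start, min(nb_time_steps + 1, time_id_end + 1))

print(list(rollout_range(1, None, 3)))  # [1, 2, 3]
print(list(rollout_range(1, 2, 3)))     # [1, 2]   (inclusive end)
print(list(rollout_range(2, 10, 3)))    # [2, 3]   (clamped to available steps)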