tdfs4ds-0.2.4.32-py3-none-any.whl → tdfs4ds-0.2.4.34-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +387 -542
- tdfs4ds/feature_store/feature_data_processing.py +367 -299
- tdfs4ds/feature_store/feature_store_management.py +189 -167
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/utils/filter_management.py +87 -53
- tdfs4ds/utils/time_management.py +67 -24
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.34.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.34.dist-info}/RECORD +11 -11
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.34.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.32.dist-info → tdfs4ds-0.2.4.34.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,8 @@ from tdfs4ds.utils.info import seconds_to_dhms
 import time
 import re
 import pandas as pd
+from tdfs4ds import logger_safe, logger
+
 
 def generate_on_clause(entity_id, entity_null_substitute, left_name, right_name):
     res = []
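The new import brings in logger_safe, which later hunks call as logger_safe(level, message, *args). Its definition lives in tdfs4ds/__init__.py and is not part of this diff; a minimal sketch consistent with that call pattern, assuming it simply guards a standard logging logger so logging problems never break feature processing, could look like this:

import logging

logger = logging.getLogger("tdfs4ds")

def logger_safe(level, message, *args):
    # Hypothetical sketch only: the real helper ships in tdfs4ds/__init__.py and may differ.
    # Dispatch to the named level ("debug", "info", "error", "exception", ...) and swallow errors.
    try:
        getattr(logger, level, logger.info)(message, *args)
    except Exception:
        pass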
@@ -73,7 +75,7 @@ def generate_collect_stats(entity_id, primary_index='', partitioning=''):
 
     # Initialize the extended query with sampling and threshold settings for statistics collection
     query_extension_header = 'COLLECT STATISTICS USING SAMPLE 25 PERCENT AND THRESHOLD 15 PERCENT'
-    query_extension
+    query_extension = []
 
     # Add primary index columns to the extended query
     if primary_index:
@@ -164,11 +166,10 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
     # Record the start time
     start_time = time.time()
 
-
-
-    if type(entity_id) == list:
+    # Normalize entity_id into a list of keys
+    if isinstance(entity_id, list):
         list_entity_id = entity_id
-    elif
+    elif isinstance(entity_id, dict):
         list_entity_id = list(entity_id.keys())
     else:
         list_entity_id = [entity_id]
@@ -176,260 +177,333 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
 
     feature_id_names, conversion_name2id = get_feature_id_and_conversion(list_entity_id, feature_names)
 
-    features_infos = pd.DataFrame(feature_id_names, columns
+    features_infos = pd.DataFrame(feature_id_names, columns=['FEATURE_ID', 'FEATURE_NAME', 'FEATURE_TABLE', 'FEATURE_DATABASE'])
     features_infos['FEATURE_VERSION'] = [feature_versions[k] for k in features_infos.FEATURE_NAME.values]
-    if tdfs4ds.DEBUG_MODE:
-        print('--- prepare_feature_ingestion ---')
-        print('conversion_name2id : ', conversion_name2id)
-        print('feature_names : ', feature_names)
 
-
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "--- prepare_feature_ingestion ---")
+        logger_safe("debug", "conversion_name2id=%s", conversion_name2id)
+        logger_safe("debug", "feature_names=%s", feature_names)
 
-
-
-        print('unpivot_columns : ', unpivot_columns)
-    # Create the output column list including entity IDs, feature names, and feature values
+    # UNPIVOT mapping
+    unpivot_columns = ", \n".join([f"({x}) as '{conversion_name2id[x]}'" for x in feature_names])
 
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "unpivot_columns=%s", unpivot_columns)
+
+    # Output columns for volatile table
     output_columns = ', \n'.join(list_entity_id + ['CAST(FEATURE_ID AS BIGINT) AS FEATURE_ID', 'FEATURE_VALUE'])
 
+    # Primary index
     if primary_index is None:
         primary_index = ','.join(list_entity_id)
     else:
-        if
-            primary_index = primary_index
-        else:
+        if not isinstance(primary_index, list):
             primary_index = [primary_index]
         primary_index = ','.join(primary_index)
 
-    #
+    # Feature versions (defaults)
     versions = {f: tdfs4ds.FEATURE_VERSION_DEFAULT for f in feature_names}
     if feature_versions is not None:
         for k, v in feature_versions.items():
             versions[k] = v
 
-    if tdfs4ds
-
-        print('versions : ', versions)
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "versions=%s", versions)
 
-    #
-    version_query = ["CASE"] + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()] + [
-        "END AS FEATURE_VERSION"]
+    # CASE statement for versions
+    version_query = ["CASE"] + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()] + ["END AS FEATURE_VERSION"]
     version_query = '\n'.join(version_query)
 
-    if tdfs4ds
-
-        print('version_query : ', version_query)
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "version_query=%s", version_query)
 
-    #
+    # Volatile table name
     volatile_table_name = df._table_name.split('.')[1].replace('"', '')
-    volatile_table_name = f
+    volatile_table_name = f"temp_{volatile_table_name}"
 
-
+    # Normalize entity_id again for var casting
+    if isinstance(entity_id, list):
         list_entity_id = entity_id
-    elif
+    elif isinstance(entity_id, dict):
         list_entity_id = list(entity_id.keys())
     else:
         list_entity_id = [entity_id]
 
-
-
-    res = {x.split()[0]:''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
+    # Character set handling / pass-through
+    res = {x.split()[0]: ''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
     var_temp2 = []
-    for k,v in res.items():
+    for k, v in res.items():
        if 'UNICODE' in v:
-            #var_temp2.append(f'TRANSLATE({k} USING UNICODE_TO_LATIN) AS {k}')
            var_temp2.append(f'{k}')
        elif 'LATIN' in v:
-            #var_temp2.append(f'{k}')
            var_temp2.append(f'TRANSLATE({k} USING LATIN_TO_UNICODE) AS {k}')
        else:
            var_temp2.append(f'CAST({k} AS VARCHAR(2048) CHARACTER SET UNICODE) AS {k}')
     var_temp2 = ', \n'.join(var_temp2)
+
+    # NOTE: the original code overrides var_temp2 with just the raw column names.
+    # Preserve that behavior to avoid functional change.
     var_temp2 = ', \n'.join(list(res.keys()))
 
+    # Null substitution on entity keys
     var_temp3 = []
     for e in list_entity_id:
         if e in entity_null_substitute.keys():
-            if
+            if isinstance(entity_null_substitute[e], str):
                 var_temp3.append(f"coalesce({e},'{entity_null_substitute[e]}') AS {e}")
             else:
                 var_temp3.append(f"coalesce({e},{entity_null_substitute[e]}) AS {e}")
         else:
             var_temp3.append(e)
-
     var_temp3 = ', \n'.join(var_temp3)
 
-
-    nested_query = f"""
-    CREATE MULTISET VOLATILE TABLE {volatile_table_name} AS
-    (
-    SELECT
-    {output_columns},
-    {version_query}
-    FROM
-    (SELECT
-    {var_temp3},
-    {var_temp2}
-    FROM {df._table_name}
-    ) A
-    UNPIVOT INCLUDE NULLS ((FEATURE_VALUE ) FOR FEATURE_ID
-    IN ({unpivot_columns})) Tmp
-    ) WITH DATA
-    PRIMARY INDEX ({primary_index})
-    PARTITION BY RANGE_N(FEATURE_ID BETWEEN 0 AND 2000 EACH 1 )
-    ON COMMIT PRESERVE ROWS
-    """
-
-    nested_query = f"""
-    CREATE MULTISET VOLATILE TABLE {volatile_table_name} AS
-    (
-    SELECT
-    {var_temp3},
-    {var_temp2}
-    FROM {df._table_name}
-    ) WITH DATA
-    PRIMARY INDEX ({primary_index})
-    ON COMMIT PRESERVE ROWS
-    """
-
+    # Final nested query used (the function reassigns to plain SELECT; preserve as-is)
     nested_query = f"""
     SELECT
     {var_temp3},
     {var_temp2}
     FROM {df._table_name}
-
     """
 
-    #
+    # Duplicate check query
     output_columns_unicity = ', \n'.join(list_entity_id)
     query_test_unicity = f"""
     SELECT sum(CASE WHEN n>1 THEN 1 ELSE 0 END) AS nb_duplicates
     FROM (
-
-
-
-
-
+        SELECT
+        {output_columns_unicity},
+        count(*) as n
+        FROM {_get_database_username()}.{volatile_table_name}
+        GROUP BY {output_columns_unicity}
     ) A
     """
 
-    if tdfs4ds
-
-
-
-        print('nested_query :', nested_query)
-
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "var_temp2=%s", var_temp2)
+        logger_safe("debug", "var_temp3=%s", var_temp3)
+        logger_safe("debug", "nested_query=%s", nested_query)
 
-    # Execute
+    # Execute: create volatile table and test unicity
     try:
-
-
+        tdml.DataFrame.from_query(nested_query).to_sql(
+            table_name=volatile_table_name,
+            temporary=True,
+            primary_index=primary_index.split(','),
+            if_exists='replace'
+        )
         nb_duplicates = tdml.execute_sql(query_test_unicity).fetchall()[0][0]
         if nb_duplicates is not None and nb_duplicates > 0:
-
-
-
-
-
-
-
-
-
+            logger_safe("error", "The process generates %s duplicates", nb_duplicates)
+            # Show sample duplicates in debug for troubleshooting
+            if getattr(tdfs4ds, "DEBUG_MODE", False):
+                sample_dups_query = f"""
+                SELECT TOP 3
+                {output_columns_unicity},
+                count(*) as n
+                FROM {_get_database_username()}.{volatile_table_name}
+                GROUP BY {output_columns_unicity}
+                HAVING n > 1
+                """
+                logger_safe("debug", "Sample duplicates query:\n%s", sample_dups_query)
             raise ValueError("Invalid process: the process generates duplicates.")
-        #
+        # else: no duplicates
+        # logger_safe("info", "No duplicate found.")  # optional
     except Exception as e:
-
-        print(str(e).split('\n')[0])
+        logger_safe("error", "prepare_feature_ingestion failed: %s", str(e).split('\n')[0])
         raise
 
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe(
+            "debug",
+            "Result volatile table dtypes:\n%s",
+            tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)).tdtypes
+        )
 
-
-        print('--- prepare_feature_ingestion ---')
-        print(tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)).tdtypes)
-
-
-    # Record the end time
+    # Timing
     end_time = time.time()
-
-
-    # Calculate the elapsed time in seconds
     elapsed_time = end_time - start_time
     formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-
-
+    logger_safe("info", "Feature preparation for ingestion: %s (%.3fs)", formatted_elapsed_time, elapsed_time)
+
+    # Return DataFrame and metadata
     try:
         df_out = tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name))
         return df_out, volatile_table_name, features_infos
     except Exception as e:
-
-
+        logger_safe("error", "Failed to materialize volatile DataFrame: %s", str(e).split()[0])
+        logger_safe("error", "Feature dtypes snapshot: %s", df[feature_names].tdtypes)
         if 'TD_Unpivot contract function' in str(e).split()[0]:
-            raise(
+            raise RuntimeError(
+                "Error: you may have strings with UNICODE encoding as features; please convert them to LATIN first."
+            )
+        raise
 
-
+    # Fallback (should not reach)
+    # return None, None, None
 
 
-
-
-    Applies a collect statistics operation on target tables grouped by feature table and database.
-
-    This function performs the following steps:
-    1. Sorts the `entity_id`.
-    2. Groups the feature information by feature table and database to count occurrences.
-    3. Generates collect statistics queries.
-    4. Executes the queries on the target tables while recording the execution time.
-    5. Logs the elapsed time if logging is enabled.
-
-    Args:
-        entity_id (list): A list of entity IDs to process.
-        primary_index (str): The primary index to use in the collect statistics query.
-        partitioning (str): Partitioning information for the query.
-        feature_infos (pd.DataFrame): A DataFrame containing feature information,
-            including columns 'FEATURE_TABLE', 'FEATURE_DATABASE', and 'FEATURE_ID'.
+import time
+from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
 
-
-
+import pandas as pd
+
+def apply_collect_stats(
+    entity_id: Mapping[str, Any] | Iterable[str],
+    primary_index: Optional[str],
+    partitioning: Optional[str],
+    feature_infos: pd.DataFrame,
+) -> Dict[str, Any]:
     """
-
-
-
+    Run COLLECT STATS on all target feature tables, with fallbacks and timing.
+
+    Steps:
+      1) Determine a stable ordering of entity IDs (for deterministic query gen).
+      2) Group `feature_infos` by FEATURE_DATABASE + FEATURE_TABLE to get unique targets.
+      3) Generate COLLECT STATS statements via `generate_collect_stats(...)` for fallback use.
+      4) For each target table:
+         - Try a simple `COLLECT STATS ON <db>.<table>`.
+         - On failure, retry with generated statements (and optional extension).
+      5) Log a compact summary (counts + total duration) and return it as a dict.
+
+    Parameters
+    ----------
+    entity_id : Mapping[str, Any] | Iterable[str]
+        Entity identifiers used to parameterize collect-stat statements.
+        If a mapping (e.g., dict), its *keys* are used and sorted.
+        If an iterable (e.g., list/tuple), it's sorted directly.
+    primary_index : Optional[str]
+        Primary index used by `generate_collect_stats` (may be None).
+    partitioning : Optional[str]
+        Partitioning clause used by `generate_collect_stats` (may be None).
+    feature_infos : pd.DataFrame
+        Must contain columns: 'FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID'.
+
+    Returns
+    -------
+    Dict[str, Any]
+        Summary with keys:
+          - total_tables: int
+          - ok: int
+          - retried: int
+          - failed: int
+          - duration_seconds: float
+          - duration_hms: str
+          - details: list[dict]  # per-table status entries
+    """
+    # --- Validate inputs -----------------------------------------------------
+    required_cols = {"FEATURE_TABLE", "FEATURE_DATABASE", "FEATURE_ID"}
+    missing = required_cols.difference(feature_infos.columns)
+    if missing:
+        raise ValueError(f"feature_infos is missing required columns: {sorted(missing)}")
+
+    # --- Normalize & sort entity IDs ----------------------------------------
+    if hasattr(entity_id, "keys"):
+        sorted_entity_ids = sorted(list(entity_id.keys()))
+    else:
+        sorted_entity_ids = sorted(list(entity_id))
+
+    # --- Group to unique targets --------------------------------------------
+    target_tables = (
+        feature_infos[["FEATURE_TABLE", "FEATURE_DATABASE", "FEATURE_ID"]]
+        .groupby(["FEATURE_TABLE", "FEATURE_DATABASE"])
+        .count()
+        .reset_index()
+    )
 
-
-
-
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe(
+            "debug",
+            "collect_stats.targets | count=%s | tables=%s",
+            len(target_tables),
+            target_tables[["FEATURE_DATABASE", "FEATURE_TABLE"]].to_dict(orient="records"),
+        )
 
-    #
+    # --- Prepare statements --------------------------------------------------
     query_collect_stats, query_collect_stats_extension = generate_collect_stats(
-
+        sorted_entity_ids,
         primary_index=primary_index,
-        partitioning=partitioning
+        partitioning=partitioning,
     )
 
-    #
-
-
-
-
-
-
+    # --- Execute -------------------------------------------------------------
+    started = time.perf_counter()
+    results: list[Dict[str, Any]] = []
+
+    ok = retried = failed = 0
+
+    for _, row in target_tables.iterrows():
+        db = row["FEATURE_DATABASE"]
+        tbl = row["FEATURE_TABLE"]
+        table_fqn = f"{db}.{tbl}"
+
+        if getattr(tdfs4ds, "DEBUG_MODE", False):
+            logger_safe("debug", "collect_stats.run | table=%s", table_fqn)
+
+        t0 = time.perf_counter()
+        status = "ok"
+        error_short = None
+        retried_flag = False
+
+        try:
+            execute_query(f"COLLECT STATS ON {table_fqn}")
+            ok += 1
+        except Exception as e:
+            # First attempt failed; try generated statement(s)
+            error_short = str(e).split("\n")[0]
+            logger_safe("warning", "collect_stats.initial_fail | table=%s | err=%s", table_fqn, error_short)
+
+            try:
+                execute_query(query_collect_stats + f" ON {table_fqn}")
+                retried_flag = True
+                retried += 1
+
+                if query_collect_stats_extension is not None:
+                    execute_query(query_collect_stats_extension + f" ON {table_fqn}")
+            except Exception as e2:
+                status = "failed"
+                error_short = str(e2).split("\n")[0]
+                failed += 1
+                logger_safe("error", "collect_stats.retry_fail | table=%s | err=%s", table_fqn, error_short)
+
+        dt = time.perf_counter() - t0
+        results.append(
+            {
+                "table": table_fqn,
+                "status": status,
+                "retried": retried_flag,
+                "elapsed_s": dt,
+                "error": error_short,
+            }
+        )
+
+    # --- Final summary -------------------------------------------------------
+    elapsed = time.perf_counter() - started
+    formatted = seconds_to_dhms(elapsed)
+
+    # Structured, parseable one-liner
+    logger_safe(
+        "info",
+        "collect_stats.summary | tables=%d | ok=%d | retried=%d | failed=%d | duration=%s (%.3fs)",
+        len(target_tables),
+        ok,
+        retried,
+        failed,
+        formatted,
+        elapsed,
+    )
 
-
-
-
+    return {
+        "total_tables": int(len(target_tables)),
+        "ok": int(ok),
+        "retried": int(retried),
+        "failed": int(failed),
+        "duration_seconds": float(elapsed),
+        "duration_hms": formatted,
+        "details": results,
+    }
 
-    # Record the end time after query execution.
-    end_time = time.time()
 
-    # Calculate the elapsed time in seconds and format it into a human-readable format.
-    elapsed_time = end_time - start_time
-    formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-
-    # Log the execution time if logging is enabled.
-    if tdfs4ds.DISPLAY_LOGS:
-        print(f'Storage of the prepared features - collect stats only : {formatted_elapsed_time} ({elapsed_time}s)')
 
 
 def _store_feature_update_insert(entity_id, volatile_table_name, entity_null_substitute={},primary_index=None,
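Because apply_collect_stats now returns a structured summary instead of only printing timings, callers can branch on the outcome. A minimal usage sketch (the feature_infos frame and entity_id dict below are illustrative placeholders; the column names, import path and summary keys come from the hunk above):

import pandas as pd
from tdfs4ds.feature_store.feature_data_processing import apply_collect_stats

feature_infos = pd.DataFrame({
    "FEATURE_DATABASE": ["FS_DB", "FS_DB"],           # illustrative values
    "FEATURE_TABLE": ["FS_T_FLOAT", "FS_T_VARCHAR"],
    "FEATURE_ID": [101, 102],
})
entity_id = {"CUSTOMER_ID": "BIGINT"}                  # illustrative entity definition

summary = apply_collect_stats(entity_id, primary_index=None,
                              partitioning=None, feature_infos=feature_infos)
if summary["failed"]:
    # Each entry in "details" carries table, status, retried flag and a short error text.
    failed = [d for d in summary["details"] if d["status"] == "failed"]
    print(failed)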
@@ -627,9 +701,8 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
     >>> store_feature(entity_id_dict, prepared_features)
     """
 
-    #
-
-    if tdfs4ds.FEATURE_STORE_TIME == None:
+    # VALIDTIME handling
+    if tdfs4ds.FEATURE_STORE_TIME is None:
         validtime_statement = 'CURRENT VALIDTIME'
         validtime_statement2 = validtime_statement
         validtime_start = 'CAST(CURRENT_TIME AS TIMESTAMP(0) WITH TIME ZONE)'
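For reference, the two VALIDTIME configurations produced by this branch, using the literal statements from the surrounding hunks (the AS OF timestamp below is an illustrative placeholder for tdfs4ds.FEATURE_STORE_TIME):

# FEATURE_STORE_TIME is None: write with the session's current valid time.
validtime_statement = 'CURRENT VALIDTIME'
validtime_start = 'CAST(CURRENT_TIME AS TIMESTAMP(0) WITH TIME ZONE)'

# FEATURE_STORE_TIME set, e.g. '2024-01-01 00:00:00' (illustrative value):
validtime_statement2 = "VALIDTIME AS OF TIMESTAMP '2024-01-01 00:00:00'"
validtime_start = "CAST('2024-01-01 00:00:00' AS TIMESTAMP(0) WITH TIME ZONE)"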
@@ -638,180 +711,155 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
         validtime_statement2 = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
         validtime_start = f"CAST('{tdfs4ds.FEATURE_STORE_TIME}' AS TIMESTAMP(0) WITH TIME ZONE)"
 
-    if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED'
-        end_period_ = '9999-01-01 00:00:00'
-    else:
-        end_period_ = tdfs4ds.END_PERIOD
+    end_period_ = '9999-01-01 00:00:00' if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED' else tdfs4ds.END_PERIOD
 
-    if tdfs4ds
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "FEATURE_STORE_TIME=%s | END_PERIOD=%s", tdfs4ds.FEATURE_STORE_TIME, tdfs4ds.END_PERIOD)
+        logger_safe("debug", "entity_id=%s", entity_id)
 
+    # Entity id helpers
+    sorted_entity_id = sorted(list(entity_id.keys()))
+    ENTITY_ID = ','.join(sorted_entity_id)
 
-
-
-
-
-
-
-
-
-
-
-        {volatile_table_name}
-    """).fetchall(), columns = ['NB_ROWS'])
-
-    if tdfs4ds.DEBUG_MODE:
-        print('count_features :' , count_features)
-        print('features_infos :', features_infos)
+    # Count rows in volatile table
+    count_features = pd.DataFrame(
+        tdml.execute_sql(
+            f"""
+            SEL count(*) as NB_ROWS
+            FROM {_get_database_username()}.{volatile_table_name}
+            """
+        ).fetchall(),
+        columns=['NB_ROWS']
+    )
 
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "count_features=%s", count_features)
+        logger_safe("debug", "features_infos initial=%s", features_infos)
 
     if count_features.shape[0] > 0:
         features_infos['NB_ROWS'] = count_features['NB_ROWS'].values[0]
     else:
         features_infos['NB_ROWS'] = 0
 
-    if tdfs4ds
-
-    # Group the target tables by feature table and feature database and count the number of occurrences
-    target_tables = features_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'NB_ROWS']].groupby(
-        ['FEATURE_TABLE', 'FEATURE_DATABASE']).sum().reset_index()
-
-    if tdfs4ds.DEBUG_MODE:
-        print('target_tables :' , target_tables)
-    if tdfs4ds.DISPLAY_LOGS:
-        display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "features_infos updated=%s", features_infos)
 
+    # Compute target tables
+    target_tables = features_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'NB_ROWS']].groupby(
+        ['FEATURE_TABLE', 'FEATURE_DATABASE']
+    ).sum().reset_index()
 
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "target_tables=%s", target_tables)
 
-
-
+    # Optional display (keep existing UX semantics)
+    if getattr(tdfs4ds, "DISPLAY_LOGS", False):
+        try:
+            display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
+        except Exception as e:
+            logger_safe("warning", "display_table failed: %s", str(e).split('\n')[0])
 
     ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k in sorted_entity_id])
-
     ENTITY_ID_SELECT = ', \n'.join(['NEW_FEATURES.' + k for k in sorted_entity_id])
-    # Iterate over target tables and perform update and insert operations
-
-
-    #query_collect_stats, query_collect_stats_extension = generate_collect_stats(sorted_entity_id,primary_index=primary_index, partitioning=partitioning)
-
 
     queries = []
-    for
-
-
-
-
-
-
-
+    for _, row in features_infos.iterrows():
+        features_infos_ = features_infos[
+            (features_infos.FEATURE_TABLE == row['FEATURE_TABLE']) &
+            (features_infos.FEATURE_DATABASE == row['FEATURE_DATABASE'])
+        ]
+        feature_id_list = ','.join([str(x) for x in sorted(set(features_infos_.FEATURE_ID.values))])
+        feature_version_list = ','.join(["'" + x + "'" for x in sorted(set(features_infos_.FEATURE_VERSION.values))])
+
+        # Build nested query
         nested_query = f"""
         SEL
-
-
-
-
-        FROM {_get_database_username()}.{volatile_table_name}
+        {ENTITY_ID}
+        , {row['FEATURE_ID']} AS FEATURE_ID
+        , {row['FEATURE_NAME']} AS FEATURE_VALUE
+        , '{row['FEATURE_VERSION']}' AS FEATURE_VERSION
+        FROM {_get_database_username()}.{volatile_table_name}
         """
 
-        if tdfs4ds.FEATURE_STORE_TIME
+        if tdfs4ds.FEATURE_STORE_TIME is None:
             query_merge = f"""
             {validtime_statement}
-            MERGE INTO
-
+            MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
             USING ( {nested_query} ) NEW_FEATURES
-            ON {ENTITY_ID_ON}
+            ON {ENTITY_ID_ON}
             AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
             AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
             AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
             AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
             AND EXISTING_FEATURES.FEATURE_VERSION IN ({feature_version_list})
             WHEN MATCHED THEN
-                UPDATE
-                SET
-                    FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
+                UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
             WHEN NOT MATCHED THEN
                 INSERT
-                (
-
-
-
-
-
-                --'{end_period_}')
+                (
+                    {ENTITY_ID_SELECT},
+                    NEW_FEATURES.FEATURE_ID,
+                    NEW_FEATURES.FEATURE_VALUE,
+                    NEW_FEATURES.FEATURE_VERSION
+                )
             """
         else:
             query_merge = f"""
             {validtime_statement}
-            MERGE INTO
+            MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
             USING ( {nested_query} ) NEW_FEATURES
-            ON {ENTITY_ID_ON}
+            ON {ENTITY_ID_ON}
             AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
-            AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
-            AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
-            AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
+            AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
+            AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
+            AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
             AND EXISTING_FEATURES.FEATURE_VERSION IN ({feature_version_list})
             WHEN MATCHED THEN
-                UPDATE
-                SET
-                    FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
+                UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
             WHEN NOT MATCHED THEN
                 INSERT
-                (
-
-
-
-
-
+                (
+                    {ENTITY_ID_SELECT},
+                    NEW_FEATURES.FEATURE_ID,
+                    NEW_FEATURES.FEATURE_VALUE,
+                    NEW_FEATURES.FEATURE_VERSION,
+                    {validtime_start},
+                    '{end_period_}'
+                )
             """
 
-
-
-
-
-
+        if getattr(tdfs4ds, "DEBUG_MODE", False):
+            entity_id_str = ', '.join(sorted_entity_id)
+            logger_safe(
+                "debug",
+                "Merging feature values for entity keys (%s) into %s.%s",
+                entity_id_str, row['FEATURE_DATABASE'], row['FEATURE_TABLE']
+            )
+            logger_safe("debug", "Query (truncated): %s", "\n".join(query_merge.splitlines()[:12]) + "\n...")
 
         queries.append(query_merge)
 
-    query_merge = '; \n'.join(queries)
     try:
-        # Record the end time
         start_time = time.time()
 
         for q in queries:
-            if tdfs4ds
-
-            # Execute the SQL query to create the volatile table.
+            if getattr(tdfs4ds, "DEBUG_MODE", False):
+                logger_safe("debug", "Executing merge (head): %s", "\n".join(q.split('\n')[0:3]))
             execute_query(q)
-        #execute_query(query_merge)
-        # Record the end time
-        end_time = time.time()
 
-
-        elapsed_time = end_time - start_time
+        elapsed_time = time.time() - start_time
         formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-
-
+        logger_safe(
+            "info",
+            "Storage of prepared features (merge-only) completed in %s (%.3fs)",
+            formatted_elapsed_time, elapsed_time
+        )
     except Exception as e:
-
+        logger_safe("exception", "Feature storage (merge) failed: %s", str(e).split('\n')[0])
         raise
 
-    # # Record the end time
-    # start_time = time.time()
-    # for i, row in features_infos.iterrows():
-    #     execute_query(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
-    #     #print(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
-    #     if query_collect_stats_extension is not None:
-    #         execute_query(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
-    #         #print(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
-    # # Record the end time
-    # end_time = time.time()
-    #
-    # # Calculate the elapsed time in seconds
-    # elapsed_time = end_time - start_time
-    # formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-    # if tdfs4ds.DISPLAY_LOGS:
-    #     print(f'Storage of the prepared features - collect stats only : {formatted_elapsed_time} ({elapsed_time}s)')
     return
+
 def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},primary_index=None,
                   partitioning='', features_infos = None, **kwargs):
     """
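The rewritten loop in _store_feature_merge builds one MERGE statement per row of features_infos, deriving the FEATURE_ID and FEATURE_VERSION filters from the rows that share the same target table. That bookkeeping is plain pandas and can be reproduced standalone; a sketch with an illustrative frame (column names taken from the hunk above):

import pandas as pd

# Illustrative frame; the real one comes from prepare_feature_ingestion.
features_infos = pd.DataFrame({
    "FEATURE_DATABASE": ["FS_DB", "FS_DB"],
    "FEATURE_TABLE": ["FS_T_FLOAT", "FS_T_FLOAT"],
    "FEATURE_ID": [101, 102],
    "FEATURE_NAME": ["tx_amount", "tx_count"],
    "FEATURE_VERSION": ["dev.0.1", "dev.0.1"],
})

for _, row in features_infos.iterrows():
    same_table = features_infos[
        (features_infos.FEATURE_TABLE == row["FEATURE_TABLE"]) &
        (features_infos.FEATURE_DATABASE == row["FEATURE_DATABASE"])
    ]
    feature_id_list = ",".join(str(x) for x in sorted(set(same_table.FEATURE_ID.values)))
    feature_version_list = ",".join("'" + x + "'" for x in sorted(set(same_table.FEATURE_VERSION.values)))
    # For this frame: feature_id_list == "101,102" and feature_version_list == "'dev.0.1'"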
@@ -832,27 +880,47 @@ def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},pr
     >>> store_feature(entity_id_dict, prepared_features)
     """
 
-    # Record the start time
     start_time = time.time()
 
+    # Choose storage strategy
     if tdfs4ds.STORE_FEATURE == 'UPDATE_INSERT':
-
-
+        logger_safe("info", "Storing features using UPDATE/INSERT strategy.")
+        _store_feature_update_insert(
+            entity_id,
+            volatile_table_name,
+            entity_null_substitute=entity_null_substitute,
+            primary_index=primary_index,
+            partitioning=partitioning,
+            features_infos=features_infos,
+            **kwargs
+        )
     elif tdfs4ds.STORE_FEATURE == 'MERGE':
-
-
+        logger_safe("info", "Storing features using MERGE strategy.")
+        _store_feature_merge(
+            entity_id,
+            volatile_table_name,
+            entity_null_substitute=entity_null_substitute,
+            primary_index=primary_index,
+            partitioning=partitioning,
+            features_infos=features_infos,
+            **kwargs
+        )
     else:
-
-
-
-
-
-
-        #
-    elapsed_time =
+        logger_safe(
+            "warning",
+            "Unknown STORE_FEATURE strategy '%s'. No storage operation was performed.",
+            tdfs4ds.STORE_FEATURE
+        )
+
+    # Log duration
+    elapsed_time = time.time() - start_time
     formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-
-
+    logger_safe(
+        "info",
+        "Storage of prepared features completed in %s (%.3fs)",
+        formatted_elapsed_time,
+        elapsed_time
+    )
 
 def prepare_feature_ingestion_tdstone2(df, entity_id):
     """
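store_feature now dispatches on the module-level tdfs4ds.STORE_FEATURE setting and logs which strategy ran. A minimal call-site sketch, reusing the docstring's entity_id_dict / prepared_features names (both values below are illustrative; the import path matches the file shown in this diff):

import tdfs4ds
from tdfs4ds.feature_store.feature_data_processing import store_feature

tdfs4ds.STORE_FEATURE = 'MERGE'   # or 'UPDATE_INSERT'; any other value only logs a warning

entity_id_dict = {"CUSTOMER_ID": "BIGINT"}     # illustrative entity definition
prepared_features = "temp_my_features"         # illustrative volatile table from prepare_feature_ingestion
store_feature(entity_id_dict, prepared_features)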