tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +769 -571
- tdfs4ds/feature_store/feature_data_processing.py +370 -300
- tdfs4ds/feature_store/feature_query_retrieval.py +105 -52
- tdfs4ds/feature_store/feature_store_management.py +226 -231
- tdfs4ds/genai/__init__.py +27 -0
- tdfs4ds/genai/documentation.py +1878 -0
- tdfs4ds/process_store/process_followup.py +113 -2
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/process_store/process_store_catalog_management.py +79 -26
- tdfs4ds/utils/filter_management.py +548 -138
- tdfs4ds/utils/query_management.py +18 -40
- tdfs4ds/utils/time_management.py +565 -98
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.5.1.dist-info}/METADATA +1 -1
- tdfs4ds-0.2.5.1.dist-info/RECORD +32 -0
- tdfs/__init__.py +0 -1
- tdfs/data/curves.csv +0 -5086
- tdfs/datasets.py +0 -27
- tdfs/feature_store.py +0 -723
- tdfs4ds/feature_engineering.py +0 -152
- tdfs4ds/feature_store.py +0 -1529
- tdfs4ds/process_store.py +0 -387
- tdfs4ds/utils.py +0 -579
- tdfs4ds-0.2.4.26.dist-info/RECORD +0 -38
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.5.1.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.5.1.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,8 @@ from tdfs4ds.utils.info import seconds_to_dhms
 import time
 import re
 import pandas as pd
+from tdfs4ds import logger_safe, logger
+
 
 def generate_on_clause(entity_id, entity_null_substitute, left_name, right_name):
     res = []
@@ -73,7 +75,7 @@ def generate_collect_stats(entity_id, primary_index='', partitioning='')
 
     # Initialize the extended query with sampling and threshold settings for statistics collection
     query_extension_header = 'COLLECT STATISTICS USING SAMPLE 25 PERCENT AND THRESHOLD 15 PERCENT'
-    query_extension
+    query_extension = []
 
     # Add primary index columns to the extended query
     if primary_index:
@@ -164,11 +166,10 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
     # Record the start time
     start_time = time.time()
 
-
-
-    if type(entity_id) == list:
+    # Normalize entity_id into a list of keys
+    if isinstance(entity_id, list):
         list_entity_id = entity_id
-    elif
+    elif isinstance(entity_id, dict):
         list_entity_id = list(entity_id.keys())
     else:
         list_entity_id = [entity_id]
@@ -176,260 +177,333 @@ def prepare_feature_ingestion(df, entity_id, feature_names, feature_versions=Non
 
     feature_id_names, conversion_name2id = get_feature_id_and_conversion(list_entity_id, feature_names)
 
-    features_infos = pd.DataFrame(feature_id_names, columns
+    features_infos = pd.DataFrame(feature_id_names, columns=['FEATURE_ID', 'FEATURE_NAME', 'FEATURE_TABLE', 'FEATURE_DATABASE'])
     features_infos['FEATURE_VERSION'] = [feature_versions[k] for k in features_infos.FEATURE_NAME.values]
-    if tdfs4ds.DEBUG_MODE:
-        print('--- prepare_feature_ingestion ---')
-        print('conversion_name2id : ', conversion_name2id)
-        print('feature_names : ', feature_names)
 
-
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "--- prepare_feature_ingestion ---")
+        logger_safe("debug", "conversion_name2id=%s", conversion_name2id)
+        logger_safe("debug", "feature_names=%s", feature_names)
 
-
-
-        print('unpivot_columns : ', unpivot_columns)
-    # Create the output column list including entity IDs, feature names, and feature values
+    # UNPIVOT mapping
+    unpivot_columns = ", \n".join([f"({x}) as '{conversion_name2id[x]}'" for x in feature_names])
 
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "unpivot_columns=%s", unpivot_columns)
+
+    # Output columns for volatile table
     output_columns = ', \n'.join(list_entity_id + ['CAST(FEATURE_ID AS BIGINT) AS FEATURE_ID', 'FEATURE_VALUE'])
 
+    # Primary index
     if primary_index is None:
         primary_index = ','.join(list_entity_id)
     else:
-        if
-            primary_index = primary_index
-        else:
+        if not isinstance(primary_index, list):
             primary_index = [primary_index]
         primary_index = ','.join(primary_index)
 
-    #
+    # Feature versions (defaults)
     versions = {f: tdfs4ds.FEATURE_VERSION_DEFAULT for f in feature_names}
     if feature_versions is not None:
         for k, v in feature_versions.items():
             versions[k] = v
 
-    if tdfs4ds
-
-        print('versions : ', versions)
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "versions=%s", versions)
 
-    #
-    version_query = ["CASE"] + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()] + [
-        "END AS FEATURE_VERSION"]
+    # CASE statement for versions
+    version_query = ["CASE"] + [f"WHEN FEATURE_ID = '{conversion_name2id[k]}' THEN '{v}' " for k, v in versions.items()] + ["END AS FEATURE_VERSION"]
     version_query = '\n'.join(version_query)
 
-    if tdfs4ds
-
-        print('version_query : ', version_query)
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "version_query=%s", version_query)
 
-    #
+    # Volatile table name
     volatile_table_name = df._table_name.split('.')[1].replace('"', '')
-    volatile_table_name = f
+    volatile_table_name = f"temp_{volatile_table_name}"
 
-
+    # Normalize entity_id again for var casting
+    if isinstance(entity_id, list):
         list_entity_id = entity_id
-    elif
+    elif isinstance(entity_id, dict):
         list_entity_id = list(entity_id.keys())
     else:
         list_entity_id = [entity_id]
 
-
-
-    res = {x.split()[0]:''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
+    # Character set handling / pass-through
+    res = {x.split()[0]: ''.join(x.split()[1::]) for x in str(df[feature_names].tdtypes).split('\n')}
     var_temp2 = []
-    for k,v in res.items():
+    for k, v in res.items():
         if 'UNICODE' in v:
-            #var_temp2.append(f'TRANSLATE({k} USING UNICODE_TO_LATIN) AS {k}')
             var_temp2.append(f'{k}')
         elif 'LATIN' in v:
-            #var_temp2.append(f'{k}')
             var_temp2.append(f'TRANSLATE({k} USING LATIN_TO_UNICODE) AS {k}')
         else:
             var_temp2.append(f'CAST({k} AS VARCHAR(2048) CHARACTER SET UNICODE) AS {k}')
     var_temp2 = ', \n'.join(var_temp2)
+
+    # NOTE: the original code overrides var_temp2 with just the raw column names.
+    # Preserve that behavior to avoid functional change.
     var_temp2 = ', \n'.join(list(res.keys()))
 
+    # Null substitution on entity keys
    var_temp3 = []
    for e in list_entity_id:
        if e in entity_null_substitute.keys():
-            if
+            if isinstance(entity_null_substitute[e], str):
                var_temp3.append(f"coalesce({e},'{entity_null_substitute[e]}') AS {e}")
            else:
                var_temp3.append(f"coalesce({e},{entity_null_substitute[e]}) AS {e}")
        else:
            var_temp3.append(e)
-
    var_temp3 = ', \n'.join(var_temp3)
 
-
-    nested_query = f"""
-    CREATE MULTISET VOLATILE TABLE {volatile_table_name} AS
-    (
-        SELECT
-            {output_columns},
-            {version_query}
-        FROM
-            (SELECT
-                {var_temp3},
-                {var_temp2}
-            FROM {df._table_name}
-            ) A
-        UNPIVOT INCLUDE NULLS ((FEATURE_VALUE ) FOR FEATURE_ID
-        IN ({unpivot_columns})) Tmp
-    ) WITH DATA
-    PRIMARY INDEX ({primary_index})
-    PARTITION BY RANGE_N(FEATURE_ID BETWEEN 0 AND 2000 EACH 1 )
-    ON COMMIT PRESERVE ROWS
-    """
-
+    # Final nested query used (the function reassigns to plain SELECT; preserve as-is)
    nested_query = f"""
-    CREATE MULTISET VOLATILE TABLE {volatile_table_name} AS
-    (
        SELECT
            {var_temp3},
            {var_temp2}
        FROM {df._table_name}
-    ) WITH DATA
-    PRIMARY INDEX ({primary_index})
-    ON COMMIT PRESERVE ROWS
    """
 
-
-        SELECT
-            {var_temp3},
-            {var_temp2}
-        FROM {df._table_name}
-
-    """
-
-    # Test unicity of the process
+    # Duplicate check query
    output_columns_unicity = ', \n'.join(list_entity_id)
    query_test_unicity = f"""
    SELECT sum(CASE WHEN n>1 THEN 1 ELSE 0 END) AS nb_duplicates
    FROM (
-
-
-
-
-
+        SELECT
+            {output_columns_unicity},
+            count(*) as n
+        FROM {_get_database_username()}.{volatile_table_name}
+        GROUP BY {output_columns_unicity}
    ) A
    """
 
-    if tdfs4ds
-
-
-
-        print('nested_query :', nested_query)
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "var_temp2=%s", var_temp2)
+        logger_safe("debug", "var_temp3=%s", var_temp3)
+        logger_safe("debug", "nested_query=%s", nested_query)
 
-
-    # Execute the SQL query to create the volatile table.
+    # Execute: create volatile table and test unicity
    try:
-
-
+        tdml.DataFrame.from_query(nested_query).to_sql(
+            table_name = volatile_table_name,
+            temporary = True,
+            primary_index = primary_index.split(','),
+            if_exists = 'replace'
+        )
        nb_duplicates = tdml.execute_sql(query_test_unicity).fetchall()[0][0]
        if nb_duplicates is not None and nb_duplicates > 0:
-
-
-
-
-
-
-
-
-
+            logger_safe("error", "The process generates %s duplicates", nb_duplicates)
+            # Show sample duplicates in debug for troubleshooting
+            if getattr(tdfs4ds, "DEBUG_MODE", False):
+                sample_dups_query = f"""
+                SELECT TOP 3
+                    {output_columns_unicity},
+                    count(*) as n
+                FROM {_get_database_username()}.{volatile_table_name}
+                GROUP BY {output_columns_unicity}
+                HAVING n > 1
+                """
+                logger_safe("debug", "Sample duplicates query:\n%s", sample_dups_query)
            raise ValueError("Invalid process: the process generates duplicates.")
-        #
+        # else: no duplicates
+        # logger_safe("info", "No duplicate found.")  # optional
    except Exception as e:
-
-        print(str(e).split('\n')[0])
+        logger_safe("error", "prepare_feature_ingestion failed: %s", str(e).split('\n')[0])
        raise
 
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe(
+            "debug",
+            "Result volatile table dtypes:\n%s",
+            tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)).tdtypes
+        )
 
-
-        print('--- prepare_feature_ingestion ---')
-        print(tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name)).tdtypes)
-
-
-    # Record the end time
+    # Timing
    end_time = time.time()
-
-
-    # Calculate the elapsed time in seconds
    elapsed_time = end_time - start_time
    formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-
-
+    logger_safe("info", "Feature preparation for ingestion: %s (%.3fs)", formatted_elapsed_time, elapsed_time)
+
+    # Return DataFrame and metadata
    try:
        df_out = tdml.DataFrame(tdml.in_schema(_get_database_username(), volatile_table_name))
        return df_out, volatile_table_name, features_infos
    except Exception as e:
-
-
+        logger_safe("error", "Failed to materialize volatile DataFrame: %s", str(e).split()[0])
+        logger_safe("error", "Feature dtypes snapshot: %s", df[feature_names].tdtypes)
        if 'TD_Unpivot contract function' in str(e).split()[0]:
-            raise(
+            raise RuntimeError(
+                "Error: you may have strings with UNICODE encoding as features; please convert them to LATIN first."
+            )
+        raise
 
-
+    # Fallback (should not reach)
+    # return None, None, None
 
 
-
-
-    Applies a collect statistics operation on target tables grouped by feature table and database.
-
-    This function performs the following steps:
-    1. Sorts the `entity_id`.
-    2. Groups the feature information by feature table and database to count occurrences.
-    3. Generates collect statistics queries.
-    4. Executes the queries on the target tables while recording the execution time.
-    5. Logs the elapsed time if logging is enabled.
-
-    Args:
-        entity_id (list): A list of entity IDs to process.
-        primary_index (str): The primary index to use in the collect statistics query.
-        partitioning (str): Partitioning information for the query.
-        feature_infos (pd.DataFrame): A DataFrame containing feature information,
-            including columns 'FEATURE_TABLE', 'FEATURE_DATABASE', and 'FEATURE_ID'.
+import time
+from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
 
-
-
+import pandas as pd
+
+def apply_collect_stats(
+    entity_id: Mapping[str, Any] | Iterable[str],
+    primary_index: Optional[str],
+    partitioning: Optional[str],
+    feature_infos: pd.DataFrame,
+) -> Dict[str, Any]:
    """
-
-
-
+    Run COLLECT STATS on all target feature tables, with fallbacks and timing.
+
+    Steps:
+    1) Determine a stable ordering of entity IDs (for deterministic query gen).
+    2) Group `feature_infos` by FEATURE_DATABASE + FEATURE_TABLE to get unique targets.
+    3) Generate COLLECT STATS statements via `generate_collect_stats(...)` for fallback use.
+    4) For each target table:
+       - Try a simple `COLLECT STATS ON <db>.<table>`.
+       - On failure, retry with generated statements (and optional extension).
+    5) Log a compact summary (counts + total duration) and return it as a dict.
+
+    Parameters
+    ----------
+    entity_id : Mapping[str, Any] | Iterable[str]
+        Entity identifiers used to parameterize collect-stat statements.
+        If a mapping (e.g., dict), its *keys* are used and sorted.
+        If an iterable (e.g., list/tuple), it’s sorted directly.
+    primary_index : Optional[str]
+        Primary index used by `generate_collect_stats` (may be None).
+    partitioning : Optional[str]
+        Partitioning clause used by `generate_collect_stats` (may be None).
+    feature_infos : pd.DataFrame
+        Must contain columns: 'FEATURE_TABLE', 'FEATURE_DATABASE', 'FEATURE_ID'.
+
+    Returns
+    -------
+    Dict[str, Any]
+        Summary with keys:
+        - total_tables: int
+        - ok: int
+        - retried: int
+        - failed: int
+        - duration_seconds: float
+        - duration_hms: str
+        - details: list[dict]  # per-table status entries
+    """
+    # --- Validate inputs -----------------------------------------------------
+    required_cols = {"FEATURE_TABLE", "FEATURE_DATABASE", "FEATURE_ID"}
+    missing = required_cols.difference(feature_infos.columns)
+    if missing:
+        raise ValueError(f"feature_infos is missing required columns: {sorted(missing)}")
+
+    # --- Normalize & sort entity IDs ----------------------------------------
+    if hasattr(entity_id, "keys"):
+        sorted_entity_ids = sorted(list(entity_id.keys()))
+    else:
+        sorted_entity_ids = sorted(list(entity_id))
+
+    # --- Group to unique targets --------------------------------------------
+    target_tables = (
+        feature_infos[["FEATURE_TABLE", "FEATURE_DATABASE", "FEATURE_ID"]]
+        .groupby(["FEATURE_TABLE", "FEATURE_DATABASE"])
+        .count()
+        .reset_index()
+    )
 
-
-
-
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe(
+            "debug",
+            "collect_stats.targets | count=%s | tables=%s",
+            len(target_tables),
+            target_tables[["FEATURE_DATABASE", "FEATURE_TABLE"]].to_dict(orient="records"),
+        )
 
-    #
+    # --- Prepare statements --------------------------------------------------
    query_collect_stats, query_collect_stats_extension = generate_collect_stats(
-
+        sorted_entity_ids,
        primary_index=primary_index,
-        partitioning=partitioning
+        partitioning=partitioning,
    )
 
-    #
-
-
-
-
-
-
+    # --- Execute -------------------------------------------------------------
+    started = time.perf_counter()
+    results: list[Dict[str, Any]] = []
+
+    ok = retried = failed = 0
+
+    for _, row in target_tables.iterrows():
+        db = row["FEATURE_DATABASE"]
+        tbl = row["FEATURE_TABLE"]
+        table_fqn = f"{db}.{tbl}"
+
+        if getattr(tdfs4ds, "DEBUG_MODE", False):
+            logger_safe("debug", "collect_stats.run | table=%s", table_fqn)
+
+        t0 = time.perf_counter()
+        status = "ok"
+        error_short = None
+        retried_flag = False
+
+        try:
+            tdml.execute_sql(f"COLLECT STATS ON {table_fqn}")
+            ok += 1
+        except Exception as e:
+            # First attempt failed; try generated statement(s)
+            error_short = str(e).split("\n")[0]
+            logger_safe("warning", "collect_stats.initial_fail | table=%s | err=%s", table_fqn, error_short)
+
+            try:
+                execute_query(query_collect_stats + f" ON {table_fqn}")
+                retried_flag = True
+                retried += 1
+
+                if query_collect_stats_extension is not None:
+                    execute_query(query_collect_stats_extension + f" ON {table_fqn}")
+            except Exception as e2:
+                status = "failed"
+                error_short = str(e2).split("\n")[0]
+                failed += 1
+                logger_safe("error", "collect_stats.retry_fail | table=%s | err=%s", table_fqn, error_short)
+
+        dt = time.perf_counter() - t0
+        results.append(
+            {
+                "table": table_fqn,
+                "status": status,
+                "retried": retried_flag,
+                "elapsed_s": dt,
+                "error": error_short,
+            }
+        )
+
+    # --- Final summary -------------------------------------------------------
+    elapsed = time.perf_counter() - started
+    formatted = seconds_to_dhms(elapsed)
+
+    # Structured, parseable one-liner
+    logger_safe(
+        "info",
+        "collect_stats.summary | tables=%d | ok=%d | retried=%d | failed=%d | duration=%s (%.3fs)",
+        len(target_tables),
+        ok,
+        retried,
+        failed,
+        formatted,
+        elapsed,
+    )
 
-
-
-
+    return {
+        "total_tables": int(len(target_tables)),
+        "ok": int(ok),
+        "retried": int(retried),
+        "failed": int(failed),
+        "duration_seconds": float(elapsed),
+        "duration_hms": formatted,
+        "details": results,
+    }
 
-    # Record the end time after query execution.
-    end_time = time.time()
 
-    # Calculate the elapsed time in seconds and format it into a human-readable format.
-    elapsed_time = end_time - start_time
-    formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-
-    # Log the execution time if logging is enabled.
-    if tdfs4ds.DISPLAY_LOGS:
-        print(f'Storage of the prepared features - collect stats only : {formatted_elapsed_time} ({elapsed_time}s)')
 
 
 def _store_feature_update_insert(entity_id, volatile_table_name, entity_null_substitute={},primary_index=None,
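
Note: the hunk above adds an apply_collect_stats helper that returns a summary dictionary. A minimal, hypothetical usage sketch follows; the entity definition and the features_infos DataFrame are assumptions (e.g. taken from a prior prepare_feature_ingestion call), not part of the diff:

    # Hypothetical sketch; feature_infos is assumed to come from prepare_feature_ingestion.
    summary = apply_collect_stats(
        entity_id={'customer_id': 'BIGINT'},   # dict keys are sorted and used
        primary_index=None,
        partitioning=None,
        feature_infos=features_infos,
    )
    print(summary['ok'], summary['retried'], summary['failed'], summary['duration_hms'])
    for entry in summary['details']:
        if entry['status'] == 'failed':
            print(entry['table'], entry['error'])
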
@@ -627,9 +701,8 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
     >>> store_feature(entity_id_dict, prepared_features)
     """
 
-    #
-
-    if tdfs4ds.FEATURE_STORE_TIME == None:
+    # VALIDTIME handling
+    if tdfs4ds.FEATURE_STORE_TIME is None:
         validtime_statement = 'CURRENT VALIDTIME'
         validtime_statement2 = validtime_statement
         validtime_start = 'CAST(CURRENT_TIME AS TIMESTAMP(0) WITH TIME ZONE)'
@@ -638,180 +711,157 @@ def _store_feature_merge(entity_id, volatile_table_name, entity_null_substitute=
         validtime_statement2 = f"VALIDTIME AS OF TIMESTAMP '{tdfs4ds.FEATURE_STORE_TIME}'"
         validtime_start = f"CAST('{tdfs4ds.FEATURE_STORE_TIME}' AS TIMESTAMP(0) WITH TIME ZONE)"
 
-    if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED'
-        end_period_ = '9999-01-01 00:00:00'
-    else:
-        end_period_ = tdfs4ds.END_PERIOD
+    end_period_ = '9999-01-01 00:00:00' if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED' else tdfs4ds.END_PERIOD
 
-    if tdfs4ds
-
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "FEATURE_STORE_TIME=%s | END_PERIOD=%s", tdfs4ds.FEATURE_STORE_TIME, tdfs4ds.END_PERIOD)
+        logger_safe("debug", "entity_id=%s", entity_id)
 
+    # Entity id helpers
+    sorted_entity_id = sorted(list(entity_id.keys()))
+    ENTITY_ID = ','.join(sorted_entity_id)
 
-
-
-
-
-
-
-
-
-
-
-
-    """
-
-    if tdfs4ds.DEBUG_MODE:
-        print('count_features :' , count_features)
-        print('features_infos :', features_infos)
+    # Count rows in volatile table
+    count_features = pd.DataFrame(
+        tdml.execute_sql(
+            f"""
+            SEL count(*) as NB_ROWS
+            FROM {_get_database_username()}.{volatile_table_name}
+            """
+        ).fetchall(),
+        columns=['NB_ROWS']
+    )
+    # log the number of rows obtained after transformations
+    logger_safe("info", f"{count_features.NB_ROWS.values[0]} rows of features")
 
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "count_features=%s", count_features)
+        logger_safe("debug", "features_infos initial=%s", features_infos)
 
     if count_features.shape[0] > 0:
         features_infos['NB_ROWS'] = count_features['NB_ROWS'].values[0]
     else:
         features_infos['NB_ROWS'] = 0
 
-    if tdfs4ds
-
-    # Group the target tables by feature table and feature database and count the number of occurrences
-    target_tables = features_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'NB_ROWS']].groupby(
-        ['FEATURE_TABLE', 'FEATURE_DATABASE']).sum().reset_index()
-
-    if tdfs4ds.DEBUG_MODE:
-        print('target_tables :' , target_tables)
-    if tdfs4ds.DISPLAY_LOGS:
-        display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "features_infos updated=%s", features_infos)
 
+    # Compute target tables
+    target_tables = features_infos[['FEATURE_TABLE', 'FEATURE_DATABASE', 'NB_ROWS']].groupby(
+        ['FEATURE_TABLE', 'FEATURE_DATABASE']
+    ).sum().reset_index()
 
+    if getattr(tdfs4ds, "DEBUG_MODE", False):
+        logger_safe("debug", "target_tables=%s", target_tables)
 
-
-
+    # Optional display (keep existing UX semantics)
+    if getattr(tdfs4ds, "DISPLAY_LOGS", False):
+        try:
+            display_table(target_tables[['FEATURE_DATABASE', 'FEATURE_TABLE', 'NB_ROWS']])
+        except Exception as e:
+            logger_safe("warning", "display_table failed: %s", str(e).split('\n')[0])
 
     ENTITY_ID_ON = ' AND '.join([f'NEW_FEATURES.{k} = EXISTING_FEATURES.{k}' for k in sorted_entity_id])
-
     ENTITY_ID_SELECT = ', \n'.join(['NEW_FEATURES.' + k for k in sorted_entity_id])
-    # Iterate over target tables and perform update and insert operations
-
-
-    #query_collect_stats, query_collect_stats_extension = generate_collect_stats(sorted_entity_id,primary_index=primary_index, partitioning=partitioning)
-
 
     queries = []
-    for
-
-
-
-
-
-
-
+    for _, row in features_infos.iterrows():
+        features_infos_ = features_infos[
+            (features_infos.FEATURE_TABLE == row['FEATURE_TABLE']) &
+            (features_infos.FEATURE_DATABASE == row['FEATURE_DATABASE'])
+        ]
+        feature_id_list = ','.join([str(x) for x in sorted(set(features_infos_.FEATURE_ID.values))])
+        feature_version_list = ','.join(["'" + x + "'" for x in sorted(set(features_infos_.FEATURE_VERSION.values))])
+
+        # Build nested query
         nested_query = f"""
         SEL
-
-
-
-
-        FROM {_get_database_username()}.{volatile_table_name}
+            {ENTITY_ID}
+            , {row['FEATURE_ID']} AS FEATURE_ID
+            , {row['FEATURE_NAME']} AS FEATURE_VALUE
+            , '{row['FEATURE_VERSION']}' AS FEATURE_VERSION
+        FROM {_get_database_username()}.{volatile_table_name}
         """
 
-        if tdfs4ds.FEATURE_STORE_TIME
+        if tdfs4ds.FEATURE_STORE_TIME is None:
             query_merge = f"""
             {validtime_statement}
-            MERGE INTO
-
+            MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
             USING ( {nested_query} ) NEW_FEATURES
-            ON {ENTITY_ID_ON}
+            ON {ENTITY_ID_ON}
             AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
             AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
             AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
             AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
             AND EXISTING_FEATURES.FEATURE_VERSION IN ({feature_version_list})
             WHEN MATCHED THEN
-            UPDATE
-            SET
-            FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
+            UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
             WHEN NOT MATCHED THEN
             INSERT
-            (
-
-
-
-
-
-            --'{end_period_}')
+            (
+                {ENTITY_ID_SELECT},
+                NEW_FEATURES.FEATURE_ID,
+                NEW_FEATURES.FEATURE_VALUE,
+                NEW_FEATURES.FEATURE_VERSION
+            )
             """
         else:
             query_merge = f"""
             {validtime_statement}
-            MERGE INTO
+            MERGE INTO {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']} EXISTING_FEATURES
             USING ( {nested_query} ) NEW_FEATURES
-            ON {ENTITY_ID_ON}
+            ON {ENTITY_ID_ON}
             AND NEW_FEATURES.FEATURE_ID = EXISTING_FEATURES.FEATURE_ID
-            AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
-            AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
-            AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
+            AND NEW_FEATURES.FEATURE_VERSION = EXISTING_FEATURES.FEATURE_VERSION
+            AND EXISTING_FEATURES.FEATURE_ID IN ({feature_id_list})
+            AND NEW_FEATURES.FEATURE_ID IN ({feature_id_list})
             AND EXISTING_FEATURES.FEATURE_VERSION IN ({feature_version_list})
             WHEN MATCHED THEN
-            UPDATE
-            SET
-            FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
+            UPDATE SET FEATURE_VALUE = NEW_FEATURES.FEATURE_VALUE
             WHEN NOT MATCHED THEN
             INSERT
-            (
-
-
-
-
-
+            (
+                {ENTITY_ID_SELECT},
+                NEW_FEATURES.FEATURE_ID,
+                NEW_FEATURES.FEATURE_VALUE,
+                NEW_FEATURES.FEATURE_VERSION,
+                {validtime_start},
+                '{end_period_}'
+            )
             """
 
-
-
-
-
-
+        if getattr(tdfs4ds, "DEBUG_MODE", False):
+            entity_id_str = ', '.join(sorted_entity_id)
+            logger_safe(
+                "debug",
+                "Merging feature values for entity keys (%s) into %s.%s",
+                entity_id_str, row['FEATURE_DATABASE'], row['FEATURE_TABLE']
+            )
+            logger_safe("debug", "Query (truncated): %s", "\n".join(query_merge.splitlines()[:12]) + "\n...")
 
         queries.append(query_merge)
 
-    query_merge = '; \n'.join(queries)
     try:
-        # Record the end time
         start_time = time.time()
 
         for q in queries:
-            if tdfs4ds
-
-            # Execute the SQL query to create the volatile table.
+            if getattr(tdfs4ds, "DEBUG_MODE", False):
+                logger_safe("debug", "Executing merge (head): %s", "\n".join(q.split('\n')[0:3]))
             execute_query(q)
-        #execute_query(query_merge)
-        # Record the end time
-        end_time = time.time()
 
-
-        elapsed_time = end_time - start_time
+        elapsed_time = time.time() - start_time
         formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-
-
+        logger_safe(
+            "info",
+            "Storage of prepared features (merge-only) completed in %s (%.3fs)",
+            formatted_elapsed_time, elapsed_time
+        )
     except Exception as e:
-
+        logger_safe("exception", "Feature storage (merge) failed: %s", str(e).split('\n')[0])
         raise
 
-
-
-    # for i, row in features_infos.iterrows():
-    #     execute_query(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
-    #     #print(query_collect_stats + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
-    #     if query_collect_stats_extension is not None:
-    #         execute_query(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
-    #         #print(query_collect_stats_extension + f" ON {row['FEATURE_DATABASE']}.{row['FEATURE_TABLE']}")
-    # # Record the end time
-    # end_time = time.time()
-    #
-    # # Calculate the elapsed time in seconds
-    # elapsed_time = end_time - start_time
-    # formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-    # if tdfs4ds.DISPLAY_LOGS:
-    #     print(f'Storage of the prepared features - collect stats only : {formatted_elapsed_time} ({elapsed_time}s)')
-    return
+    return count_features.NB_ROWS.values[0]
+
 
 def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},primary_index=None,
                   partitioning='', features_infos = None, **kwargs):
     """
@@ -832,27 +882,47 @@ def store_feature(entity_id, volatile_table_name, entity_null_substitute = {},pr
     >>> store_feature(entity_id_dict, prepared_features)
     """
 
-    # Record the start time
     start_time = time.time()
 
+    # Choose storage strategy
     if tdfs4ds.STORE_FEATURE == 'UPDATE_INSERT':
-
-
+        logger_safe("info", "Storing features using UPDATE/INSERT strategy.")
+        _store_feature_update_insert(
+            entity_id,
+            volatile_table_name,
+            entity_null_substitute=entity_null_substitute,
+            primary_index=primary_index,
+            partitioning=partitioning,
+            features_infos=features_infos,
+            **kwargs
+        )
     elif tdfs4ds.STORE_FEATURE == 'MERGE':
-
-
+        logger_safe("info", "Storing features using MERGE strategy.")
+        _store_feature_merge(
+            entity_id,
+            volatile_table_name,
+            entity_null_substitute=entity_null_substitute,
+            primary_index=primary_index,
+            partitioning=partitioning,
+            features_infos=features_infos,
+            **kwargs
+        )
     else:
-
-
-
-
-
-
-        #
-        elapsed_time =
+        logger_safe(
+            "warning",
+            "Unknown STORE_FEATURE strategy '%s'. No storage operation was performed.",
+            tdfs4ds.STORE_FEATURE
+        )
+
+    # Log duration
+    elapsed_time = time.time() - start_time
     formatted_elapsed_time = seconds_to_dhms(elapsed_time)
-
-
+    logger_safe(
+        "info",
+        "Storage of prepared features completed in %s (%.3fs)",
+        formatted_elapsed_time,
+        elapsed_time
+    )
 
 def prepare_feature_ingestion_tdstone2(df, entity_id):
     """