tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.4.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +70 -16
- tdfs4ds/feature_store/feature_query_retrieval.py +60 -49
- tdfs4ds/feature_store/feature_store_management.py +1 -1
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.28.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.28.dist-info}/RECORD +7 -7
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.28.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.4.28.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
__version__ = '0.2.4.26'
|
|
1
|
+
__version__ = '0.2.4.28'
|
|
2
2
|
import logging
|
|
3
3
|
# Setup the logger
|
|
4
4
|
logging.basicConfig(
|
|
@@ -935,6 +935,10 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
|
|
|
935
935
|
selected_features : dict
|
|
936
936
|
A dictionary where the keys are feature table names, and the values are lists of tuples
|
|
937
937
|
(feature_id, feature_version, feature_name) specifying the features to retrieve.
|
|
938
|
+
NOTE: feature_version may be either:
|
|
939
|
+
- a single UUID string, or
|
|
940
|
+
- a list of dicts like:
|
|
941
|
+
{"process_id": <UUID>, "process_view_name": <str>}
|
|
938
942
|
|
|
939
943
|
view_name : str
|
|
940
944
|
The name of the view to be created in the database.
|
|
@@ -1004,6 +1008,24 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
|
|
|
1004
1008
|
# Sort the entity ID list for consistent query generation
|
|
1005
1009
|
list_entity_id.sort()
|
|
1006
1010
|
|
|
1011
|
+
# Helpers
|
|
1012
|
+
import re
|
|
1013
|
+
def _sanitize_identifier(name: str) -> str:
|
|
1014
|
+
# Keep letters, numbers, and underscores; replace others with '_'
|
|
1015
|
+
return re.sub(r'[^0-9A-Za-z_]', '_', name)
|
|
1016
|
+
|
|
1017
|
+
used_alias_counts = {} # base_alias -> count
|
|
1018
|
+
|
|
1019
|
+
def _unique_alias(base: str) -> str:
|
|
1020
|
+
"""
|
|
1021
|
+
Ensure alias uniqueness: if base already used, append _2, _3, ...
|
|
1022
|
+
"""
|
|
1023
|
+
if base not in used_alias_counts:
|
|
1024
|
+
used_alias_counts[base] = 1
|
|
1025
|
+
return base
|
|
1026
|
+
used_alias_counts[base] += 1
|
|
1027
|
+
return f"{base}_{used_alias_counts[base]}"
|
|
1028
|
+
|
|
1007
1029
|
# Initialize sub-query construction
|
|
1008
1030
|
tdfs4ds.logger.info("Generating the sub-queries for feature retrieval.")
|
|
1009
1031
|
sub_queries = []
|
|
@@ -1014,21 +1036,52 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
|
|
|
1014
1036
|
# Construct sub-queries for each feature
|
|
1015
1037
|
for k, v in list_features.items():
|
|
1016
1038
|
for feature_id, feature_version, feature_name in v:
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1039
|
+
|
|
1040
|
+
# Multiple processes: list of dicts
|
|
1041
|
+
if isinstance(feature_version, list):
|
|
1042
|
+
for item in feature_version:
|
|
1043
|
+
process_id = item.get("process_id")
|
|
1044
|
+
process_view_name = item.get("process_view_name") or "PROCESS"
|
|
1045
|
+
base_alias = _sanitize_identifier(f"{feature_name}_{process_view_name}")
|
|
1046
|
+
alias = _unique_alias(base_alias)
|
|
1047
|
+
|
|
1048
|
+
txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{process_id}')"
|
|
1049
|
+
feature_str = ',B1.FEATURE_VALUE AS ' + alias
|
|
1050
|
+
|
|
1051
|
+
sub_queries.append(
|
|
1052
|
+
{
|
|
1053
|
+
'feature_name': alias,
|
|
1054
|
+
'query': f"""
|
|
1055
|
+
SEQUENCED VALIDTIME
|
|
1056
|
+
SELECT
|
|
1057
|
+
{txt_entity}
|
|
1058
|
+
{feature_str}
|
|
1059
|
+
FROM {k} B1
|
|
1060
|
+
WHERE {txt_where}
|
|
1061
|
+
"""
|
|
1062
|
+
}
|
|
1063
|
+
)
|
|
1064
|
+
|
|
1065
|
+
# Single UUID
|
|
1066
|
+
else:
|
|
1067
|
+
base_alias = _sanitize_identifier(feature_name)
|
|
1068
|
+
alias = _unique_alias(base_alias)
|
|
1069
|
+
|
|
1070
|
+
txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{feature_version}')"
|
|
1071
|
+
feature_str = ',B1.FEATURE_VALUE AS ' + alias
|
|
1072
|
+
sub_queries.append(
|
|
1073
|
+
{
|
|
1074
|
+
'feature_name': alias,
|
|
1075
|
+
'query': f"""
|
|
1076
|
+
SEQUENCED VALIDTIME
|
|
1077
|
+
SELECT
|
|
1078
|
+
{txt_entity}
|
|
1079
|
+
{feature_str}
|
|
1080
|
+
FROM {k} B1
|
|
1081
|
+
WHERE {txt_where}
|
|
1082
|
+
"""
|
|
1083
|
+
}
|
|
1084
|
+
)
|
|
1032
1085
|
|
|
1033
1086
|
# Handle case where no features are available
|
|
1034
1087
|
if len(sub_queries) == 0:
|
|
@@ -1102,6 +1155,7 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
|
|
|
1102
1155
|
return tdml.DataFrame.from_table(tdml.in_schema(schema_name, view_name))
|
|
1103
1156
|
|
|
1104
1157
|
|
|
1158
|
+
|
|
1105
1159
|
def build_dataset_opt(entity_id, selected_features, view_name = None, schema_name=tdfs4ds.SCHEMA,
|
|
1106
1160
|
comment='dataset', no_temporal=False, time_manager=None, query_only=False, entity_null_substitute={},
|
|
1107
1161
|
other=None, time_column=None, filtermanager = None, filter_conditions = None
|
|
@@ -249,48 +249,49 @@ def get_list_features(entity_name, domain=None):
|
|
|
249
249
|
return tdml.DataFrame.from_query(query)
|
|
250
250
|
|
|
251
251
|
|
|
252
|
-
def get_feature_versions(entity_name, features, domain=None, latest_version_only=True, version_lag=0):
|
|
252
|
+
def get_feature_versions(entity_name, features, domain=None):
|
|
253
253
|
"""
|
|
254
|
-
Retrieve
|
|
255
|
-
from a given data domain. This function allows fetching either all versions or
|
|
256
|
-
just the latest versions of the features.
|
|
254
|
+
Retrieve version UUID(s) for the given features of an entity within a domain.
|
|
257
255
|
|
|
258
256
|
Parameters:
|
|
259
|
-
entity_name (str
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
domain (str, optional): The data domain to filter the feature versions.
|
|
263
|
-
Defaults to None, where a predefined domain is used.
|
|
264
|
-
latest_version_only (bool, optional): Flag to fetch only the latest version
|
|
265
|
-
of each feature. Defaults to True.
|
|
266
|
-
version_lag (int, optional): The number of versions to lag behind the latest.
|
|
267
|
-
Only effective if latest_version_only is True. Defaults to 0.
|
|
257
|
+
- entity_name (str): The entity name to which the features belong.
|
|
258
|
+
- features (str | list[str]): Feature name or list of feature names.
|
|
259
|
+
- domain (str, optional): Data domain to filter on. If None, defaults to tdfs4ds.DATA_DOMAIN.
|
|
268
260
|
|
|
269
261
|
Returns:
|
|
270
|
-
dict
|
|
262
|
+
- dict[str, str | list[dict]]: Maps each requested feature name to either:
|
|
263
|
+
- a single version UUID string if exactly one row exists, or
|
|
264
|
+
- a list of dicts if multiple rows exist; each dict has:
|
|
265
|
+
{
|
|
266
|
+
"process_id": <FEATURE_VERSION UUID>,
|
|
267
|
+
"process_view_name": <PROCESS_VIEW_NAME string>
|
|
268
|
+
}
|
|
269
|
+
If a requested feature has no entries, it will be present with value None.
|
|
270
|
+
|
|
271
|
+
Notes:
|
|
272
|
+
- Uses {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME} as A and
|
|
273
|
+
{tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} as B (must exist) joined on PROCESS_ID.
|
|
274
|
+
- Respects tdfs4ds.DEBUG_MODE to print the generated SQL.
|
|
271
275
|
"""
|
|
272
276
|
|
|
273
|
-
#
|
|
277
|
+
# Normalize inputs
|
|
278
|
+
if isinstance(features, str):
|
|
279
|
+
features = [features]
|
|
280
|
+
|
|
274
281
|
if domain is None:
|
|
275
282
|
domain = tdfs4ds.DATA_DOMAIN
|
|
276
283
|
|
|
284
|
+
# Basic escaping for single quotes in values used in SQL literals
|
|
285
|
+
def _esc(s: str) -> str:
|
|
286
|
+
return s.replace("'", "''")
|
|
277
287
|
|
|
278
|
-
|
|
279
|
-
# Convert the entity_name to a string if it is a list
|
|
280
|
-
if type(entity_name) == list:
|
|
281
|
-
entity_name.sort()
|
|
282
|
-
entity_name = ','.join(entity_name)
|
|
283
|
-
|
|
284
|
-
# Preparing the feature names for inclusion in the SQL query
|
|
285
|
-
if type(features) == list:
|
|
286
|
-
features = ["'" + f + "'" for f in features]
|
|
287
|
-
else:
|
|
288
|
-
features = "'" + features + "'"
|
|
288
|
+
features_lits = ",".join(f"'{_esc(f)}'" for f in features)
|
|
289
289
|
|
|
290
290
|
query = f"""
|
|
291
291
|
SELECT
|
|
292
292
|
A.FEATURE_NAME
|
|
293
293
|
, B.PROCESS_ID AS FEATURE_VERSION
|
|
294
|
+
, B.VIEW_NAME AS PROCESS_VIEW_NAME
|
|
294
295
|
FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} A
|
|
295
296
|
INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW_FEATURE_SPLIT} B
|
|
296
297
|
ON A.DATA_DOMAIN = B.DATA_DOMAIN
|
|
@@ -298,35 +299,45 @@ def get_feature_versions(entity_name, features, domain=None, latest_version_only
|
|
|
298
299
|
AND A.FEATURE_NAME = B.FEATURE_NAME
|
|
299
300
|
WHERE A.DATA_DOMAIN = '{domain}'
|
|
300
301
|
AND A.ENTITY_NAME = '{entity_name}'
|
|
301
|
-
AND A.FEATURE_NAME IN ({
|
|
302
|
+
AND A.FEATURE_NAME IN ({features_lits})
|
|
302
303
|
"""
|
|
303
304
|
|
|
304
|
-
|
|
305
|
-
df = tdml.DataFrame.from_query(query).to_pandas()
|
|
306
|
-
|
|
307
|
-
# if df is empty
|
|
308
|
-
if df.shape[0] == 0:
|
|
309
|
-
print('the features you are requesting for this entity and data domain do not exist. Here is what you requested:')
|
|
310
|
-
print('feature store database :', tdfs4ds.SCHEMA)
|
|
311
|
-
print('feature catalog :', tdfs4ds.FEATURE_CATALOG_NAME_VIEW)
|
|
312
|
-
print('entity name :', entity_name)
|
|
313
|
-
print('data domain :', domain)
|
|
314
|
-
print('features :', ','.join(features))
|
|
315
|
-
print('')
|
|
305
|
+
if tdfs4ds.DEBUG_MODE:
|
|
316
306
|
print(query)
|
|
317
|
-
return
|
|
318
307
|
|
|
319
|
-
|
|
320
|
-
|
|
308
|
+
rows = tdml.execute_sql(query).fetchall()
|
|
309
|
+
|
|
310
|
+
# Initialize result for all requested features
|
|
311
|
+
result = {f: None for f in features}
|
|
312
|
+
|
|
313
|
+
# Collect (version, view) per feature, deduplicating while preserving order
|
|
314
|
+
tmp = {f: [] for f in features}
|
|
315
|
+
seen = {f: set() for f in features}
|
|
316
|
+
|
|
317
|
+
for feat, version, view_name in rows:
|
|
318
|
+
key = (version, view_name)
|
|
319
|
+
if key not in seen.setdefault(feat, set()):
|
|
320
|
+
seen[feat].add(key)
|
|
321
|
+
tmp.setdefault(feat, []).append(key)
|
|
322
|
+
|
|
323
|
+
# Shape:
|
|
324
|
+
# - if exactly one row: return UUID string
|
|
325
|
+
# - if multiple rows: list of {"process_id": <uuid>, "process_view_name": <str>}
|
|
326
|
+
for feat in result:
|
|
327
|
+
pairs = tmp.get(feat, [])
|
|
328
|
+
if len(pairs) == 0:
|
|
329
|
+
result[feat] = None
|
|
330
|
+
elif len(pairs) == 1:
|
|
331
|
+
result[feat] = pairs[0][0] # UUID only
|
|
332
|
+
else:
|
|
333
|
+
result[feat] = [
|
|
334
|
+
{"process_id": ver, "process_view_name": view}
|
|
335
|
+
for (ver, view) in pairs
|
|
336
|
+
]
|
|
337
|
+
|
|
338
|
+
return result
|
|
321
339
|
|
|
322
|
-
# results in dictionary:
|
|
323
|
-
results = {row['FEATURE_NAME']: row['FEATURE_VERSION'] for i, row in df.iterrows()}
|
|
324
|
-
if tdfs4ds.DEBUG_MODE == True:
|
|
325
|
-
print('---> RESULTS <---')
|
|
326
|
-
print(results)
|
|
327
340
|
|
|
328
|
-
# Returning the results as a dictionary with feature names as keys and their versions as values
|
|
329
|
-
return results
|
|
330
341
|
def get_entity_tables(entity_id, data_domain=None):
|
|
331
342
|
"""
|
|
332
343
|
Retrieves a list of table names associated with a given entity ID or IDs from a feature catalog within a specific data domain.
|
|
@@ -1011,7 +1011,7 @@ def delete_feature(feature_name, entity_id, data_domain=None):
|
|
|
1011
1011
|
print('table name : ', table_name)
|
|
1012
1012
|
|
|
1013
1013
|
query = f"""
|
|
1014
|
-
DELETE {table_name}
|
|
1014
|
+
NONSEQUENCED VALIDTIME DELETE {table_name}
|
|
1015
1015
|
WHERE FEATURE_ID = (
|
|
1016
1016
|
SEL FEATURE_ID FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW}
|
|
1017
1017
|
WHERE FEATURE_NAME = '{feature_name}'
|
|
@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
|
|
|
2
2
|
tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
|
|
3
3
|
tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
|
|
4
4
|
tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
|
|
5
|
-
tdfs4ds/__init__.py,sha256=
|
|
5
|
+
tdfs4ds/__init__.py,sha256=nxdiUTYsMHV8r-dKQt3sysy_Nxgwb82g9n0ZcrU-134,66290
|
|
6
6
|
tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
|
|
7
7
|
tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
|
|
8
8
|
tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576
|
|
@@ -18,8 +18,8 @@ tdfs4ds/dataset/dataset_catalog.py,sha256=qxS2thDW2MvsRouSFaX1M0sX2J7IzBAYD8Yf22
|
|
|
18
18
|
tdfs4ds/feature_store/__init__.py,sha256=a7NPCkpTx40UR5LRErwnskpABG2Vuib7F5wUjaUGCnI,209
|
|
19
19
|
tdfs4ds/feature_store/entity_management.py,sha256=9ltytv3yCTG84NZXBpb1Tlkf9pOxvrNb0MVidU4pwvE,10157
|
|
20
20
|
tdfs4ds/feature_store/feature_data_processing.py,sha256=rvpnFrV6Tmg8C6xcSQLT_lrFYqZsdSzFXmS-4suK9qg,42847
|
|
21
|
-
tdfs4ds/feature_store/feature_query_retrieval.py,sha256=
|
|
22
|
-
tdfs4ds/feature_store/feature_store_management.py,sha256=
|
|
21
|
+
tdfs4ds/feature_store/feature_query_retrieval.py,sha256=0ZLJWtV13tjaUdYCiQvPvYWxKs0f_3LZ2HgfQzHyaW4,33705
|
|
22
|
+
tdfs4ds/feature_store/feature_store_management.py,sha256=pWM9sjppBgRIg3l1ksoDJsM1fnaZlWtnuE3JuOP_2mY,54736
|
|
23
23
|
tdfs4ds/process_store/__init__.py,sha256=npHR_xju5ecGmWfYHDyteLwiU3x-cL4HD3sFK_th7xY,229
|
|
24
24
|
tdfs4ds/process_store/process_followup.py,sha256=PvLcU7meg3ljBlPfuez3qwTVqpHHhVJxYxGqjgiHE8E,7265
|
|
25
25
|
tdfs4ds/process_store/process_query_administration.py,sha256=DsIt97cBoJ7NcpQzbQt55eUFNgXGdOMm5Hh2aX5v0PY,7762
|
|
@@ -32,7 +32,7 @@ tdfs4ds/utils/lineage.py,sha256=gy5M42qy5fvdWmlohAY3WPYoqAyp5VakeEmeT1YjrJQ,3783
|
|
|
32
32
|
tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
|
|
33
33
|
tdfs4ds/utils/time_management.py,sha256=1eqGs7rT3SGag0F30R3PzwiC7Aa7DKia2Ud0aSNKcPg,10593
|
|
34
34
|
tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
|
|
35
|
-
tdfs4ds-0.2.4.
|
|
36
|
-
tdfs4ds-0.2.4.26.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
|
|
37
|
-
tdfs4ds-0.2.4.26.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
|
|
38
|
-
tdfs4ds-0.2.4.26.dist-info/RECORD,,
|
|
35
|
+
tdfs4ds-0.2.4.28.dist-info/METADATA,sha256=PwO281hIs7L2ZQsbwRgHnjU0Aq_98zAKoLP6SjI1t3s,14326
|
|
36
|
+
tdfs4ds-0.2.4.28.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
|
|
37
|
+
tdfs4ds-0.2.4.28.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
|
|
38
|
+
tdfs4ds-0.2.4.28.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|