tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.4.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- __version__ = '0.2.4.26'
1
+ __version__ = '0.2.4.27'
2
2
  import logging
3
3
  # Setup the logger
4
4
  logging.basicConfig(
@@ -935,6 +935,10 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
935
935
  selected_features : dict
936
936
  A dictionary where the keys are feature table names, and the values are lists of tuples
937
937
  (feature_id, feature_version, feature_name) specifying the features to retrieve.
938
+ NOTE: feature_version may be either:
939
+ - a single UUID string, or
940
+ - a list of dicts like:
941
+ {"process_id": <UUID>, "process_view_name": <str>}
938
942
 
939
943
  view_name : str
940
944
  The name of the view to be created in the database.
@@ -1004,6 +1008,24 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
1004
1008
  # Sort the entity ID list for consistent query generation
1005
1009
  list_entity_id.sort()
1006
1010
 
1011
+ # Helpers
1012
+ import re
1013
+ def _sanitize_identifier(name: str) -> str:
1014
+ # Keep letters, numbers, and underscores; replace others with '_'
1015
+ return re.sub(r'[^0-9A-Za-z_]', '_', name)
1016
+
1017
+ used_alias_counts = {} # base_alias -> count
1018
+
1019
+ def _unique_alias(base: str) -> str:
1020
+ """
1021
+ Ensure alias uniqueness: if base already used, append _2, _3, ...
1022
+ """
1023
+ if base not in used_alias_counts:
1024
+ used_alias_counts[base] = 1
1025
+ return base
1026
+ used_alias_counts[base] += 1
1027
+ return f"{base}_{used_alias_counts[base]}"
1028
+
1007
1029
  # Initialize sub-query construction
1008
1030
  tdfs4ds.logger.info("Generating the sub-queries for feature retrieval.")
1009
1031
  sub_queries = []
@@ -1014,21 +1036,52 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
1014
1036
  # Construct sub-queries for each feature
1015
1037
  for k, v in list_features.items():
1016
1038
  for feature_id, feature_version, feature_name in v:
1017
- txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{feature_version}')"
1018
- feature_str = ',B1.FEATURE_VALUE AS ' + feature_name
1019
- sub_queries.append(
1020
- {
1021
- 'feature_name': feature_name,
1022
- 'query': f"""
1023
- SEQUENCED VALIDTIME
1024
- SELECT
1025
- {txt_entity}
1026
- {feature_str}
1027
- FROM {k} B1
1028
- WHERE {txt_where}
1029
- """
1030
- }
1031
- )
1039
+
1040
+ # Multiple processes: list of dicts
1041
+ if isinstance(feature_version, list):
1042
+ for item in feature_version:
1043
+ process_id = item.get("process_id")
1044
+ process_view_name = item.get("process_view_name") or "PROCESS"
1045
+ base_alias = _sanitize_identifier(f"{feature_name}_{process_view_name}")
1046
+ alias = _unique_alias(base_alias)
1047
+
1048
+ txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{process_id}')"
1049
+ feature_str = ',B1.FEATURE_VALUE AS ' + alias
1050
+
1051
+ sub_queries.append(
1052
+ {
1053
+ 'feature_name': alias,
1054
+ 'query': f"""
1055
+ SEQUENCED VALIDTIME
1056
+ SELECT
1057
+ {txt_entity}
1058
+ {feature_str}
1059
+ FROM {k} B1
1060
+ WHERE {txt_where}
1061
+ """
1062
+ }
1063
+ )
1064
+
1065
+ # Single UUID
1066
+ else:
1067
+ base_alias = _sanitize_identifier(feature_name)
1068
+ alias = _unique_alias(base_alias)
1069
+
1070
+ txt_where = f"(FEATURE_ID = {feature_id} AND FEATURE_VERSION='{feature_version}')"
1071
+ feature_str = ',B1.FEATURE_VALUE AS ' + alias
1072
+ sub_queries.append(
1073
+ {
1074
+ 'feature_name': alias,
1075
+ 'query': f"""
1076
+ SEQUENCED VALIDTIME
1077
+ SELECT
1078
+ {txt_entity}
1079
+ {feature_str}
1080
+ FROM {k} B1
1081
+ WHERE {txt_where}
1082
+ """
1083
+ }
1084
+ )
1032
1085
 
1033
1086
  # Handle case where no features are available
1034
1087
  if len(sub_queries) == 0:
@@ -1102,6 +1155,7 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
1102
1155
  return tdml.DataFrame.from_table(tdml.in_schema(schema_name, view_name))
1103
1156
 
1104
1157
 
1158
+
1105
1159
  def build_dataset_opt(entity_id, selected_features, view_name = None, schema_name=tdfs4ds.SCHEMA,
1106
1160
  comment='dataset', no_temporal=False, time_manager=None, query_only=False, entity_null_substitute={},
1107
1161
  other=None, time_column=None, filtermanager = None, filter_conditions = None
@@ -249,48 +249,49 @@ def get_list_features(entity_name, domain=None):
249
249
  return tdml.DataFrame.from_query(query)
250
250
 
251
251
 
252
- def get_feature_versions(entity_name, features, domain=None, latest_version_only=True, version_lag=0):
252
+ def get_feature_versions(entity_name, features, domain=None):
253
253
  """
254
- Retrieve feature versions for specified features associated with certain entities
255
- from a given data domain. This function allows fetching either all versions or
256
- just the latest versions of the features.
254
+ Retrieve version UUID(s) for the given features of an entity within a domain.
257
255
 
258
256
  Parameters:
259
- entity_name (str or list): The name of the entity or a list of entity names
260
- for which feature versions are to be fetched.
261
- features (list): A list of features for which versions are required.
262
- domain (str, optional): The data domain to filter the feature versions.
263
- Defaults to None, where a predefined domain is used.
264
- latest_version_only (bool, optional): Flag to fetch only the latest version
265
- of each feature. Defaults to True.
266
- version_lag (int, optional): The number of versions to lag behind the latest.
267
- Only effective if latest_version_only is True. Defaults to 0.
257
+ - entity_name (str): The entity name to which the features belong.
258
+ - features (str | list[str]): Feature name or list of feature names.
259
+ - domain (str, optional): Data domain to filter on. If None, defaults to tdfs4ds.DATA_DOMAIN.
268
260
 
269
261
  Returns:
270
- dict: A dictionary with feature names as keys and their corresponding versions as values.
262
+ - dict[str, str | list[dict]]: Maps each requested feature name to either:
263
+ - a single version UUID string if exactly one row exists, or
264
+ - a list of dicts if multiple rows exist; each dict has:
265
+ {
266
+ "process_id": <FEATURE_VERSION UUID>,
267
+ "process_view_name": <PROCESS_VIEW_NAME string>
268
+ }
269
+ If a requested feature has no entries, it will be present with value None.
270
+
271
+ Notes:
272
+ - Uses {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} as A and
273
+ {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW_FEATURE_SPLIT} as B (must exist), joined on keys including DATA_DOMAIN and FEATURE_NAME.
274
+ - Respects tdfs4ds.DEBUG_MODE to print the generated SQL.
271
275
  """
272
276
 
273
- # Default to a predefined data domain if none is provided
277
+ # Normalize inputs
278
+ if isinstance(features, str):
279
+ features = [features]
280
+
274
281
  if domain is None:
275
282
  domain = tdfs4ds.DATA_DOMAIN
276
283
 
284
+ # Basic escaping for single quotes in values used in SQL literals
285
+ def _esc(s: str) -> str:
286
+ return s.replace("'", "''")
277
287
 
278
-
279
- # Convert the entity_name to a string if it is a list
280
- if type(entity_name) == list:
281
- entity_name.sort()
282
- entity_name = ','.join(entity_name)
283
-
284
- # Preparing the feature names for inclusion in the SQL query
285
- if type(features) == list:
286
- features = ["'" + f + "'" for f in features]
287
- else:
288
- features = "'" + features + "'"
288
+ features_lits = ",".join(f"'{_esc(f)}'" for f in features)
289
289
 
290
290
  query = f"""
291
291
  SELECT
292
292
  A.FEATURE_NAME
293
293
  , B.PROCESS_ID AS FEATURE_VERSION
294
+ , B.VIEW_NAME AS PROCESS_VIEW_NAME
294
295
  FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} A
295
296
  INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW_FEATURE_SPLIT} B
296
297
  ON A.DATA_DOMAIN = B.DATA_DOMAIN
@@ -298,35 +299,45 @@ def get_feature_versions(entity_name, features, domain=None, latest_version_only
298
299
  AND A.FEATURE_NAME = B.FEATURE_NAME
299
300
  WHERE A.DATA_DOMAIN = '{domain}'
300
301
  AND A.ENTITY_NAME = '{entity_name}'
301
- AND A.FEATURE_NAME IN ({','.join(features)})
302
+ AND A.FEATURE_NAME IN ({features_lits})
302
303
  """
303
304
 
304
- # Executing the first query and converting the results to a pandas DataFrame
305
- df = tdml.DataFrame.from_query(query).to_pandas()
306
-
307
- # if df is empty
308
- if df.shape[0] == 0:
309
- print('the features you are requesting for this entity and data domain do not exist. Here is what you requested:')
310
- print('feature store database :', tdfs4ds.SCHEMA)
311
- print('feature catalog :', tdfs4ds.FEATURE_CATALOG_NAME_VIEW)
312
- print('entity name :', entity_name)
313
- print('data domain :', domain)
314
- print('features :', ','.join(features))
315
- print('')
305
+ if tdfs4ds.DEBUG_MODE:
316
306
  print(query)
317
- return
318
307
 
319
- if tdfs4ds.DEBUG_MODE == True:
320
- print(query)
308
+ rows = tdml.execute_sql(query).fetchall()
309
+
310
+ # Initialize result for all requested features
311
+ result = {f: None for f in features}
312
+
313
+ # Collect (version, view) per feature, deduplicating while preserving order
314
+ tmp = {f: [] for f in features}
315
+ seen = {f: set() for f in features}
316
+
317
+ for feat, version, view_name in rows:
318
+ key = (version, view_name)
319
+ if key not in seen.setdefault(feat, set()):
320
+ seen[feat].add(key)
321
+ tmp.setdefault(feat, []).append(key)
322
+
323
+ # Shape:
324
+ # - if exactly one row: return UUID string
325
+ # - if multiple rows: list of {"process_id": <uuid>, "process_view_name": <str>}
326
+ for feat in result:
327
+ pairs = tmp.get(feat, [])
328
+ if len(pairs) == 0:
329
+ result[feat] = None
330
+ elif len(pairs) == 1:
331
+ result[feat] = pairs[0][0] # UUID only
332
+ else:
333
+ result[feat] = [
334
+ {"process_id": ver, "process_view_name": view}
335
+ for (ver, view) in pairs
336
+ ]
337
+
338
+ return result
321
339
 
322
- # results in dictionary:
323
- results = {row['FEATURE_NAME']: row['FEATURE_VERSION'] for i, row in df.iterrows()}
324
- if tdfs4ds.DEBUG_MODE == True:
325
- print('---> RESULTS <---')
326
- print(results)
327
340
 
328
- # Returning the results as a dictionary with feature names as keys and their versions as values
329
- return results
330
341
  def get_entity_tables(entity_id, data_domain=None):
331
342
  """
332
343
  Retrieves a list of table names associated with a given entity ID or IDs from a feature catalog within a specific data domain.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: tdfs4ds
3
- Version: 0.2.4.26
3
+ Version: 0.2.4.27
4
4
  Summary: A python package to simplify the usage of feature store using Teradata Vantage ...
5
5
  Author: Denis Molin
6
6
  Requires-Python: >=3.6
@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
2
2
  tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
3
3
  tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
4
4
  tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
5
- tdfs4ds/__init__.py,sha256=_UnSzqinlnbLOM4wOTxJrT1a_qTn6mRiNHz4jE6bRaI,64168
5
+ tdfs4ds/__init__.py,sha256=sHzEWvxrBA_DBbOBJOsFuIxz0qX9MAY3zdS20gnCz_Q,66290
6
6
  tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
7
7
  tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
8
8
  tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576
@@ -18,7 +18,7 @@ tdfs4ds/dataset/dataset_catalog.py,sha256=qxS2thDW2MvsRouSFaX1M0sX2J7IzBAYD8Yf22
18
18
  tdfs4ds/feature_store/__init__.py,sha256=a7NPCkpTx40UR5LRErwnskpABG2Vuib7F5wUjaUGCnI,209
19
19
  tdfs4ds/feature_store/entity_management.py,sha256=9ltytv3yCTG84NZXBpb1Tlkf9pOxvrNb0MVidU4pwvE,10157
20
20
  tdfs4ds/feature_store/feature_data_processing.py,sha256=rvpnFrV6Tmg8C6xcSQLT_lrFYqZsdSzFXmS-4suK9qg,42847
21
- tdfs4ds/feature_store/feature_query_retrieval.py,sha256=zuHRZhL6-qyLpPS7mWgRy1WingSN5iibkbi53Q7jfAs,33834
21
+ tdfs4ds/feature_store/feature_query_retrieval.py,sha256=0ZLJWtV13tjaUdYCiQvPvYWxKs0f_3LZ2HgfQzHyaW4,33705
22
22
  tdfs4ds/feature_store/feature_store_management.py,sha256=ufIBTdrnHBvGdXggavJoTVoZjOHFtH5ZiYqJr5eIBhg,54713
23
23
  tdfs4ds/process_store/__init__.py,sha256=npHR_xju5ecGmWfYHDyteLwiU3x-cL4HD3sFK_th7xY,229
24
24
  tdfs4ds/process_store/process_followup.py,sha256=PvLcU7meg3ljBlPfuez3qwTVqpHHhVJxYxGqjgiHE8E,7265
@@ -32,7 +32,7 @@ tdfs4ds/utils/lineage.py,sha256=gy5M42qy5fvdWmlohAY3WPYoqAyp5VakeEmeT1YjrJQ,3783
32
32
  tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
33
33
  tdfs4ds/utils/time_management.py,sha256=1eqGs7rT3SGag0F30R3PzwiC7Aa7DKia2Ud0aSNKcPg,10593
34
34
  tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
35
- tdfs4ds-0.2.4.26.dist-info/METADATA,sha256=15eq8Z08VdFjD-GXC2cLqGvfb8OQoDRi3oPlmTyiq00,14326
36
- tdfs4ds-0.2.4.26.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
37
- tdfs4ds-0.2.4.26.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
38
- tdfs4ds-0.2.4.26.dist-info/RECORD,,
35
+ tdfs4ds-0.2.4.27.dist-info/METADATA,sha256=0zXOf1EjCvIPgXK3EyOtMDlF4ZB5nArvMsKcqFqknTg,14326
36
+ tdfs4ds-0.2.4.27.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
37
+ tdfs4ds-0.2.4.27.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
38
+ tdfs4ds-0.2.4.27.dist-info/RECORD,,