tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.4.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -249,84 +249,137 @@ def get_list_features(entity_name, domain=None):
249
249
  return tdml.DataFrame.from_query(query)
250
250
 
251
251
 
252
- def get_feature_versions(entity_name, features, domain=None, latest_version_only=True, version_lag=0):
252
+ def get_feature_versions(entity_name, features, domain=None):
253
253
  """
254
- Retrieve feature versions for specified features associated with certain entities
255
- from a given data domain. This function allows fetching either all versions or
256
- just the latest versions of the features.
254
+ Retrieve version identifiers for one or more features belonging to a given entity.
255
+
256
+ The function queries the underlying metadata tables to find the *process*
257
+ (i.e., feature‑version) records that match the supplied entity and feature
258
+ names. It returns a mapping from each requested feature name to either:
259
+
260
+ * **None** – if no matching rows were found.
261
+ * A single UUID string – if exactly one distinct (process ID, view name) pair matches the feature.
262
+ * A list of dictionaries – if more than one distinct pair is found (duplicate rows are collapsed); each
263
+ dictionary contains:
264
+ ``process_id`` – the UUID of the process that produced the
265
+ version,
266
+ ``process_view_name`` – the human‑readable view name associated with
267
+ that process.
268
+
269
+ Parameters
270
+ ----------
271
+ entity_name : str | list[str]
272
+ The entity name, or the list of names that together form one entity key.
273
+ A single string is treated as a singleton list; a list is sorted in place and comma-joined into the single value compared against ENTITY_NAME.
274
+
275
+ features : str | list[str]
276
+ One or more feature names to look up. Accepts a single string or
277
+ an iterable of strings; if a single string is provided it is wrapped in
278
+ a list internally.
279
+
280
+ domain : str, optional
281
+ The data‑domain partition to filter on. If omitted the default
282
+ ``tdfs4ds.DATA_DOMAIN`` constant is used.
283
+
284
+ Returns
285
+ -------
286
+ dict[str, str | None | list[dict]]
287
+ A dictionary keyed by feature name. Each value is either:
288
+ * ``None`` – no records were found for that feature.
289
+ * ``str`` – a single UUID string when exactly one row matched.
290
+ * ``list[dict]`` – multiple matches; each dict has keys
291
+ ``process_id`` and ``process_view_name``.
292
+
293
+ Notes
294
+ -----
295
+ * The query joins the feature catalog view with the process catalog
296
+ (specifically the “feature split” view) on data domain, entity ID,
297
+ and feature name.
298
+ * Feature names are escaped by doubling single quotes before being embedded
299
+ in the SQL text; the domain and entity name are interpolated without escaping, so they must come from trusted input.
300
+ * The function preserves insertion order of features in the returned
301
+ dictionary (Python 3.7+ guarantees dict order).
302
+ * When ``tdfs4ds.DEBUG_MODE`` is true, the generated SQL statement is
303
+ printed to stdout – useful for troubleshooting.
304
+
305
+ Example
306
+ -------
307
+ >>> get_feature_versions('user', ['age', 'income'])
308
+ {'age': 'c1d2e3f4-...', 'income': None}
257
309
 
258
- Parameters:
259
- entity_name (str or list): The name of the entity or a list of entity names
260
- for which feature versions are to be fetched.
261
- features (list): A list of features for which versions are required.
262
- domain (str, optional): The data domain to filter the feature versions.
263
- Defaults to None, where a predefined domain is used.
264
- latest_version_only (bool, optional): Flag to fetch only the latest version
265
- of each feature. Defaults to True.
266
- version_lag (int, optional): The number of versions to lag behind the latest.
267
- Only effective if latest_version_only is True. Defaults to 0.
268
-
269
- Returns:
270
- dict: A dictionary with feature names as keys and their corresponding versions as values.
271
310
  """
272
311
 
273
- # Default to a predefined data domain if none is provided
312
+
313
+ # Normalize inputs
314
+ if isinstance(features, str):
315
+ features = [features]
316
+
317
+ if isinstance(entity_name, str):
318
+ entity_name = [entity_name]
319
+
274
320
  if domain is None:
275
321
  domain = tdfs4ds.DATA_DOMAIN
276
322
 
323
+ # Basic escaping for single quotes in values used in SQL literals
324
+ def _esc(s: str) -> str:
325
+ return s.replace("'", "''")
277
326
 
327
+ features_lits = ",".join(f"'{_esc(f)}'" for f in features)
278
328
 
279
- # Convert the entity_name to a string if it is a list
280
- if type(entity_name) == list:
281
- entity_name.sort()
282
- entity_name = ','.join(entity_name)
283
-
284
- # Preparing the feature names for inclusion in the SQL query
285
- if type(features) == list:
286
- features = ["'" + f + "'" for f in features]
287
- else:
288
- features = "'" + features + "'"
329
+ entity_name.sort()
330
+ entity_name_str = ','.join(entity_name)
289
331
 
290
332
  query = f"""
291
333
  SELECT
292
334
  A.FEATURE_NAME
293
335
  , B.PROCESS_ID AS FEATURE_VERSION
336
+ , B.VIEW_NAME AS PROCESS_VIEW_NAME
294
337
  FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} A
295
338
  INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW_FEATURE_SPLIT} B
296
339
  ON A.DATA_DOMAIN = B.DATA_DOMAIN
297
340
  AND A.ENTITY_NAME = B.ENTITY_ID
298
341
  AND A.FEATURE_NAME = B.FEATURE_NAME
299
342
  WHERE A.DATA_DOMAIN = '{domain}'
300
- AND A.ENTITY_NAME = '{entity_name}'
301
- AND A.FEATURE_NAME IN ({','.join(features)})
343
+ AND A.ENTITY_NAME = '{entity_name_str}'
344
+ AND A.FEATURE_NAME IN ({features_lits})
302
345
  """
303
346
 
304
- # Executing the first query and converting the results to a pandas DataFrame
305
- df = tdml.DataFrame.from_query(query).to_pandas()
306
-
307
- # if df is empty
308
- if df.shape[0] == 0:
309
- print('the features you are requesting for this entity and data domain do not exist. Here is what you requested:')
310
- print('feature store database :', tdfs4ds.SCHEMA)
311
- print('feature catalog :', tdfs4ds.FEATURE_CATALOG_NAME_VIEW)
312
- print('entity name :', entity_name)
313
- print('data domain :', domain)
314
- print('features :', ','.join(features))
315
- print('')
347
+ if tdfs4ds.DEBUG_MODE:
316
348
  print(query)
317
- return
318
349
 
319
- if tdfs4ds.DEBUG_MODE == True:
320
- print(query)
350
+ rows = tdml.execute_sql(query).fetchall()
351
+
352
+ # Initialize result for all requested features
353
+ result = {f: None for f in features}
354
+
355
+ # Collect (version, view) per feature, deduplicating while preserving order
356
+ tmp = {f: [] for f in features}
357
+ seen = {f: set() for f in features}
358
+
359
+ for feat, version, view_name in rows:
360
+ key = (version, view_name)
361
+ if key not in seen.setdefault(feat, set()):
362
+ seen[feat].add(key)
363
+ tmp.setdefault(feat, []).append(key)
364
+
365
+ # Shape:
366
+ # - if exactly one row: return UUID string
367
+ # - if multiple rows: list of {"process_id": <uuid>, "process_view_name": <str>}
368
+ for feat in result:
369
+ pairs = tmp.get(feat, [])
370
+ if len(pairs) == 0:
371
+ result[feat] = None
372
+ elif len(pairs) == 1:
373
+ result[feat] = pairs[0][0] # UUID only
374
+ else:
375
+ result[feat] = [
376
+ {"process_id": ver, "process_view_name": view}
377
+ for (ver, view) in pairs
378
+ ]
379
+
380
+ return result
321
381
 
322
- # results in dictionary:
323
- results = {row['FEATURE_NAME']: row['FEATURE_VERSION'] for i, row in df.iterrows()}
324
- if tdfs4ds.DEBUG_MODE == True:
325
- print('---> RESULTS <---')
326
- print(results)
327
382
 
328
- # Returning the results as a dictionary with feature names as keys and their versions as values
329
- return results
330
383
  def get_entity_tables(entity_id, data_domain=None):
331
384
  """
332
385
  Retrieves a list of table names associated with a given entity ID or IDs from a feature catalog within a specific data domain.