tdfs4ds 0.2.4.25__py3-none-any.whl → 0.2.4.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +586 -564
- tdfs4ds/feature_store/feature_data_processing.py +367 -299
- tdfs4ds/feature_store/feature_query_retrieval.py +105 -52
- tdfs4ds/feature_store/feature_store_management.py +268 -285
- tdfs4ds/process_store/process_followup.py +113 -2
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/process_store/process_store_catalog_management.py +2 -2
- tdfs4ds/utils/filter_management.py +521 -138
- tdfs4ds/utils/query_management.py +18 -40
- tdfs4ds/utils/time_management.py +547 -97
- {tdfs4ds-0.2.4.25.dist-info → tdfs4ds-0.2.4.41.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.25.dist-info → tdfs4ds-0.2.4.41.dist-info}/RECORD +15 -15
- {tdfs4ds-0.2.4.25.dist-info → tdfs4ds-0.2.4.41.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.25.dist-info → tdfs4ds-0.2.4.41.dist-info}/top_level.txt +0 -0
|
@@ -249,84 +249,137 @@ def get_list_features(entity_name, domain=None):
|
|
|
249
249
|
return tdml.DataFrame.from_query(query)
|
|
250
250
|
|
|
251
251
|
|
|
252
|
-
def get_feature_versions(entity_name, features, domain=None, latest_version_only=True, version_lag=0):
|
|
252
|
+
def get_feature_versions(entity_name, features, domain=None):
|
|
253
253
|
"""
|
|
254
|
-
Retrieve
|
|
255
|
-
|
|
256
|
-
|
|
254
|
+
Retrieve version identifiers for one or more features belonging to a given entity.
|
|
255
|
+
|
|
256
|
+
The function queries the underlying metadata tables to find the *process*
|
|
257
|
+
(i.e., feature‑version) records that match the supplied entity and feature
|
|
258
|
+
names. It returns a mapping from each requested feature name to either:
|
|
259
|
+
|
|
260
|
+
* **None** – if no matching rows were found.
|
|
261
|
+
* A single UUID string – if exactly one matching row exists for the feature.
|
|
262
|
+
* A list of dictionaries – if more than one matching row is found; each
|
|
263
|
+
dictionary contains:
|
|
264
|
+
``process_id`` – the UUID of the process that produced the
|
|
265
|
+
version,
|
|
266
|
+
``process_view_name`` – the human‑readable view name associated with
|
|
267
|
+
that process.
|
|
268
|
+
|
|
269
|
+
Parameters
|
|
270
|
+
----------
|
|
271
|
+
entity_name : str | list[str]
|
|
272
|
+
The name (or names) of the entity whose features we are querying.
|
|
273
|
+
If a single string is supplied it is treated as a singleton list.
|
|
274
|
+
|
|
275
|
+
features : str | list[str]
|
|
276
|
+
One or more feature names to look up. Accepts a single string or
|
|
277
|
+
an iterable of strings; if a single string is provided it is wrapped in
|
|
278
|
+
a list internally.
|
|
279
|
+
|
|
280
|
+
domain : str, optional
|
|
281
|
+
The data‑domain partition to filter on. If omitted the default
|
|
282
|
+
``tdfs4ds.DATA_DOMAIN`` constant is used.
|
|
283
|
+
|
|
284
|
+
Returns
|
|
285
|
+
-------
|
|
286
|
+
dict[str, str | None | list[dict]]
|
|
287
|
+
A dictionary keyed by feature name. Each value is either:
|
|
288
|
+
* ``None`` – no records were found for that feature.
|
|
289
|
+
* ``str`` – a single UUID string when exactly one row matched.
|
|
290
|
+
* ``list[dict]`` – multiple matches; each dict has keys
|
|
291
|
+
``process_id`` and ``process_view_name``.
|
|
292
|
+
|
|
293
|
+
Notes
|
|
294
|
+
-----
|
|
295
|
+
* The query joins the feature catalog view with the process catalog
|
|
296
|
+
(specifically the “feature split” view) on data domain, entity ID,
|
|
297
|
+
and feature name.
|
|
298
|
+
* SQL string literals are escaped by doubling single quotes; this is a
|
|
299
|
+
lightweight escape that suffices for the current use‑case.
|
|
300
|
+
* The function preserves insertion order of features in the returned
|
|
301
|
+
dictionary (Python 3.7+ guarantees dict order).
|
|
302
|
+
* When ``tdfs4ds.DEBUG_MODE`` is true, the generated SQL statement is
|
|
303
|
+
printed to stdout – useful for troubleshooting.
|
|
304
|
+
|
|
305
|
+
Example
|
|
306
|
+
-------
|
|
307
|
+
>>> get_feature_versions('user', ['age', 'income'])
|
|
308
|
+
{'age': 'c1d2e3f4-...', 'income': None}
|
|
257
309
|
|
|
258
|
-
Parameters:
|
|
259
|
-
entity_name (str or list): The name of the entity or a list of entity names
|
|
260
|
-
for which feature versions are to be fetched.
|
|
261
|
-
features (list): A list of features for which versions are required.
|
|
262
|
-
domain (str, optional): The data domain to filter the feature versions.
|
|
263
|
-
Defaults to None, where a predefined domain is used.
|
|
264
|
-
latest_version_only (bool, optional): Flag to fetch only the latest version
|
|
265
|
-
of each feature. Defaults to True.
|
|
266
|
-
version_lag (int, optional): The number of versions to lag behind the latest.
|
|
267
|
-
Only effective if latest_version_only is True. Defaults to 0.
|
|
268
|
-
|
|
269
|
-
Returns:
|
|
270
|
-
dict: A dictionary with feature names as keys and their corresponding versions as values.
|
|
271
310
|
"""
|
|
272
311
|
|
|
273
|
-
|
|
312
|
+
|
|
313
|
+
# Normalize inputs
|
|
314
|
+
if isinstance(features, str):
|
|
315
|
+
features = [features]
|
|
316
|
+
|
|
317
|
+
if isinstance(entity_name, str):
|
|
318
|
+
entity_name = [entity_name]
|
|
319
|
+
|
|
274
320
|
if domain is None:
|
|
275
321
|
domain = tdfs4ds.DATA_DOMAIN
|
|
276
322
|
|
|
323
|
+
# Basic escaping for single quotes in values used in SQL literals
|
|
324
|
+
def _esc(s: str) -> str:
|
|
325
|
+
return s.replace("'", "''")
|
|
277
326
|
|
|
327
|
+
features_lits = ",".join(f"'{_esc(f)}'" for f in features)
|
|
278
328
|
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
entity_name.sort()
|
|
282
|
-
entity_name = ','.join(entity_name)
|
|
283
|
-
|
|
284
|
-
# Preparing the feature names for inclusion in the SQL query
|
|
285
|
-
if type(features) == list:
|
|
286
|
-
features = ["'" + f + "'" for f in features]
|
|
287
|
-
else:
|
|
288
|
-
features = "'" + features + "'"
|
|
329
|
+
entity_name.sort()
|
|
330
|
+
entity_name_str = ','.join(entity_name)
|
|
289
331
|
|
|
290
332
|
query = f"""
|
|
291
333
|
SELECT
|
|
292
334
|
A.FEATURE_NAME
|
|
293
335
|
, B.PROCESS_ID AS FEATURE_VERSION
|
|
336
|
+
, B.VIEW_NAME AS PROCESS_VIEW_NAME
|
|
294
337
|
FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW} A
|
|
295
338
|
INNER JOIN {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW_FEATURE_SPLIT} B
|
|
296
339
|
ON A.DATA_DOMAIN = B.DATA_DOMAIN
|
|
297
340
|
AND A.ENTITY_NAME = B.ENTITY_ID
|
|
298
341
|
AND A.FEATURE_NAME = B.FEATURE_NAME
|
|
299
342
|
WHERE A.DATA_DOMAIN = '{domain}'
|
|
300
|
-
AND A.ENTITY_NAME = '{entity_name}'
|
|
301
|
-
AND A.FEATURE_NAME IN ({','.join(features)})
|
|
343
|
+
AND A.ENTITY_NAME = '{entity_name_str}'
|
|
344
|
+
AND A.FEATURE_NAME IN ({features_lits})
|
|
302
345
|
"""
|
|
303
346
|
|
|
304
|
-
|
|
305
|
-
df = tdml.DataFrame.from_query(query).to_pandas()
|
|
306
|
-
|
|
307
|
-
# if df is empty
|
|
308
|
-
if df.shape[0] == 0:
|
|
309
|
-
print('the features you are requesting for this entity and data domain do not exist. Here is what you requested:')
|
|
310
|
-
print('feature store database :', tdfs4ds.SCHEMA)
|
|
311
|
-
print('feature catalog :', tdfs4ds.FEATURE_CATALOG_NAME_VIEW)
|
|
312
|
-
print('entity name :', entity_name)
|
|
313
|
-
print('data domain :', domain)
|
|
314
|
-
print('features :', ','.join(features))
|
|
315
|
-
print('')
|
|
347
|
+
if tdfs4ds.DEBUG_MODE:
|
|
316
348
|
print(query)
|
|
317
|
-
return
|
|
318
349
|
|
|
319
|
-
|
|
320
|
-
|
|
350
|
+
rows = tdml.execute_sql(query).fetchall()
|
|
351
|
+
|
|
352
|
+
# Initialize result for all requested features
|
|
353
|
+
result = {f: None for f in features}
|
|
354
|
+
|
|
355
|
+
# Collect (version, view) per feature, deduplicating while preserving order
|
|
356
|
+
tmp = {f: [] for f in features}
|
|
357
|
+
seen = {f: set() for f in features}
|
|
358
|
+
|
|
359
|
+
for feat, version, view_name in rows:
|
|
360
|
+
key = (version, view_name)
|
|
361
|
+
if key not in seen.setdefault(feat, set()):
|
|
362
|
+
seen[feat].add(key)
|
|
363
|
+
tmp.setdefault(feat, []).append(key)
|
|
364
|
+
|
|
365
|
+
# Shape:
|
|
366
|
+
# - if exactly one row: return UUID string
|
|
367
|
+
# - if multiple rows: list of {"process_id": <uuid>, "process_view_name": <str>}
|
|
368
|
+
for feat in result:
|
|
369
|
+
pairs = tmp.get(feat, [])
|
|
370
|
+
if len(pairs) == 0:
|
|
371
|
+
result[feat] = None
|
|
372
|
+
elif len(pairs) == 1:
|
|
373
|
+
result[feat] = pairs[0][0] # UUID only
|
|
374
|
+
else:
|
|
375
|
+
result[feat] = [
|
|
376
|
+
{"process_id": ver, "process_view_name": view}
|
|
377
|
+
for (ver, view) in pairs
|
|
378
|
+
]
|
|
379
|
+
|
|
380
|
+
return result
|
|
321
381
|
|
|
322
|
-
# results in dictionary:
|
|
323
|
-
results = {row['FEATURE_NAME']: row['FEATURE_VERSION'] for i, row in df.iterrows()}
|
|
324
|
-
if tdfs4ds.DEBUG_MODE == True:
|
|
325
|
-
print('---> RESULTS <---')
|
|
326
|
-
print(results)
|
|
327
382
|
|
|
328
|
-
# Returning the results as a dictionary with feature names as keys and their versions as values
|
|
329
|
-
return results
|
|
330
383
|
def get_entity_tables(entity_id, data_domain=None):
|
|
331
384
|
"""
|
|
332
385
|
Retrieves a list of table names associated with a given entity ID or IDs from a feature catalog within a specific data domain.
|