tdfs4ds-0.2.4.3-py3-none-any.whl → tdfs4ds-0.2.4.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +26 -9
- tdfs4ds/feature_store/feature_store_management.py +9 -2
- tdfs4ds/utils/info.py +39 -1
- {tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/RECORD +7 -7
- {tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-__version__ = '0.2.4.3'
+__version__ = '0.2.4.4'
 import logging
 # Setup the logger
 logging.basicConfig(

@@ -310,7 +310,7 @@ def get_dataset_entity(dataset_id = None):
 def get_dataset_features(dataset_id = None):
     return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)

-def run(process_id, return_dataset = False, force_compute = False):
+def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
     """
     Executes a specific process from the feature store identified by the process ID.
     The function handles different process types and performs appropriate actions.

@@ -321,6 +321,10 @@ def run(process_id, return_dataset = False, force_compute = False):
                             Default is False.
     - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
                             Default is False.
+    - force_varchar_length (int, optional): to avoid multiplying feature tables when dealing with the VARCHAR type,
+                            casts VARCHAR features to VARCHAR(k x force_varchar_length), where k is the smallest
+                            integer such that the original length is less than or equal to k x force_varchar_length.
+                            Default is None.

     Returns:
     DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.

@@ -423,7 +427,8 @@ def run(process_id, return_dataset = False, force_compute = False):
         filtermanager = filtermanager,
         entity_null_substitute = entity_null_substitute,
         process_id = process_id,
-        force_compute = force_compute
+        force_compute = force_compute,
+        force_varchar_length = force_varchar_length
     )

     # Handling 'tdstone2 view' process type

@@ -437,7 +442,7 @@ def run(process_id, return_dataset = False, force_compute = False):
     else:
         return

-def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
+def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
     """
     Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
     process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset

@@ -463,7 +468,10 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
                             Default is an empty dictionary.
     - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
                             Default is True.
-
+    - force_varchar_length (int, optional): to avoid multiplying feature tables when dealing with the VARCHAR type,
+                            casts VARCHAR features to VARCHAR(k x force_varchar_length), where k is the smallest
+                            integer such that the original length is less than or equal to k x force_varchar_length.
+                            Default is 1024.
     Returns:
     DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
     or further processing.

@@ -575,7 +583,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):

     try:

-        dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute)
+        dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute, force_varchar_length = force_varchar_length)

     except Exception as e:
         tdfs4ds.process_store.process_followup.followup_close(

@@ -591,7 +599,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
     else:

         try:
-            run(process_id=process_id, return_dataset=False)
+            run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
         except Exception as e:
             tdfs4ds.process_store.process_followup.followup_close(
                 run_id = tdfs4ds.RUN_ID,

@@ -605,7 +613,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
     tdfs4ds.PROCESS_TYPE = PROCESS_TYPE

 def _upload_features(df, entity_id, feature_names,
-                     feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False):
+                     feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False, force_varchar_length = None):
     """
     Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
     feature registration, preparation for ingestion, and storage in the designated feature tables.

@@ -628,6 +636,11 @@ def _upload_features(df, entity_id, feature_names,
     - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
     - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
                             Default is False.
+    - force_varchar_length (int, optional): to avoid multiplying feature tables when dealing with the VARCHAR type,
+                            casts VARCHAR features to VARCHAR(k x force_varchar_length), where k is the smallest
+                            integer such that the original length is less than or equal to k x force_varchar_length.
+                            Default is None.
+

     Returns:
     DataFrame: A DataFrame representing the dataset view created in the feature store, detailing the features and their

@@ -655,7 +668,7 @@ def _upload_features(df, entity_id, feature_names,
     from tdfs4ds.feature_store.feature_store_management import register_features
     from tdfs4ds.feature_store.feature_data_processing import prepare_feature_ingestion
     from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
-    from tdfs4ds.utils.info import get_column_types
+    from tdfs4ds.utils.info import get_column_types, update_varchar_length

     # Convert entity_id to a dictionary if it's not already one
     if type(entity_id) == list:

@@ -685,6 +698,10 @@ def _upload_features(df, entity_id, feature_names,
         entity_id=entity_id
     )

+    if force_varchar_length is not None:
+        print(feature_names_types)
+        feature_names_types = update_varchar_length(feature_names_types, new_varchar_length = force_varchar_length)
+
     def validate_feature_types(feature_names_types):
         """
         Validates feature data types and raises an error if any value contains
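For orientation, a minimal sketch of how the new parameter is used from the public API. The table, entity, and feature names below are illustrative, and a configured tdfs4ds feature store with a connected teradataml session is assumed:

```python
import tdfs4ds
import teradataml as tdml

# Illustrative source table; assumes an established Teradata connection
# and a configured tdfs4ds feature store.
df = tdml.DataFrame('transactions')

dataset = tdfs4ds.upload_features(
    df,
    entity_id=['customer_id'],
    feature_names=['tx_amount', 'tx_comment'],
    force_varchar_length=1024,  # VARCHAR lengths rounded up to a multiple of 1024
)
```

With this setting, a VARCHAR(40) and a VARCHAR(900) feature both land in a VARCHAR(1024) feature table, so small differences in string length no longer spawn separate tables.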
tdfs4ds/feature_store/feature_store_management.py
CHANGED

@@ -73,6 +73,7 @@ def feature_store_catalog_creation(if_exists='replace', comment='this table is a

     FEATURE_ID BIGINT,
     FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+    FEATURE_TYPE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
     FEATURE_TABLE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
     FEATURE_DATABASE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
     FEATURE_VIEW VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,

@@ -410,12 +411,12 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     # Create a DataFrame from the feature_names_types dictionary
     if len(feature_names_types.keys()) > 1:
         df = pd.DataFrame(feature_names_types).transpose().reset_index()
-        df.columns = ['FEATURE_NAME', 'TYPE', 'FEATURE_ID']
+        df.columns = ['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_ID']
     else:
         df = pd.DataFrame(columns=['FEATURE_NAME', 'TYPE', 'FEATURE_ID'])
         k = list(feature_names_types.keys())[0]
         df['FEATURE_NAME'] = [k]
-        df['TYPE'] = [feature_names_types[k]['type']]
+        df['FEATURE_TYPE'] = [feature_names_types[k]['type']]
         df['FEATURE_ID'] = [feature_names_types[k]['id']]

@@ -458,6 +459,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     SELECT
         CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
     , A.FEATURE_NAME
+    , A.FEATURE_TYPE
     , A.FEATURE_TABLE
     , A.FEATURE_DATABASE
     , A.FEATURE_VIEW

@@ -476,6 +478,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     UPDATE
     SET
         FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
+        FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
         FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
         FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
     --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified

@@ -483,6 +486,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     INSERT
     ( UPDATED_FEATURES.FEATURE_ID
     , UPDATED_FEATURES.FEATURE_NAME
+    , UPDATED_FEATURES.FEATURE_TYPE
     , UPDATED_FEATURES.FEATURE_TABLE
     , UPDATED_FEATURES.FEATURE_DATABASE
     , UPDATED_FEATURES.FEATURE_VIEW

@@ -498,6 +502,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     SELECT
         CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
     , A.FEATURE_NAME
+    , A.FEATURE_TYPE
     , A.FEATURE_TABLE
     , A.FEATURE_DATABASE
     , A.FEATURE_VIEW

@@ -516,6 +521,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     UPDATE
     SET
         FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
+        FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
         FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
         FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
     --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified

@@ -523,6 +529,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     INSERT
     ( UPDATED_FEATURES.FEATURE_ID
     , UPDATED_FEATURES.FEATURE_NAME
+    , UPDATED_FEATURES.FEATURE_TYPE
     , UPDATED_FEATURES.FEATURE_TABLE
     , UPDATED_FEATURES.FEATURE_DATABASE
     , UPDATED_FEATURES.FEATURE_VIEW
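The catalog now carries a FEATURE_TYPE column alongside each feature's name and ID. As a sketch of the DataFrame shape that _register_features_merge builds before the MERGE, with illustrative feature metadata:

```python
import pandas as pd

# Illustrative feature metadata in the shape consumed by _register_features_merge
feature_names_types = {
    'tx_amount':  {'type': 'FLOAT', 'id': 1},
    'tx_comment': {'type': 'VARCHAR(2048) CHARACTER SET LATIN', 'id': 2},
}

df = pd.DataFrame(feature_names_types).transpose().reset_index()
df.columns = ['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_ID']
print(df)
#   FEATURE_NAME                       FEATURE_TYPE FEATURE_ID
# 0    tx_amount                              FLOAT          1
# 1   tx_comment  VARCHAR(2048) CHARACTER SET LATIN          2
```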
tdfs4ds/utils/info.py
CHANGED
@@ -2,6 +2,8 @@ import re

 import tdfs4ds
 import teradataml as tdml
+from tdfs4ds import logger
+import numpy as np

 def get_column_types(df, columns):
     """

@@ -264,4 +266,40 @@ def get_feature_types_sql_format(tddf, columns = None):
     res = tdml.DataFrame.from_query(query).to_pandas()

     # Return column names with their corresponding SQL data types in a dictionary
-    return {c: res[c].values[0].strip() for c in columns}
+    return {c: res[c].values[0].strip() for c in columns}
+
+def update_varchar_length(feature_types: dict, new_varchar_length: int) -> dict:
+    """
+    Updates the length of all VARCHAR fields in the feature_types dictionary based on an increment.
+    The new length is calculated as ceil(previous_length / new_varchar_length) * new_varchar_length,
+    ensuring that when new_varchar_length is equal to the current length, no change occurs.
+
+    Args:
+        feature_types (dict): A dictionary where keys are feature names and values are dictionaries with 'type' and 'id'.
+        new_varchar_length (int): The increment value for adjusting VARCHAR lengths.
+
+    Returns:
+        dict: A dictionary with updated VARCHAR lengths.
+
+    Issues a warning if the new length is smaller than the original length.
+    """
+    updated_feature_types = {}
+    varchar_pattern = re.compile(r'VARCHAR\((\d+)\)', re.IGNORECASE)
+
+    for key, value in feature_types.items():
+        type_value = value['type']
+        match = varchar_pattern.search(type_value)
+        if match:
+            original_length = int(match.group(1))
+            modified_length = int(np.ceil(original_length / new_varchar_length) * new_varchar_length)
+
+            if modified_length < original_length:
+                logger.warning(f"Reducing VARCHAR length for {key} from {original_length} to {modified_length}")
+
+            # Replace only the VARCHAR length
+            updated_value = varchar_pattern.sub(f'VARCHAR({modified_length})', type_value)
+            updated_feature_types[key] = {'type': updated_value, 'id': value['id']}
+        else:
+            updated_feature_types[key] = value
+
+    return updated_feature_types
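To see the rounding rule in isolation, here is a standalone sketch that reimplements it outside the package (round_varchar is a hypothetical helper, not part of tdfs4ds):

```python
import re
import numpy as np

def round_varchar(type_value: str, increment: int) -> str:
    """Round a VARCHAR(n) length up to the next multiple of `increment`."""
    pattern = re.compile(r'VARCHAR\((\d+)\)', re.IGNORECASE)
    match = pattern.search(type_value)
    if not match:
        return type_value  # non-VARCHAR types pass through unchanged
    original = int(match.group(1))
    rounded = int(np.ceil(original / increment) * increment)
    return pattern.sub(f'VARCHAR({rounded})', type_value)

print(round_varchar('VARCHAR(40)', 1024))                        # VARCHAR(1024)
print(round_varchar('VARCHAR(1024)', 1024))                      # VARCHAR(1024), unchanged
print(round_varchar('VARCHAR(1500) CHARACTER SET LATIN', 1024))  # VARCHAR(2048) CHARACTER SET LATIN
print(round_varchar('BIGINT', 1024))                             # BIGINT
```

Lengths that are already an exact multiple of the increment are left unchanged, which is why a width that equals force_varchar_length passes through as-is.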
{tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/RECORD
CHANGED

@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
 tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
 tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
 tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
-tdfs4ds/__init__.py,sha256=
+tdfs4ds/__init__.py,sha256=BSE-ct7eaUwHsch7GGloYMSDWTb9nPLSeR-LSDZNY18,65844
 tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
 tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
 tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576

@@ -13,7 +13,7 @@ tdfs4ds/feature_store/__init__.py,sha256=a7NPCkpTx40UR5LRErwnskpABG2Vuib7F5wUjaU
 tdfs4ds/feature_store/entity_management.py,sha256=9ltytv3yCTG84NZXBpb1Tlkf9pOxvrNb0MVidU4pwvE,10157
 tdfs4ds/feature_store/feature_data_processing.py,sha256=SuJeCTJF51l9-VS9WRS0oBUnxaVqba4hqjOpsCtdVs8,42352
 tdfs4ds/feature_store/feature_query_retrieval.py,sha256=zuHRZhL6-qyLpPS7mWgRy1WingSN5iibkbi53Q7jfAs,33834
-tdfs4ds/feature_store/feature_store_management.py,sha256=
+tdfs4ds/feature_store/feature_store_management.py,sha256=WcgawACgC_lI880wj_FO2wV_FIp0W5WZ3x7r2-0WKdI,56121
 tdfs4ds/process_store/__init__.py,sha256=npHR_xju5ecGmWfYHDyteLwiU3x-cL4HD3sFK_th7xY,229
 tdfs4ds/process_store/process_followup.py,sha256=PvLcU7meg3ljBlPfuez3qwTVqpHHhVJxYxGqjgiHE8E,7265
 tdfs4ds/process_store/process_query_administration.py,sha256=DsIt97cBoJ7NcpQzbQt55eUFNgXGdOMm5Hh2aX5v0PY,7762

@@ -21,12 +21,12 @@ tdfs4ds/process_store/process_registration_management.py,sha256=F8VlBoL-de98KnkM
 tdfs4ds/process_store/process_store_catalog_management.py,sha256=H135RRTYn-pyWIqPVbHpuIyyvsaNrek6b1iPk8avJMI,16088
 tdfs4ds/utils/__init__.py,sha256=-yTMfDLZbQnIRQ64s_bczzT21tDW2A8FZeq9PX5SgFU,168
 tdfs4ds/utils/filter_management.py,sha256=7D47N_hnTSUVOkaV2XuKrlUFMxzWjDsCBvRYsH4lXdU,11073
-tdfs4ds/utils/info.py,sha256=
+tdfs4ds/utils/info.py,sha256=SQR_ec4M9-5Z4erjb9_N0n8JPY1wpelgxkw3B12D1Q4,12322
 tdfs4ds/utils/lineage.py,sha256=LI-5pG7D8lO3-YFa9qA6CrEackiYugV23_Vz9IpF5xw,28670
 tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
 tdfs4ds/utils/time_management.py,sha256=_jbwdyZH4Yr3VzbUrq6X93FpXDCDEdH0iv56vX7j8mA,8446
 tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
-tdfs4ds-0.2.4.3.dist-info/METADATA,sha256=
-tdfs4ds-0.2.4.3.dist-info/WHEEL,sha256=
-tdfs4ds-0.2.4.3.dist-info/top_level.txt,sha256=
-tdfs4ds-0.2.4.3.dist-info/RECORD,,
+tdfs4ds-0.2.4.4.dist-info/METADATA,sha256=OtxHXtfMZvmAZNaX0sontwt5luQ_0__lJhxoE3XHuho,11944
+tdfs4ds-0.2.4.4.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+tdfs4ds-0.2.4.4.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
+tdfs4ds-0.2.4.4.dist-info/RECORD,,

{tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/WHEEL
File without changes

{tdfs4ds-0.2.4.3.dist-info → tdfs4ds-0.2.4.4.dist-info}/top_level.txt
File without changes