tdfs4ds 0.2.4.3__py3-none-any.whl → 0.2.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = '0.2.4.3'
+ __version__ = '0.2.4.4'
  import logging
  # Setup the logger
  logging.basicConfig(
@@ -310,7 +310,7 @@ def get_dataset_entity(dataset_id = None):
  def get_dataset_features(dataset_id = None):
      return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
 
- def run(process_id, return_dataset = False, force_compute = False):
+ def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
      """
      Executes a specific process from the feature store identified by the process ID.
      The function handles different process types and performs appropriate actions.
@@ -321,6 +321,10 @@ def run(process_id, return_dataset = False, force_compute = False):
          Default is False.
      - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
          Default is False.
+     - force_varchar_length (int, optional): To avoid multiplying feature tables when dealing with the
+         VARCHAR type, VARCHAR features are cast to VARCHAR(k x force_varchar_length),
+         where k is the smallest integer such that the original length is less than or equal
+         to k x force_varchar_length. Default is None.
 
      Returns:
      DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
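For intuition, the rounding rule this parameter applies can be sketched in a few lines of plain Python (illustrative only; rounded_varchar_length is a hypothetical helper, not part of the package):

    import math

    def rounded_varchar_length(original_length: int, force_varchar_length: int) -> int:
        # k is the smallest integer with original_length <= k * force_varchar_length
        k = math.ceil(original_length / force_varchar_length)
        return k * force_varchar_length

    rounded_varchar_length(300, 1024)   # -> 1024
    rounded_varchar_length(1500, 1024)  # -> 2048
    rounded_varchar_length(2048, 1024)  # -> 2048 (already a multiple, unchanged)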
@@ -423,7 +427,8 @@ def run(process_id, return_dataset = False, force_compute = False):
          filtermanager = filtermanager,
          entity_null_substitute = entity_null_substitute,
          process_id = process_id,
-         force_compute = force_compute
+         force_compute = force_compute,
+         force_varchar_length = force_varchar_length
      )
 
      # Handling 'tdstone2 view' process type
@@ -437,7 +442,7 @@ def run(process_id, return_dataset = False, force_compute = False):
      else:
          return
 
- def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
+ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
      """
      Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
      process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
@@ -463,7 +468,10 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
          Default is an empty dictionary.
      - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
          Default is True.
- 
+     - force_varchar_length (int, optional): To avoid multiplying feature tables when dealing with the
+         VARCHAR type, VARCHAR features are cast to VARCHAR(k x force_varchar_length),
+         where k is the smallest integer such that the original length is less than or equal
+         to k x force_varchar_length. Default is 1024.
      Returns:
      DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
          or further processing.
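A hedged usage sketch of the new parameter (the staging table, entity column, and feature names below are hypothetical; the signature is taken from the diff above):

    import teradataml as tdml
    from tdfs4ds import upload_features

    df = tdml.DataFrame('customer_features_staging')   # hypothetical source table
    dataset = upload_features(
        df,
        entity_id=['customer_id'],                     # hypothetical entity column
        feature_names=['segment_label', 'revenue'],    # hypothetical features
        force_varchar_length=1024,                     # VARCHAR lengths rounded up to multiples of 1024
    )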
@@ -575,7 +583,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
 
      try:
 
-         dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute)
+         dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute, force_varchar_length = force_varchar_length)
 
      except Exception as e:
          tdfs4ds.process_store.process_followup.followup_close(
@@ -591,7 +599,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
      else:
 
          try:
-             run(process_id=process_id, return_dataset=False)
+             run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
          except Exception as e:
              tdfs4ds.process_store.process_followup.followup_close(
                  run_id = tdfs4ds.RUN_ID,
@@ -605,7 +613,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = N
      tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
 
  def _upload_features(df, entity_id, feature_names,
-                      feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False):
+                      feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False, force_varchar_length = None):
      """
      Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
      feature registration, preparation for ingestion, and storage in the designated feature tables.
@@ -628,6 +636,11 @@ def _upload_features(df, entity_id, feature_names,
      - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
      - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
          Default is False.
+     - force_varchar_length (int, optional): To avoid multiplying feature tables when dealing with the
+         VARCHAR type, VARCHAR features are cast to VARCHAR(k x force_varchar_length),
+         where k is the smallest integer such that the original length is less than or equal
+         to k x force_varchar_length. Default is None.
+ 
 
      Returns:
      DataFrame: A DataFrame representing the dataset view created in the feature store, detailing the features and their
@@ -655,7 +668,7 @@ def _upload_features(df, entity_id, feature_names,
      from tdfs4ds.feature_store.feature_store_management import register_features
      from tdfs4ds.feature_store.feature_data_processing import prepare_feature_ingestion
      from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
-     from tdfs4ds.utils.info import get_column_types
+     from tdfs4ds.utils.info import get_column_types, update_varchar_length
 
      # Convert entity_id to a dictionary if it's not already one
      if type(entity_id) == list:
@@ -685,6 +698,10 @@ def _upload_features(df, entity_id, feature_names,
          entity_id=entity_id
      )
 
+     if force_varchar_length is not None:
+         print(feature_names_types)
+         feature_names_types = update_varchar_length(feature_names_types, new_varchar_length = force_varchar_length)
+ 
      def validate_feature_types(feature_names_types):
          """
          Validates feature data types and raises an error if any value contains
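For reference, a before/after sketch of the feature_names_types mapping that the new block rewrites (the {'type', 'id'} structure follows the surrounding code; the feature names and ids are made up):

    feature_names_types = {
        'segment_label': {'type': 'VARCHAR(300) CHARACTER SET LATIN', 'id': 12},
        'revenue':       {'type': 'FLOAT',                            'id': 13},
    }
    # update_varchar_length(feature_names_types, new_varchar_length=1024) returns:
    # {'segment_label': {'type': 'VARCHAR(1024) CHARACTER SET LATIN', 'id': 12},
    #  'revenue':       {'type': 'FLOAT',                             'id': 13}}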
tdfs4ds/feature_store/feature_store_management.py CHANGED
@@ -73,6 +73,7 @@ def feature_store_catalog_creation(if_exists='replace', comment='this table is a
 
      FEATURE_ID BIGINT,
      FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+     FEATURE_TYPE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
      FEATURE_TABLE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
      FEATURE_DATABASE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
      FEATURE_VIEW VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
@@ -410,12 +411,12 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
      # Create a DataFrame from the feature_names_types dictionary
      if len(feature_names_types.keys()) > 1:
          df = pd.DataFrame(feature_names_types).transpose().reset_index()
-         df.columns = ['FEATURE_NAME', 'TYPE', 'FEATURE_ID']
+         df.columns = ['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_ID']
      else:
          df = pd.DataFrame(columns=['FEATURE_NAME', 'TYPE', 'FEATURE_ID'])
          k = list(feature_names_types.keys())[0]
          df['FEATURE_NAME'] = [k]
-         df['TYPE'] = [feature_names_types[k]['type']]
+         df['FEATURE_TYPE'] = [feature_names_types[k]['type']]
          df['FEATURE_ID'] = [feature_names_types[k]['id']]
 
@@ -458,6 +459,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
      SELECT
          CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
      , A.FEATURE_NAME
+     , A.FEATURE_TYPE
      , A.FEATURE_TABLE
      , A.FEATURE_DATABASE
      , A.FEATURE_VIEW
@@ -476,6 +478,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
      UPDATE
      SET
          FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
+         FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
          FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
          FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
          --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified
@@ -483,6 +486,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
      INSERT
      ( UPDATED_FEATURES.FEATURE_ID
      , UPDATED_FEATURES.FEATURE_NAME
+     , UPDATED_FEATURES.FEATURE_TYPE
      , UPDATED_FEATURES.FEATURE_TABLE
      , UPDATED_FEATURES.FEATURE_DATABASE
      , UPDATED_FEATURES.FEATURE_VIEW
@@ -498,6 +502,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
      SELECT
          CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
      , A.FEATURE_NAME
+     , A.FEATURE_TYPE
      , A.FEATURE_TABLE
      , A.FEATURE_DATABASE
      , A.FEATURE_VIEW
@@ -516,6 +521,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
      UPDATE
      SET
          FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
+         FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
          FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
          FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
          --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified
@@ -523,6 +529,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
      INSERT
      ( UPDATED_FEATURES.FEATURE_ID
      , UPDATED_FEATURES.FEATURE_NAME
+     , UPDATED_FEATURES.FEATURE_TYPE
      , UPDATED_FEATURES.FEATURE_TABLE
      , UPDATED_FEATURES.FEATURE_DATABASE
      , UPDATED_FEATURES.FEATURE_VIEW
tdfs4ds/utils/info.py CHANGED
@@ -2,6 +2,8 @@ import re
 
  import tdfs4ds
  import teradataml as tdml
+ from tdfs4ds import logger
+ import numpy as np
 
  def get_column_types(df, columns):
      """
@@ -264,4 +266,40 @@ def get_feature_types_sql_format(tddf, columns = None):
      res = tdml.DataFrame.from_query(query).to_pandas()
 
      # Return column names with their corresponding SQL data types in a dictionary
-     return {c: res[c].values[0].strip() for c in columns}
+     return {c: res[c].values[0].strip() for c in columns}
+ 
+ def update_varchar_length(feature_types: dict, new_varchar_length: int) -> dict:
+     """
+     Updates the length of all VARCHAR fields in the feature_types dictionary based on an increment.
+     The new length is calculated as ceil(previous_length / new_varchar_length) * new_varchar_length,
+     so a length that is already a multiple of new_varchar_length is left unchanged.
+ 
+     Args:
+         feature_types (dict): A dictionary where keys are feature names and values are dictionaries with 'type' and 'id'.
+         new_varchar_length (int): The increment value for adjusting VARCHAR lengths.
+ 
+     Returns:
+         dict: A dictionary with updated VARCHAR lengths.
+ 
+     Issues a warning if the new length is smaller than the original length.
+     """
+     updated_feature_types = {}
+     varchar_pattern = re.compile(r'VARCHAR\((\d+)\)', re.IGNORECASE)
+ 
+     for key, value in feature_types.items():
+         type_value = value['type']
+         match = varchar_pattern.search(type_value)
+         if match:
+             original_length = int(match.group(1))
+             modified_length = int(np.ceil(original_length / new_varchar_length) * new_varchar_length)
+ 
+             if modified_length < original_length:
+                 logger.warning(f"Reducing VARCHAR length for {key} from {original_length} to {modified_length}")
+ 
+             # Replace only the VARCHAR length
+             updated_value = varchar_pattern.sub(f'VARCHAR({modified_length})', type_value)
+             updated_feature_types[key] = {'type': updated_value, 'id': value['id']}
+         else:
+             updated_feature_types[key] = value
+ 
+     return updated_feature_types
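A quick illustrative call of the new helper (assuming the imports shown in this file; the feature dictionary is made up):

    from tdfs4ds.utils.info import update_varchar_length

    features = {'comment': {'type': 'VARCHAR(1500) CHARACTER SET LATIN', 'id': 7}}
    update_varchar_length(features, new_varchar_length=1024)
    # ceil(1500 / 1024) = 2, so 'comment' becomes 'VARCHAR(2048) CHARACTER SET LATIN';
    # non-VARCHAR types pass through unchanged.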
tdfs4ds-0.2.4.3.dist-info/METADATA → tdfs4ds-0.2.4.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: tdfs4ds
- Version: 0.2.4.3
+ Version: 0.2.4.4
  Summary: A python package to simplify the usage of feature store using Teradata Vantage ...
  Author: Denis Molin
  Requires-Python: >=3.6
tdfs4ds-0.2.4.3.dist-info/RECORD → tdfs4ds-0.2.4.4.dist-info/RECORD CHANGED
@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
  tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
  tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
  tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
- tdfs4ds/__init__.py,sha256=OOakI_WdX1fjXTheqqLMUQY99apaGFXdEYg_SQpWQng,63986
+ tdfs4ds/__init__.py,sha256=BSE-ct7eaUwHsch7GGloYMSDWTb9nPLSeR-LSDZNY18,65844
  tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
  tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
  tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576
@@ -13,7 +13,7 @@ tdfs4ds/feature_store/__init__.py,sha256=a7NPCkpTx40UR5LRErwnskpABG2Vuib7F5wUjaU
  tdfs4ds/feature_store/entity_management.py,sha256=9ltytv3yCTG84NZXBpb1Tlkf9pOxvrNb0MVidU4pwvE,10157
  tdfs4ds/feature_store/feature_data_processing.py,sha256=SuJeCTJF51l9-VS9WRS0oBUnxaVqba4hqjOpsCtdVs8,42352
  tdfs4ds/feature_store/feature_query_retrieval.py,sha256=zuHRZhL6-qyLpPS7mWgRy1WingSN5iibkbi53Q7jfAs,33834
- tdfs4ds/feature_store/feature_store_management.py,sha256=RIa3ZjKBULTovEmy3KEa0M2Rn5D6LMizDVnx4Q25S6o,55724
+ tdfs4ds/feature_store/feature_store_management.py,sha256=WcgawACgC_lI880wj_FO2wV_FIp0W5WZ3x7r2-0WKdI,56121
  tdfs4ds/process_store/__init__.py,sha256=npHR_xju5ecGmWfYHDyteLwiU3x-cL4HD3sFK_th7xY,229
  tdfs4ds/process_store/process_followup.py,sha256=PvLcU7meg3ljBlPfuez3qwTVqpHHhVJxYxGqjgiHE8E,7265
  tdfs4ds/process_store/process_query_administration.py,sha256=DsIt97cBoJ7NcpQzbQt55eUFNgXGdOMm5Hh2aX5v0PY,7762
@@ -21,12 +21,12 @@ tdfs4ds/process_store/process_registration_management.py,sha256=F8VlBoL-de98KnkM
  tdfs4ds/process_store/process_store_catalog_management.py,sha256=H135RRTYn-pyWIqPVbHpuIyyvsaNrek6b1iPk8avJMI,16088
  tdfs4ds/utils/__init__.py,sha256=-yTMfDLZbQnIRQ64s_bczzT21tDW2A8FZeq9PX5SgFU,168
  tdfs4ds/utils/filter_management.py,sha256=7D47N_hnTSUVOkaV2XuKrlUFMxzWjDsCBvRYsH4lXdU,11073
- tdfs4ds/utils/info.py,sha256=lc9-rQDfM4NWnZGkSUkY_G0qYx7qnoErNKKcYMuLIRs,10554
+ tdfs4ds/utils/info.py,sha256=SQR_ec4M9-5Z4erjb9_N0n8JPY1wpelgxkw3B12D1Q4,12322
  tdfs4ds/utils/lineage.py,sha256=LI-5pG7D8lO3-YFa9qA6CrEackiYugV23_Vz9IpF5xw,28670
  tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
  tdfs4ds/utils/time_management.py,sha256=_jbwdyZH4Yr3VzbUrq6X93FpXDCDEdH0iv56vX7j8mA,8446
  tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
- tdfs4ds-0.2.4.3.dist-info/METADATA,sha256=dUqe-90oXLdYx2U6F-WmeQDHhAFN_vvZrFfVuYGmTn8,11944
- tdfs4ds-0.2.4.3.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
- tdfs4ds-0.2.4.3.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
- tdfs4ds-0.2.4.3.dist-info/RECORD,,
+ tdfs4ds-0.2.4.4.dist-info/METADATA,sha256=OtxHXtfMZvmAZNaX0sontwt5luQ_0__lJhxoE3XHuho,11944
+ tdfs4ds-0.2.4.4.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+ tdfs4ds-0.2.4.4.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
+ tdfs4ds-0.2.4.4.dist-info/RECORD,,