tdfs4ds 0.2.4.2__py3-none-any.whl → 0.2.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +69 -17
- tdfs4ds/feature_store/feature_store_management.py +9 -2
- tdfs4ds/utils/info.py +48 -3
- {tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.4.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.4.dist-info}/RECORD +7 -7
- {tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.4.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.4.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-__version__ = '0.2.4.2'
+__version__ = '0.2.4.4'
 import logging
 # Setup the logger
 logging.basicConfig(
@@ -11,6 +11,7 @@ logger = logging.getLogger(__name__)
 
 from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
 from tdfs4ds.process_store.process_followup import follow_up_report
+from tdfs4ds.dataset.dataset_catalog import DatasetCatalog, Dataset
 
 DATA_DOMAIN = None
 SCHEMA = None
@@ -19,6 +20,7 @@ FEATURE_CATALOG_NAME_VIEW = 'FS_V_FEATURE_CATALOG'
 PROCESS_CATALOG_NAME = 'FS_PROCESS_CATALOG'
 PROCESS_CATALOG_NAME_VIEW = 'FS_V_PROCESS_CATALOG'
 PROCESS_CATALOG_NAME_VIEW_FEATURE_SPLIT = 'FS_V_PROCESS_CATALOG_FEATURE_SPLIT'
+DATASET_CATALOG_NAME = 'FS_DATASET'
 
 DATA_DISTRIBUTION_NAME = 'FS_DATA_DISTRIBUTION'
 FOLLOW_UP_NAME = 'FS_FOLLOW_UP'
@@ -125,6 +127,8 @@ def setup(database, if_exists='fail'):
         tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
     except Exception as e:
         print(str(e).split('\n')[0])
+
+    DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
     try:
         tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
         print('feature catalog table: ', tdfs4ds.FEATURE_CATALOG_NAME, ' in database ', database)
@@ -146,18 +150,22 @@ def setup(database, if_exists='fail'):
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
+    dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
+    if not dataset_catalog._exists():
+        dataset_catalog.create_catalog()
 
     return
 
 def connect(
-        database,
-        feature_catalog_name,
-        process_catalog_name,
-        data_distribution_name,
-        filter_manager_name,
-        followup_name,
+        database = tdfs4ds.SCHEMA,
+        feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
+        process_catalog_name = tdfs4ds.PROCESS_CATALOG_NAME,
+        data_distribution_name = tdfs4ds.DATA_DISTRIBUTION_NAME,
+        filter_manager_name = tdfs4ds.FILTER_MANAGER_NAME,
+        followup_name = tdfs4ds.FOLLOW_UP_NAME,
         feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
-        process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW
+        process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
+        dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME
 ):
     """
     Configures the database environment by setting schema names and checking the existence of specified catalog tables.
```
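All parameters of `connect()` now default to the module-level constants, so an established configuration can be reused without repeating every catalog name. Note that Python evaluates default values once at import time, so `database` (which defaults to the import-time value of `tdfs4ds.SCHEMA`) is best passed explicitly. A minimal usage sketch, with placeholder connection details:

```python
import teradataml as tdml
import tdfs4ds

# Placeholder credentials; replace with your environment.
tdml.create_context(host="tdhost", username="user", password="***")

# Catalog names fall back to the module defaults
# ('FS_PROCESS_CATALOG', the new 'FS_DATASET', ...).
tdfs4ds.connect(database="MY_FEATURE_STORE_DB")
```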
```diff
@@ -197,7 +205,8 @@ def connect(
     distrib_exists = data_distribution_name.lower() in tables
     filter_manager_exists = filter_manager_name.lower() in tables
     followup_name_exists = followup_name.lower() in tables
-
+
+
     if followup_name_exists:
         tdfs4ds.FOLLOW_UP_NAME = followup_name
     else:
@@ -211,6 +220,7 @@ def connect(
     tdfs4ds.FILTER_MANAGER_NAME = filter_manager_name
     tdfs4ds.PROCESS_CATALOG_NAME_VIEW = process_catalog_name_view
     tdfs4ds.FEATURE_CATALOG_NAME_VIEW = feature_catalog_name_view
+
 
     process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
     if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
@@ -237,6 +247,11 @@ def connect(
     def is_data_distribution_temporal():
         return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
                                                          schema_name=tdfs4ds.SCHEMA, object_type='table')
+
+    tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
+    dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
+    if not dataset_catalog._exists():
+        dataset_catalog.create_catalog()
 
     if is_data_distribution_temporal():
         tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = True
@@ -279,9 +294,23 @@ def process_catalog():
     """
     return tdfs4ds.process_store.process_query_administration.list_processes()
 
+def dataset_catalog():
+    """
+    Retrieve a list of all datasets registered in the dataset store.
+
+    This function performs a query against the dataset store to gather a list of all
+    datasets that have been registered and are administrable.
 
+    """
+    return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).catalog
+
+def get_dataset_entity(dataset_id = None):
+    return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_entity(dataset_id)
 
-def run(process_id, return_dataset = False, force_compute = False):
+def get_dataset_features(dataset_id = None):
+    return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
+
+def run(process_id, return_dataset = False, force_compute = False, force_varchar_length = None):
     """
     Executes a specific process from the feature store identified by the process ID.
     The function handles different process types and performs appropriate actions.
```
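The three new module-level helpers are thin wrappers around `DatasetCatalog`, so dataset lookups are available straight from the package namespace once `setup()` or `connect()` has run. A usage sketch (the dataset id is a placeholder value taken from the catalog listing):

```python
import tdfs4ds

# Full dataset catalog, backed by the FS_DATASET table.
print(tdfs4ds.dataset_catalog())

# Entity and feature definitions of one registered dataset;
# "my_dataset_id" is a placeholder.
entity = tdfs4ds.get_dataset_entity(dataset_id="my_dataset_id")
features = tdfs4ds.get_dataset_features(dataset_id="my_dataset_id")
```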
```diff
@@ -292,6 +321,10 @@ def run(process_id, return_dataset = False, force_compute = False):
                              Default is False.
     - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
                              Default is False.
+    - force_varchar_length (int, optional): to avoid the multiplication of feature tables when dealing with the
+                             VARCHAR type, casts the VARCHAR features into VARCHAR(k x force_varchar_length),
+                             where k is the smallest integer such that the original length is smaller than or
+                             equal to k x force_varchar_length. Default is None.
 
     Returns:
     DataFrame or None: If return_dataset is True, returns the dataset created during the process. Otherwise, returns None.
```
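The rounding rule in the docstring is easiest to see numerically: the target length is the original length rounded up to the next multiple of `force_varchar_length`. A standalone sketch of the arithmetic (the helper name is hypothetical; the package implements this inside `update_varchar_length` in `tdfs4ds/utils/info.py`):

```python
import math

def rounded_varchar_length(original_length: int, force_varchar_length: int) -> int:
    # k is the smallest integer with original_length <= k * force_varchar_length
    k = math.ceil(original_length / force_varchar_length)
    return k * force_varchar_length

assert rounded_varchar_length(30, 1024) == 1024    # VARCHAR(30)   -> VARCHAR(1024)
assert rounded_varchar_length(1500, 1024) == 2048  # VARCHAR(1500) -> VARCHAR(2048)
assert rounded_varchar_length(1024, 1024) == 1024  # exact multiple stays unchanged
```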
```diff
@@ -394,7 +427,8 @@ def run(process_id, return_dataset = False, force_compute = False):
             filtermanager = filtermanager,
             entity_null_substitute = entity_null_substitute,
             process_id = process_id,
-            force_compute= force_compute
+            force_compute= force_compute,
+            force_varchar_length = force_varchar_length
         )
 
     # Handling 'tdstone2 view' process type
@@ -408,7 +442,7 @@ def run(process_id, return_dataset = False, force_compute = False):
     else:
         return
 
-def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
+def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True, force_varchar_length = 1024):
     """
     Uploads feature data from a DataFrame to the feature store for a specified entity. This involves registering the
     process in the feature store, executing the necessary SQL to insert the data, and returning the resulting dataset
@@ -434,7 +468,10 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
                              Default is an empty dictionary.
     - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
                              Default is True.
-
+    - force_varchar_length (int, optional): to avoid the multiplication of feature tables when dealing with the
+                             VARCHAR type, casts the VARCHAR features into VARCHAR(k x force_varchar_length),
+                             where k is the smallest integer such that the original length is smaller than or
+                             equal to k x force_varchar_length. Default is 1024.
     Returns:
     DataFrame: A DataFrame representing the dataset resulting from the upload process, typically used for validation
                or further processing.
```
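With the new `force_varchar_length=1024` default, VARCHAR features of slightly different declared widths are widened to the same multiple of 1024 and can therefore share one feature table instead of spawning a table per width. A usage sketch (table and column names are placeholders):

```python
import teradataml as tdml
import tdfs4ds

df = tdml.DataFrame("customer_features")  # placeholder source table

dataset = tdfs4ds.upload_features(
    df=df,
    entity_id=["customer_id"],
    feature_names=["segment", "lifetime_value"],
    force_varchar_length=1024,  # default; VARCHARs rounded up to multiples of 1024
)
```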
```diff
@@ -546,7 +583,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
 
     try:
 
-        dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute)
+        dataset = run(process_id=process_id, return_dataset=True, force_compute = force_compute, force_varchar_length = force_varchar_length)
 
     except Exception as e:
         tdfs4ds.process_store.process_followup.followup_close(
@@ -562,7 +599,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
     else:
 
         try:
-            run(process_id=process_id, return_dataset=False)
+            run(process_id=process_id, return_dataset=False, force_compute = force_compute, force_varchar_length = force_varchar_length)
         except Exception as e:
             tdfs4ds.process_store.process_followup.followup_close(
                 run_id = tdfs4ds.RUN_ID,
@@ -576,7 +613,7 @@ def upload_features(df, entity_id, feature_names, metadata={}, primary_index = None, partitioning = '', filtermanager = None, entity_null_substitute = {}, force_compute = True):
     tdfs4ds.PROCESS_TYPE = PROCESS_TYPE
 
 def _upload_features(df, entity_id, feature_names,
-                     feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False):
+                     feature_versions=FEATURE_VERSION_DEFAULT, primary_index = None, partitioning = '', filtermanager=None, entity_null_substitute={}, process_id = None, force_compute = False, force_varchar_length = None):
     """
     Uploads features from a DataFrame to the feature store, handling entity registration, feature type determination,
     feature registration, preparation for ingestion, and storage in the designated feature tables.
@@ -599,6 +636,11 @@ def _upload_features(df, entity_id, feature_names,
     - process_id (str, optional): An identifier for the process, used for tracking and follow-up. Default is None.
     - force_compute (bool, optional): A flag indicating whether to force computation even if data already exists.
                              Default is False.
+    - force_varchar_length (int, optional): to avoid the multiplication of feature tables when dealing with the
+                             VARCHAR type, casts the VARCHAR features into VARCHAR(k x force_varchar_length),
+                             where k is the smallest integer such that the original length is smaller than or
+                             equal to k x force_varchar_length. Default is None.
+
 
     Returns:
     DataFrame: A DataFrame representing the dataset view created in the feature store, detailing the features and their
@@ -626,7 +668,7 @@ def _upload_features(df, entity_id, feature_names,
     from tdfs4ds.feature_store.feature_store_management import register_features
     from tdfs4ds.feature_store.feature_data_processing import prepare_feature_ingestion
     from tdfs4ds.feature_store.feature_data_processing import store_feature, apply_collect_stats
-    from tdfs4ds.utils.info import get_column_types
+    from tdfs4ds.utils.info import get_column_types, update_varchar_length
 
     # Convert entity_id to a dictionary if it's not already one
     if type(entity_id) == list:
@@ -656,6 +698,10 @@ def _upload_features(df, entity_id, feature_names,
         entity_id=entity_id
     )
 
+    if force_varchar_length is not None:
+        print(feature_names_types)
+        feature_names_types = update_varchar_length(feature_names_types, new_varchar_length = force_varchar_length)
+
     def validate_feature_types(feature_names_types):
         """
         Validates feature data types and raises an error if any value contains
@@ -1064,6 +1110,12 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
         tdfs4ds.logger.info(f"Adding a comment to the view {view_name} in the {schema_name} database.")
         tdml.execute_sql(f"COMMENT ON VIEW {schema_name}.{view_name} IS '{comment}'")
 
+    # build the dataset object
+    tdfs4ds.logger.info(f"Creation of the dataset object.")
+    dataset = Dataset(view_name=view_name, schema_name=schema_name)
+    tdfs4ds.logger.info(f"Registering of the dataset in the dataset catalog.")
+    DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).add_dataset(dataset=dataset)
+
     # Return the query or the DataFrame based on the `return_query` flag
     if return_query:
         tdfs4ds.logger.info("Returning the generated dataset query.")
```
tdfs4ds/feature_store/feature_store_management.py
CHANGED

```diff
@@ -73,6 +73,7 @@ def feature_store_catalog_creation(if_exists='replace', comment='this table is a
 
     FEATURE_ID BIGINT,
     FEATURE_NAME VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
+    FEATURE_TYPE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
     FEATURE_TABLE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
     FEATURE_DATABASE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
     FEATURE_VIEW VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
@@ -410,12 +411,12 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     # Create a DataFrame from the feature_names_types dictionary
     if len(feature_names_types.keys()) > 1:
         df = pd.DataFrame(feature_names_types).transpose().reset_index()
-        df.columns = ['FEATURE_NAME', 'TYPE', 'FEATURE_ID']
+        df.columns = ['FEATURE_NAME', 'FEATURE_TYPE', 'FEATURE_ID']
     else:
         df = pd.DataFrame(columns=['FEATURE_NAME', 'TYPE', 'FEATURE_ID'])
         k = list(feature_names_types.keys())[0]
         df['FEATURE_NAME'] = [k]
-        df['TYPE'] = [feature_names_types[k]['type']]
+        df['FEATURE_TYPE'] = [feature_names_types[k]['type']]
         df['FEATURE_ID'] = [feature_names_types[k]['id']]
 
 
@@ -458,6 +459,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     SELECT
     CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
     , A.FEATURE_NAME
+    , A.FEATURE_TYPE
     , A.FEATURE_TABLE
     , A.FEATURE_DATABASE
     , A.FEATURE_VIEW
@@ -476,6 +478,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     UPDATE
     SET
         FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
+        FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
         FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
         FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
         --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified
@@ -483,6 +486,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     INSERT
     ( UPDATED_FEATURES.FEATURE_ID
     , UPDATED_FEATURES.FEATURE_NAME
+    , UPDATED_FEATURES.FEATURE_TYPE
     , UPDATED_FEATURES.FEATURE_TABLE
     , UPDATED_FEATURES.FEATURE_DATABASE
     , UPDATED_FEATURES.FEATURE_VIEW
@@ -498,6 +502,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     SELECT
     CASE WHEN B.FEATURE_ID IS NULL THEN A.FEATURE_ID ELSE B.FEATURE_ID END AS FEATURE_ID
     , A.FEATURE_NAME
+    , A.FEATURE_TYPE
     , A.FEATURE_TABLE
     , A.FEATURE_DATABASE
     , A.FEATURE_VIEW
@@ -516,6 +521,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     UPDATE
     SET
         FEATURE_TABLE = UPDATED_FEATURES.FEATURE_TABLE,
+        FEATURE_TYPE = UPDATED_FEATURES.FEATURE_TYPE,
         FEATURE_DATABASE = UPDATED_FEATURES.FEATURE_DATABASE,
         FEATURE_VIEW = UPDATED_FEATURES.FEATURE_VIEW
         --,ENTITY_NAME = UPDATED_FEATURES.ENTITY_NAME -- modified
@@ -523,6 +529,7 @@ def _register_features_merge(entity_id, feature_names_types, primary_index=None,
     INSERT
     ( UPDATED_FEATURES.FEATURE_ID
     , UPDATED_FEATURES.FEATURE_NAME
+    , UPDATED_FEATURES.FEATURE_TYPE
     , UPDATED_FEATURES.FEATURE_TABLE
     , UPDATED_FEATURES.FEATURE_DATABASE
     , UPDATED_FEATURES.FEATURE_VIEW
```
tdfs4ds/utils/info.py
CHANGED

```diff
@@ -2,6 +2,8 @@ import re
 
 import tdfs4ds
 import teradataml as tdml
+from tdfs4ds import logger
+import numpy as np
 
 def get_column_types(df, columns):
     """
@@ -47,7 +49,7 @@
 
 
 
-def get_column_types_simple(df, columns):
+def get_column_types_simple(df, columns = None):
     """
     Retrieve simplified column types for specified columns from a DataFrame.
 
@@ -71,6 +73,9 @@ def get_column_types_simple(df, columns):
     """
 
     # Ensure that the columns parameter is in list format
+    if columns is None:
+        columns = df.columns
+
     if type(columns) != list:
         columns = [columns]
 
@@ -193,7 +198,7 @@ def generate_partitioning_clause(partitioning):
                     {partitioning}
                 )"""
 
-def get_feature_types_sql_format(tddf, columns):
+def get_feature_types_sql_format(tddf, columns = None):
     """
     Retrieve the SQL data types of specified columns from a Teradata dataframe.
 
@@ -228,6 +233,10 @@
         'programming': 'VARCHAR(30)',
         'admitted': 'INTEGER'}
     """
+
+    if columns is None:
+        columns = tddf.columns
+
     # Validate inputs
     if not isinstance(tddf, tdml.DataFrame):
         raise TypeError("tddf must be an instance of tdml.DataFrame")
@@ -257,4 +266,40 @@
     res = tdml.DataFrame.from_query(query).to_pandas()
 
     # Return column names with their corresponding SQL data types in a dictionary
-    return {c: res[c].values[0].strip() for c in columns}
+    return {c: res[c].values[0].strip() for c in columns}
+
+def update_varchar_length(feature_types: dict, new_varchar_length: int) -> dict:
+    """
+    Updates the length of all VARCHAR fields in the feature_types dictionary based on an increment.
+    The new length is calculated as ceil(previous_length / new_varchar_length) * new_varchar_length,
+    ensuring that when new_varchar_length is equal to the current length, no change occurs.
+
+    Args:
+        feature_types (dict): A dictionary where keys are feature names and values are dictionaries with 'type' and 'id'.
+        new_varchar_length (int): The increment value for adjusting VARCHAR lengths.
+
+    Returns:
+        dict: A dictionary with updated VARCHAR lengths.
+
+    Issues a warning if the new length is smaller than the original length.
+    """
+    updated_feature_types = {}
+    varchar_pattern = re.compile(r'VARCHAR\((\d+)\)', re.IGNORECASE)
+
+    for key, value in feature_types.items():
+        type_value = value['type']
+        match = varchar_pattern.search(type_value)
+        if match:
+            original_length = int(match.group(1))
+            modified_length = int(np.ceil(original_length / new_varchar_length) * new_varchar_length)
+
+            if modified_length < original_length:
+                logger.warning(f"Reducing VARCHAR length for {key} from {original_length} to {modified_length}")
+
+            # Replace only the VARCHAR length
+            updated_value = varchar_pattern.sub(f'VARCHAR({modified_length})', type_value)
+            updated_feature_types[key] = {'type': updated_value, 'id': value['id']}
+        else:
+            updated_feature_types[key] = value
+
+    return updated_feature_types
```
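For reference, a quick sketch of what the new helper does to a `feature_names_types` mapping; the feature names here are hypothetical, while the `{'type', 'id'}` structure matches the docstring above:

```python
from tdfs4ds.utils.info import update_varchar_length

feature_types = {
    "segment":        {"type": "VARCHAR(30)",   "id": 1},
    "comment_text":   {"type": "VARCHAR(1500)", "id": 2},
    "lifetime_value": {"type": "FLOAT",         "id": 3},  # non-VARCHAR: returned unchanged
}

print(update_varchar_length(feature_types, new_varchar_length=1024))
# {'segment':        {'type': 'VARCHAR(1024)', 'id': 1},
#  'comment_text':   {'type': 'VARCHAR(2048)', 'id': 2},
#  'lifetime_value': {'type': 'FLOAT',         'id': 3}}
```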
{tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.4.dist-info}/RECORD
CHANGED

```diff
@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
 tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
 tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
 tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
-tdfs4ds/__init__.py,sha256=
+tdfs4ds/__init__.py,sha256=BSE-ct7eaUwHsch7GGloYMSDWTb9nPLSeR-LSDZNY18,65844
 tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
 tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
 tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576
@@ -13,7 +13,7 @@ tdfs4ds/feature_store/__init__.py,sha256=a7NPCkpTx40UR5LRErwnskpABG2Vuib7F5wUjaU
 tdfs4ds/feature_store/entity_management.py,sha256=9ltytv3yCTG84NZXBpb1Tlkf9pOxvrNb0MVidU4pwvE,10157
 tdfs4ds/feature_store/feature_data_processing.py,sha256=SuJeCTJF51l9-VS9WRS0oBUnxaVqba4hqjOpsCtdVs8,42352
 tdfs4ds/feature_store/feature_query_retrieval.py,sha256=zuHRZhL6-qyLpPS7mWgRy1WingSN5iibkbi53Q7jfAs,33834
-tdfs4ds/feature_store/feature_store_management.py,sha256=
+tdfs4ds/feature_store/feature_store_management.py,sha256=WcgawACgC_lI880wj_FO2wV_FIp0W5WZ3x7r2-0WKdI,56121
 tdfs4ds/process_store/__init__.py,sha256=npHR_xju5ecGmWfYHDyteLwiU3x-cL4HD3sFK_th7xY,229
 tdfs4ds/process_store/process_followup.py,sha256=PvLcU7meg3ljBlPfuez3qwTVqpHHhVJxYxGqjgiHE8E,7265
 tdfs4ds/process_store/process_query_administration.py,sha256=DsIt97cBoJ7NcpQzbQt55eUFNgXGdOMm5Hh2aX5v0PY,7762
@@ -21,12 +21,12 @@ tdfs4ds/process_store/process_registration_management.py,sha256=F8VlBoL-de98KnkM
 tdfs4ds/process_store/process_store_catalog_management.py,sha256=H135RRTYn-pyWIqPVbHpuIyyvsaNrek6b1iPk8avJMI,16088
 tdfs4ds/utils/__init__.py,sha256=-yTMfDLZbQnIRQ64s_bczzT21tDW2A8FZeq9PX5SgFU,168
 tdfs4ds/utils/filter_management.py,sha256=7D47N_hnTSUVOkaV2XuKrlUFMxzWjDsCBvRYsH4lXdU,11073
-tdfs4ds/utils/info.py,sha256=
+tdfs4ds/utils/info.py,sha256=SQR_ec4M9-5Z4erjb9_N0n8JPY1wpelgxkw3B12D1Q4,12322
 tdfs4ds/utils/lineage.py,sha256=LI-5pG7D8lO3-YFa9qA6CrEackiYugV23_Vz9IpF5xw,28670
 tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
 tdfs4ds/utils/time_management.py,sha256=_jbwdyZH4Yr3VzbUrq6X93FpXDCDEdH0iv56vX7j8mA,8446
 tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
-tdfs4ds-0.2.4.2.dist-info/METADATA,sha256=
-tdfs4ds-0.2.4.2.dist-info/WHEEL,sha256=
-tdfs4ds-0.2.4.2.dist-info/top_level.txt,sha256=
-tdfs4ds-0.2.4.2.dist-info/RECORD,,
+tdfs4ds-0.2.4.4.dist-info/METADATA,sha256=OtxHXtfMZvmAZNaX0sontwt5luQ_0__lJhxoE3XHuho,11944
+tdfs4ds-0.2.4.4.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+tdfs4ds-0.2.4.4.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
+tdfs4ds-0.2.4.4.dist-info/RECORD,,
```

{tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.4.dist-info}/WHEEL
File without changes

{tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.4.dist-info}/top_level.txt
File without changes