tdfs4ds 0.2.4.24__py3-none-any.whl → 0.2.4.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +1 -1
- tdfs4ds/dataset/dataset.py +58 -1
- tdfs4ds/feature_store/feature_store_management.py +42 -54
- {tdfs4ds-0.2.4.24.dist-info → tdfs4ds-0.2.4.26.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.24.dist-info → tdfs4ds-0.2.4.26.dist-info}/RECORD +7 -7
- {tdfs4ds-0.2.4.24.dist-info → tdfs4ds-0.2.4.26.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.24.dist-info → tdfs4ds-0.2.4.26.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED
tdfs4ds/dataset/dataset.py
CHANGED
|
@@ -43,7 +43,7 @@ class Dataset:
|
|
|
43
43
|
return getattr(self.df, item)
|
|
44
44
|
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{item}'")
|
|
45
45
|
|
|
46
|
-
def
|
|
46
|
+
def _retrieve_entities_and_features_old(self):
|
|
47
47
|
|
|
48
48
|
if self._get_dataset_type() == 'snapshot':
|
|
49
49
|
|
|
@@ -75,6 +75,63 @@ class Dataset:
|
|
|
75
75
|
logger.error(f"not implemented yet for dataset type: {self._get_dataset_type()}")
|
|
76
76
|
raise
|
|
77
77
|
|
|
78
|
+
def _retrieve_entities_and_features(self):
    """
    Split the dataset's columns into entity columns and feature columns.

    The dataset DDL (from ``self._get_ddl()``) is scanned for feature
    sub-queries of the form::

        SEQUENCED VALIDTIME SELECT ... B1.FEATURE_VALUE AS <feature_name>
        FROM <db>.<view> B1 WHERE (FEATURE_ID = <id> AND FEATURE_VERSION = '<version>')

    Every match is recorded as a feature; every other column of ``self.df``
    is reported as an entity column.

    Returns:
    - tuple ``(entity, features)`` where
        * ``entity`` maps each non-feature column name of ``self.df`` to its
          type as returned by ``get_feature_types_sql_format(self.df)``;
        * ``features`` maps each upper-cased feature name to a dict with the
          keys ``'id'`` (int), ``'version'``, ``'type'``, ``'database'`` and
          ``'view'`` (the last three upper-cased).

    Raises:
    - NotImplementedError: if the dataset type is not ``'snapshot'``.
    - KeyError: if a feature alias found in the DDL has no matching column
      type, even when compared case-insensitively.
    """
    dataset_type = self._get_dataset_type()
    if dataset_type != 'snapshot':
        message = f"not implemented yet for dataset type: {dataset_type}"
        logger.error(message)
        # A bare `raise` has no active exception to re-raise here and would
        # fail with an unrelated RuntimeError. NotImplementedError is a
        # RuntimeError subclass, so existing handlers keep working.
        raise NotImplementedError(message)

    import re

    ddl = self._get_ddl()

    # Column types from the materialized dataframe
    columns_types = get_feature_types_sql_format(self.df)

    # Upper-cased name -> actual dataframe column name, so that a DDL alias
    # whose case differs from the dataframe column can still be resolved.
    column_by_upper_name = {c.upper(): c for c in self.df.columns}

    # Regex to capture each feature subquery:
    #  - grabs feature name alias, FEATURE_ID, FEATURE_VERSION
    #  - grabs database and view/table (quoted or unquoted)
    pattern = re.compile(
        r"""
        SEQUENCED\s+VALIDTIME\s+SELECT
        .*?                                                   # anything before the feature value
        B1\.FEATURE_VALUE\s+AS\s+(?P<fname>[A-Za-z_][\w]*)    # AS <feature_name>
        \s+FROM\s+
        (?:
            "(?P<dbq>[^"]+)"\."(?P<viewq>[^"]+)"              # "DB"."VIEW"
            |
            (?P<db>[A-Za-z_]\w*)\.(?P<view>[A-Za-z_]\w*)      # DB.VIEW
        )
        \s+B1\s+WHERE\s*\(
        \s*FEATURE_ID\s*=\s*(?P<fid>\d+)\s+
        AND\s+FEATURE_VERSION\s*=\s*'(?P<fver>[^']+)'
        \s*\)
        """,
        re.IGNORECASE | re.DOTALL | re.VERBOSE
    )

    features = {}
    for m in pattern.finditer(ddl):
        fname = m.group('fname')

        # Prefer the exact alias; otherwise fall back to the dataframe column
        # that matches case-insensitively. If neither exists, the lookup
        # below raises KeyError exactly as before.
        if fname in columns_types:
            column_name = fname
        else:
            column_name = column_by_upper_name.get(fname.upper(), fname)

        features[fname.upper()] = {
            'id': int(m.group('fid')),
            'version': m.group('fver'),
            'type': columns_types[column_name].upper(),
            'database': (m.group('dbq') or m.group('db') or '').upper(),
            'view': (m.group('viewq') or m.group('view') or '').upper()
        }

    # Anything in the dataframe that isn't a feature column is an entity column.
    feature_names_upper = set(features)
    entity = {
        c: columns_types[c]
        for c in self.df.columns
        if c.upper() not in feature_names_upper
    }

    return entity, features
|
|
134
|
+
|
|
78
135
|
def _get_dataset_type(self):
|
|
79
136
|
return 'snapshot'
|
|
80
137
|
|
|
@@ -869,39 +869,6 @@ def Gettdtypes(tddf, features_columns, entity_id):
|
|
|
869
869
|
# Increment the feature ID for the next iteration.
|
|
870
870
|
feature_id += 1
|
|
871
871
|
|
|
872
|
-
# # Iterate over the data types of the columns in the DataFrame.
|
|
873
|
-
# for k, v in types.items():
|
|
874
|
-
# # If the column name does not exist in the feature catalog table and is in the list of feature column names...
|
|
875
|
-
# if k.upper() not in [n.upper() for n in existing_features] and k.upper() in [n.upper() for n in features_columns]:
|
|
876
|
-
# # If the data type of the column is integer...
|
|
877
|
-
# if 'int' in str(v.lower()):
|
|
878
|
-
# # Add an entry to the result dictionary for the column name with its data type and new feature ID.
|
|
879
|
-
# res[k] = {'type': 'BIGINT', 'id': feature_id}
|
|
880
|
-
# # If the data type of the column is float...
|
|
881
|
-
# elif 'float' in str(v.lower()):
|
|
882
|
-
# # Add an entry to the result dictionary for the column name with its data type and new feature ID.
|
|
883
|
-
# res[k] = {'type': 'FLOAT', 'id': feature_id}
|
|
884
|
-
# # If the data type of the column is varchar with unicode encoding ...
|
|
885
|
-
# elif 'unicode' in str(v.lower()):
|
|
886
|
-
# res[k] = {'type': 'VARCHAR_UNICODE', 'id': feature_id}
|
|
887
|
-
# # Print a message that the data type is not yet managed.
|
|
888
|
-
# #if tdfs4ds.DISPLAY_LOGS: print(f'{k} has a type that is not yet managed')
|
|
889
|
-
# # If the data type of the column is varchar with unicode encoding ...
|
|
890
|
-
# elif 'latin' in str(v.lower()):
|
|
891
|
-
# res[k] = {'type': 'VARCHAR_LATIN', 'id': feature_id}
|
|
892
|
-
# # Print a message that the data type is not yet managed.
|
|
893
|
-
# #if tdfs4ds.DISPLAY_LOGS: print(f'{k} has a type that is not yet managed')
|
|
894
|
-
# elif 'decimal' in str(v.lower()):
|
|
895
|
-
# res[k] = {'type': 'DECIMAL', 'id': feature_id}
|
|
896
|
-
# # Print a message that the data type is not yet managed.
|
|
897
|
-
# # if tdfs4ds.DISPLAY_LOGS: print(f'{k} has a type that is not yet managed')
|
|
898
|
-
# else:
|
|
899
|
-
# res[k] = {'type': 'VARCHAR_LATIN', 'id': feature_id}
|
|
900
|
-
# # Print a message that the data type is not yet managed.
|
|
901
|
-
# # if tdfs4ds.DISPLAY_LOGS: print(f'{k} has a type that is not yet managed')
|
|
902
|
-
# # Increment the feature ID for the next iteration.
|
|
903
|
-
# feature_id += 1
|
|
904
|
-
|
|
905
872
|
# Return the result dictionary.
|
|
906
873
|
return res
|
|
907
874
|
|
|
@@ -979,36 +946,50 @@ def tdstone2_Gettdtypes(existing_model, entity_id, display_logs=False):
|
|
|
979
946
|
# Return the dictionary containing feature names, types, and IDs.
|
|
980
947
|
return res
|
|
981
948
|
|
|
982
|
-
def delete_feature(feature_name, data_domain=None):
|
|
949
|
+
def delete_feature(feature_name, entity_id, data_domain=None):
|
|
983
950
|
"""
|
|
984
|
-
Delete the values of a specific feature from the feature table
|
|
951
|
+
Delete the values of a specific feature for given entities from the feature table
|
|
952
|
+
within a specified data domain.
|
|
985
953
|
|
|
986
954
|
This function constructs and executes two SQL queries against a Teradata database
|
|
987
|
-
to remove a feature specified by its name. The first query
|
|
988
|
-
where the feature resides, based on the feature name
|
|
989
|
-
deletes the feature from the
|
|
955
|
+
to remove a feature specified by its name and entity identifiers. The first query
|
|
956
|
+
retrieves the table name where the feature resides, based on the feature name,
|
|
957
|
+
entity, and data domain. The second query deletes the feature values from the
|
|
958
|
+
identified table.
|
|
990
959
|
|
|
991
960
|
Parameters:
|
|
992
961
|
- feature_name (str): The name of the feature to be removed.
|
|
993
|
-
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
962
|
+
- entity_id (str or list of str): Entity identifier(s). If a string is provided,
|
|
963
|
+
it will be converted to a single-element list. The list is always sorted
|
|
964
|
+
alphabetically before use.
|
|
965
|
+
- data_domain (str, optional): The data domain where the feature is located.
|
|
966
|
+
If not specified, the function uses the default data domain defined in
|
|
967
|
+
`tdfs4ds.DATA_DOMAIN`.
|
|
968
|
+
|
|
969
|
+
Behavior:
|
|
970
|
+
- The function checks if the `DEBUG_MODE` flag in the `tdfs4ds` module is set to True.
|
|
971
|
+
If so, it prints the generated SQL queries and the resolved table name for debugging.
|
|
972
|
+
- If the feature table cannot be resolved, the function returns without executing
|
|
973
|
+
a delete query.
|
|
998
974
|
|
|
999
|
-
|
|
975
|
+
Returns:
|
|
976
|
+
- None
|
|
1000
977
|
|
|
1001
978
|
Note:
|
|
1002
979
|
- The function assumes the presence of a module `tdfs4ds` with predefined constants
|
|
1003
|
-
such as `DATA_DOMAIN`, `SCHEMA`, `
|
|
980
|
+
such as `DATA_DOMAIN`, `SCHEMA`, `FEATURE_CATALOG_NAME_VIEW`, and a flag `DEBUG_MODE`.
|
|
1004
981
|
- It also assumes a `tdml` module or object with an `execute_sql` method capable of
|
|
1005
982
|
executing SQL queries against a Teradata database and fetching the results.
|
|
1006
983
|
|
|
1007
984
|
Raises:
|
|
1008
|
-
-
|
|
1009
|
-
|
|
985
|
+
- Exceptions related to SQL execution or connection issues may be raised but are not
|
|
986
|
+
explicitly handled, except for printing the error message.
|
|
1010
987
|
"""
|
|
1011
988
|
|
|
989
|
+
if isinstance(entity_id, str):
|
|
990
|
+
entity_id = [entity_id]
|
|
991
|
+
entity_id = sorted(entity_id)
|
|
992
|
+
|
|
1012
993
|
if data_domain is None:
|
|
1013
994
|
data_domain = tdfs4ds.DATA_DOMAIN
|
|
1014
995
|
|
|
@@ -1016,17 +997,19 @@ def delete_feature(feature_name, data_domain=None):
|
|
|
1016
997
|
SEL FEATURE_DATABASE||'.'||FEATURE_TABLE AS TABLE_NAME
|
|
1017
998
|
FROM {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME_VIEW}
|
|
1018
999
|
WHERE FEATURE_NAME = '{feature_name}'
|
|
1019
|
-
AND DATA_DOMAIN = '{data_domain}'
|
|
1000
|
+
AND DATA_DOMAIN = '{data_domain}'
|
|
1001
|
+
AND ENTITY_NAME = '{','.join([e.upper() for e in entity_id])}'"""
|
|
1020
1002
|
if tdfs4ds.DEBUG_MODE:
|
|
1021
1003
|
print(query0)
|
|
1022
1004
|
|
|
1023
1005
|
table_name = tdml.execute_sql(query0).fetchall()
|
|
1024
|
-
if len(table_name)>0:
|
|
1006
|
+
if len(table_name) > 0:
|
|
1025
1007
|
table_name = table_name[0][0]
|
|
1026
1008
|
else:
|
|
1027
1009
|
return
|
|
1028
1010
|
if tdfs4ds.DEBUG_MODE:
|
|
1029
1011
|
print('table name : ', table_name)
|
|
1012
|
+
|
|
1030
1013
|
query = f"""
|
|
1031
1014
|
DELETE {table_name}
|
|
1032
1015
|
WHERE FEATURE_ID = (
|
|
@@ -1044,6 +1027,7 @@ def delete_feature(feature_name, data_domain=None):
|
|
|
1044
1027
|
|
|
1045
1028
|
return
|
|
1046
1029
|
|
|
1030
|
+
|
|
1047
1031
|
def remove_feature(feature_name, entity_id, data_domain=None):
|
|
1048
1032
|
"""
|
|
1049
1033
|
Attempts to remove a specific feature from the feature catalog and any associated data,
|
|
@@ -1060,7 +1044,9 @@ def remove_feature(feature_name, entity_id, data_domain=None):
|
|
|
1060
1044
|
|
|
1061
1045
|
Parameters:
|
|
1062
1046
|
- feature_name (str): The name of the feature to be removed.
|
|
1063
|
-
- entity_id (list of str):
|
|
1047
|
+
- entity_id (str or list of str): Entity identifier(s). If a string is provided,
|
|
1048
|
+
it will be converted to a single-element list. The list is always sorted
|
|
1049
|
+
alphabetically before use.
|
|
1064
1050
|
- data_domain (str, optional): The data domain where the feature is located. If not provided,
|
|
1065
1051
|
the function uses the default data domain from the `tdfs4ds.DATA_DOMAIN` setting.
|
|
1066
1052
|
|
|
@@ -1084,16 +1070,19 @@ def remove_feature(feature_name, entity_id, data_domain=None):
|
|
|
1084
1070
|
- SQL execution or connection exceptions might occur but are not explicitly handled by this function.
|
|
1085
1071
|
"""
|
|
1086
1072
|
|
|
1073
|
+
if isinstance(entity_id, str):
|
|
1074
|
+
entity_id = [entity_id]
|
|
1075
|
+
entity_id = sorted(entity_id)
|
|
1076
|
+
|
|
1087
1077
|
if data_domain is None:
|
|
1088
1078
|
data_domain = tdfs4ds.DATA_DOMAIN
|
|
1089
1079
|
|
|
1090
1080
|
try:
|
|
1091
|
-
delete_feature(feature_name, data_domain)
|
|
1081
|
+
delete_feature(feature_name, entity_id, data_domain)
|
|
1092
1082
|
except Exception as e:
|
|
1093
1083
|
print(str(e).split('\n')[0])
|
|
1094
1084
|
return
|
|
1095
1085
|
|
|
1096
|
-
entity_id.sort()
|
|
1097
1086
|
query = f"""
|
|
1098
1087
|
NONSEQUENCED VALIDTIME DELETE {tdfs4ds.SCHEMA}.{tdfs4ds.FEATURE_CATALOG_NAME}
|
|
1099
1088
|
WHERE FEATURE_NAME = '{feature_name}'
|
|
@@ -1102,7 +1091,6 @@ def remove_feature(feature_name, entity_id, data_domain=None):
|
|
|
1102
1091
|
"""
|
|
1103
1092
|
if tdfs4ds.DEBUG_MODE:
|
|
1104
1093
|
print(query)
|
|
1094
|
+
|
|
1105
1095
|
tdml.execute_sql(query)
|
|
1106
1096
|
return
|
|
1107
|
-
|
|
1108
|
-
|
|
@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
|
|
|
2
2
|
tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
|
|
3
3
|
tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
|
|
4
4
|
tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
|
|
5
|
-
tdfs4ds/__init__.py,sha256=
|
|
5
|
+
tdfs4ds/__init__.py,sha256=_UnSzqinlnbLOM4wOTxJrT1a_qTn6mRiNHz4jE6bRaI,64168
|
|
6
6
|
tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
|
|
7
7
|
tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
|
|
8
8
|
tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576
|
|
@@ -13,13 +13,13 @@ tdfs4ds/data/logo/tdfs4ds_logo.png,sha256=OCKQnH0gQbRyupwZeiIgo-9c6mdRtjE2E2Zunr
|
|
|
13
13
|
tdfs4ds/data/logo/teradata_sym_rgb_pos.png,sha256=Zq-QzLb04PIQ4iN8C6ssaLuNVVI1Q_TqBkFx_f7aNOI,8052
|
|
14
14
|
tdfs4ds/data/logo/teradata_sym_rgb_wht_rev.png,sha256=ETznIUnS38vlHek_CzjmcjnpthfCATCp2Ww0Dx8Th3Q,7803
|
|
15
15
|
tdfs4ds/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
-
tdfs4ds/dataset/dataset.py,sha256=
|
|
16
|
+
tdfs4ds/dataset/dataset.py,sha256=J_fgfsVdR9zSOXrUOqyotqsUD-GlQMGyuld6ueov45w,7603
|
|
17
17
|
tdfs4ds/dataset/dataset_catalog.py,sha256=qxS2thDW2MvsRouSFaX1M0sX2J7IzBAYD8Yf22Tsd5k,16638
|
|
18
18
|
tdfs4ds/feature_store/__init__.py,sha256=a7NPCkpTx40UR5LRErwnskpABG2Vuib7F5wUjaUGCnI,209
|
|
19
19
|
tdfs4ds/feature_store/entity_management.py,sha256=9ltytv3yCTG84NZXBpb1Tlkf9pOxvrNb0MVidU4pwvE,10157
|
|
20
20
|
tdfs4ds/feature_store/feature_data_processing.py,sha256=rvpnFrV6Tmg8C6xcSQLT_lrFYqZsdSzFXmS-4suK9qg,42847
|
|
21
21
|
tdfs4ds/feature_store/feature_query_retrieval.py,sha256=zuHRZhL6-qyLpPS7mWgRy1WingSN5iibkbi53Q7jfAs,33834
|
|
22
|
-
tdfs4ds/feature_store/feature_store_management.py,sha256=
|
|
22
|
+
tdfs4ds/feature_store/feature_store_management.py,sha256=ufIBTdrnHBvGdXggavJoTVoZjOHFtH5ZiYqJr5eIBhg,54713
|
|
23
23
|
tdfs4ds/process_store/__init__.py,sha256=npHR_xju5ecGmWfYHDyteLwiU3x-cL4HD3sFK_th7xY,229
|
|
24
24
|
tdfs4ds/process_store/process_followup.py,sha256=PvLcU7meg3ljBlPfuez3qwTVqpHHhVJxYxGqjgiHE8E,7265
|
|
25
25
|
tdfs4ds/process_store/process_query_administration.py,sha256=DsIt97cBoJ7NcpQzbQt55eUFNgXGdOMm5Hh2aX5v0PY,7762
|
|
@@ -32,7 +32,7 @@ tdfs4ds/utils/lineage.py,sha256=gy5M42qy5fvdWmlohAY3WPYoqAyp5VakeEmeT1YjrJQ,3783
|
|
|
32
32
|
tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
|
|
33
33
|
tdfs4ds/utils/time_management.py,sha256=1eqGs7rT3SGag0F30R3PzwiC7Aa7DKia2Ud0aSNKcPg,10593
|
|
34
34
|
tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
|
|
35
|
-
tdfs4ds-0.2.4.
|
|
36
|
-
tdfs4ds-0.2.4.
|
|
37
|
-
tdfs4ds-0.2.4.
|
|
38
|
-
tdfs4ds-0.2.4.
|
|
35
|
+
tdfs4ds-0.2.4.26.dist-info/METADATA,sha256=15eq8Z08VdFjD-GXC2cLqGvfb8OQoDRi3oPlmTyiq00,14326
|
|
36
|
+
tdfs4ds-0.2.4.26.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
|
|
37
|
+
tdfs4ds-0.2.4.26.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
|
|
38
|
+
tdfs4ds-0.2.4.26.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|