tdfs4ds-0.2.4.2-py3-none-any.whl → tdfs4ds-0.2.4.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +44 -9
- tdfs4ds/utils/info.py +9 -2
- {tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.3.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.3.dist-info}/RECORD +6 -6
- {tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.3.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.3.dist-info}/top_level.txt +0 -0
tdfs4ds/__init__.py
CHANGED

@@ -1,4 +1,4 @@
-__version__ = '0.2.4.2'
+__version__ = '0.2.4.3'
 import logging
 # Setup the logger
 logging.basicConfig(
@@ -11,6 +11,7 @@ logger = logging.getLogger(__name__)
 
 from tdfs4ds.feature_store.feature_query_retrieval import get_available_entity_id_records, write_where_clause_filter
 from tdfs4ds.process_store.process_followup import follow_up_report
+from tdfs4ds.dataset.dataset_catalog import DatasetCatalog, Dataset
 
 DATA_DOMAIN = None
 SCHEMA = None
@@ -19,6 +20,7 @@ FEATURE_CATALOG_NAME_VIEW = 'FS_V_FEATURE_CATALOG'
 PROCESS_CATALOG_NAME = 'FS_PROCESS_CATALOG'
 PROCESS_CATALOG_NAME_VIEW = 'FS_V_PROCESS_CATALOG'
 PROCESS_CATALOG_NAME_VIEW_FEATURE_SPLIT = 'FS_V_PROCESS_CATALOG_FEATURE_SPLIT'
+DATASET_CATALOG_NAME = 'FS_DATASET'
 
 DATA_DISTRIBUTION_NAME = 'FS_DATA_DISTRIBUTION'
 FOLLOW_UP_NAME = 'FS_FOLLOW_UP'
@@ -125,6 +127,8 @@ def setup(database, if_exists='fail'):
         tdml.db_drop_table(table_name = tdfs4ds.DATA_DISTRIBUTION_NAME, schema_name=database)
     except Exception as e:
         print(str(e).split('\n')[0])
+
+    DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME).drop_catalog()
     try:
         tdfs4ds.FEATURE_CATALOG_NAME = feature_store_catalog_creation()
         print('feature catalog table: ', tdfs4ds.FEATURE_CATALOG_NAME, ' in database ', database)
@@ -146,18 +150,22 @@ def setup(database, if_exists='fail'):
 
     tdfs4ds.feature_store.feature_store_management.feature_store_catalog_view_creation()
     tdfs4ds.process_store.process_store_catalog_management.process_store_catalog_view_creation()
+    dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
+    if not dataset_catalog._exists():
+        dataset_catalog.create_catalog()
 
     return
 
 def connect(
-    database
-    feature_catalog_name
-    process_catalog_name
-    data_distribution_name
-    filter_manager_name
-    followup_name
+    database = tdfs4ds.SCHEMA,
+    feature_catalog_name = tdfs4ds.FEATURE_CATALOG_NAME,
+    process_catalog_name = tdfs4ds.PROCESS_CATALOG_NAME,
+    data_distribution_name = tdfs4ds.DATA_DISTRIBUTION_NAME,
+    filter_manager_name = tdfs4ds.FILTER_MANAGER_NAME,
+    followup_name = tdfs4ds.FOLLOW_UP_NAME,
     feature_catalog_name_view = tdfs4ds.FEATURE_CATALOG_NAME_VIEW,
-    process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW
+    process_catalog_name_view = tdfs4ds.PROCESS_CATALOG_NAME_VIEW,
+    dataset_catalog_name = tdfs4ds.DATASET_CATALOG_NAME
 ):
     """
     Configures the database environment by setting schema names and checking the existence of specified catalog tables.
@@ -197,7 +205,8 @@ def connect(
     distrib_exists = data_distribution_name.lower() in tables
     filter_manager_exists = filter_manager_name.lower() in tables
    followup_name_exists = followup_name.lower() in tables
-
+
+
     if followup_name_exists:
         tdfs4ds.FOLLOW_UP_NAME = followup_name
     else:
@@ -211,6 +220,7 @@ def connect(
     tdfs4ds.FILTER_MANAGER_NAME = filter_manager_name
     tdfs4ds.PROCESS_CATALOG_NAME_VIEW = process_catalog_name_view
     tdfs4ds.FEATURE_CATALOG_NAME_VIEW = feature_catalog_name_view
+
 
     process_list = tdml.DataFrame(tdml.in_schema(database, process_catalog_name))
     if 'ENTITY_NULL_SUBSTITUTE' not in process_list.columns:
@@ -237,6 +247,11 @@ def connect(
     def is_data_distribution_temporal():
         return 'PERIOD' in tdfs4ds.utils.lineage.get_ddl(view_name=tdfs4ds.DATA_DISTRIBUTION_NAME,
                                                          schema_name=tdfs4ds.SCHEMA, object_type='table')
+
+    tdfs4ds.DATASET_CATALOG_NAME = dataset_catalog_name
+    dataset_catalog = DatasetCatalog(schema_name=database, name=tdfs4ds.DATASET_CATALOG_NAME)
+    if not dataset_catalog._exists():
+        dataset_catalog.create_catalog()
 
     if is_data_distribution_temporal():
         tdfs4ds.DATA_DISTRIBUTION_TEMPORAL = True
@@ -279,7 +294,21 @@ def process_catalog():
     """
     return tdfs4ds.process_store.process_query_administration.list_processes()
 
+def dataset_catalog():
+    """
+    Retrieve a list of all datasets registered in the dataset store.
+
+    This function performs a query against the dataset store to gather a list of all
+    datasets that have been registered and are administrable.
+
+    """
+    return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).catalog
+
+def get_dataset_entity(dataset_id = None):
+    return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_entity(dataset_id)
 
+def get_dataset_features(dataset_id = None):
+    return DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).get_dataset_features(dataset_id)
 
 def run(process_id, return_dataset = False, force_compute = False):
     """
@@ -1064,6 +1093,12 @@ def build_dataset(entity_id, selected_features, view_name, schema_name=None, com
     tdfs4ds.logger.info(f"Adding a comment to the view {view_name} in the {schema_name} database.")
     tdml.execute_sql(f"COMMENT ON VIEW {schema_name}.{view_name} IS '{comment}'")
 
+    # build the dataset object
+    tdfs4ds.logger.info(f"Creation of the dataset object.")
+    dataset = Dataset(view_name=view_name, schema_name=schema_name)
+    tdfs4ds.logger.info(f"Registering of the dataset in the dataset catalog.")
+    DatasetCatalog(schema_name=tdfs4ds.SCHEMA, name=tdfs4ds.DATASET_CATALOG_NAME).add_dataset(dataset=dataset)
+
     # Return the query or the DataFrame based on the `return_query` flag
     if return_query:
         tdfs4ds.logger.info("Returning the generated dataset query.")
tdfs4ds/utils/info.py
CHANGED

@@ -47,7 +47,7 @@ def get_column_types(df, columns):
 
 
 
-def get_column_types_simple(df, columns):
+def get_column_types_simple(df, columns = None):
     """
     Retrieve simplified column types for specified columns from a DataFrame.
 
@@ -71,6 +71,9 @@ def get_column_types_simple(df, columns):
     """
 
     # Ensure that the columns parameter is in list format
+    if columns is None:
+        columns = df.columns
+
     if type(columns) != list:
         columns = [columns]
 
@@ -193,7 +196,7 @@ def generate_partitioning_clause(partitioning):
     {partitioning}
     )"""
 
-def get_feature_types_sql_format(tddf, columns):
+def get_feature_types_sql_format(tddf, columns = None):
     """
     Retrieve the SQL data types of specified columns from a Teradata dataframe.
 
@@ -228,6 +231,10 @@ def get_feature_types_sql_format(tddf, columns):
         'programming': 'VARCHAR(30)',
         'admitted': 'INTEGER'}
     """
+
+    if columns is None:
+        columns = tddf.columns
+
     # Validate inputs
     if not isinstance(tddf, tdml.DataFrame):
         raise TypeError("tddf must be an instance of tdml.DataFrame")
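In `utils/info.py`, the `columns` argument becomes optional in both helpers and falls back to every column of the passed DataFrame. A short sketch of the new call pattern, assuming `df` is an existing `tdml.DataFrame` (the table and column names are placeholders):

```python
import teradataml as tdml
from tdfs4ds.utils.info import get_column_types_simple, get_feature_types_sql_format

df = tdml.DataFrame('MY_TABLE')                 # placeholder table name

# Previously a column (or list of columns) had to be passed explicitly:
subset_types = get_column_types_simple(df, columns=['MY_COLUMN'])

# As of 0.2.4.3 the argument can be omitted and all columns are used:
all_types = get_column_types_simple(df)
all_sql_types = get_feature_types_sql_format(df)
```

`get_feature_types_sql_format` still validates that its first argument is a `tdml.DataFrame`, so behaviour only changes when `columns` is left out.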
{tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.3.dist-info}/RECORD
CHANGED

@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
 tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
 tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
 tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
-tdfs4ds/__init__.py,sha256=
+tdfs4ds/__init__.py,sha256=OOakI_WdX1fjXTheqqLMUQY99apaGFXdEYg_SQpWQng,63986
 tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
 tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
 tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576
@@ -21,12 +21,12 @@ tdfs4ds/process_store/process_registration_management.py,sha256=F8VlBoL-de98KnkM
 tdfs4ds/process_store/process_store_catalog_management.py,sha256=H135RRTYn-pyWIqPVbHpuIyyvsaNrek6b1iPk8avJMI,16088
 tdfs4ds/utils/__init__.py,sha256=-yTMfDLZbQnIRQ64s_bczzT21tDW2A8FZeq9PX5SgFU,168
 tdfs4ds/utils/filter_management.py,sha256=7D47N_hnTSUVOkaV2XuKrlUFMxzWjDsCBvRYsH4lXdU,11073
-tdfs4ds/utils/info.py,sha256=
+tdfs4ds/utils/info.py,sha256=lc9-rQDfM4NWnZGkSUkY_G0qYx7qnoErNKKcYMuLIRs,10554
 tdfs4ds/utils/lineage.py,sha256=LI-5pG7D8lO3-YFa9qA6CrEackiYugV23_Vz9IpF5xw,28670
 tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
 tdfs4ds/utils/time_management.py,sha256=_jbwdyZH4Yr3VzbUrq6X93FpXDCDEdH0iv56vX7j8mA,8446
 tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
-tdfs4ds-0.2.4.
-tdfs4ds-0.2.4.
-tdfs4ds-0.2.4.
-tdfs4ds-0.2.4.
+tdfs4ds-0.2.4.3.dist-info/METADATA,sha256=dUqe-90oXLdYx2U6F-WmeQDHhAFN_vvZrFfVuYGmTn8,11944
+tdfs4ds-0.2.4.3.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+tdfs4ds-0.2.4.3.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
+tdfs4ds-0.2.4.3.dist-info/RECORD,,
{tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.3.dist-info}/WHEEL
File without changes

{tdfs4ds-0.2.4.2.dist-info → tdfs4ds-0.2.4.3.dist-info}/top_level.txt
File without changes