tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +769 -571
- tdfs4ds/feature_store/feature_data_processing.py +370 -300
- tdfs4ds/feature_store/feature_query_retrieval.py +105 -52
- tdfs4ds/feature_store/feature_store_management.py +226 -231
- tdfs4ds/genai/__init__.py +27 -0
- tdfs4ds/genai/documentation.py +1878 -0
- tdfs4ds/process_store/process_followup.py +113 -2
- tdfs4ds/process_store/process_query_administration.py +1 -1
- tdfs4ds/process_store/process_registration_management.py +67 -55
- tdfs4ds/process_store/process_store_catalog_management.py +79 -26
- tdfs4ds/utils/filter_management.py +548 -138
- tdfs4ds/utils/query_management.py +18 -40
- tdfs4ds/utils/time_management.py +565 -98
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.5.1.dist-info}/METADATA +1 -1
- tdfs4ds-0.2.5.1.dist-info/RECORD +32 -0
- tdfs/__init__.py +0 -1
- tdfs/data/curves.csv +0 -5086
- tdfs/datasets.py +0 -27
- tdfs/feature_store.py +0 -723
- tdfs4ds/feature_engineering.py +0 -152
- tdfs4ds/feature_store.py +0 -1529
- tdfs4ds/process_store.py +0 -387
- tdfs4ds/utils.py +0 -579
- tdfs4ds-0.2.4.26.dist-info/RECORD +0 -38
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.5.1.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.26.dist-info → tdfs4ds-0.2.5.1.dist-info}/top_level.txt +0 -0

tdfs4ds/process_store/process_followup.py

@@ -1,6 +1,7 @@
 import tdfs4ds
 from tdfs4ds.utils.query_management import execute_query_wrapper
 import teradataml as tdml
+from tdfs4ds import logger_safe, logger

 @execute_query_wrapper
 def follow_up_table_creation():

@@ -194,5 +195,115 @@ def followup_close(run_id, process_type, process_id, status='COMPLETED', filterm
         raise
     return query

-
-
+from typing import Optional
+
+def follow_up_report(filtermanager: Optional[object] = None, process_id: Optional[str] = None):
+    """
+    Return a follow-up report as a `tdml.DataFrame`, optionally filtered by
+    `process_id` and/or a `filtermanager`'s applied filter.
+
+    Behavior by arguments:
+    - process_id is None and filtermanager is None:
+        Return all rows from SCHEMA.FOLLOW_UP_NAME, sorted by START_DATETIME desc.
+    - process_id is not None and filtermanager is None:
+        Return rows for the given PROCESS_ID.
+    - process_id is not None and filtermanager is not None:
+        Return rows for the given PROCESS_ID whose APPLIED_FILTER matches the
+        JSON_AGG of `filtermanager`'s columns coming from its schema/view.
+    - process_id is None and filtermanager is not None:
+        Return rows whose APPLIED_FILTER matches the JSON_AGG of `filtermanager`
+        (no PROCESS_ID constraint).
+
+    Args:
+        filtermanager: An object exposing `col_names`, `schema_name`, and `view_name`.
+            Its columns are aggregated via `JSON_AGG(col1, col2, ...)` to compare
+            against A.APPLIED_FILTER.
+        process_id: Optional process identifier used to filter by PROCESS_ID.
+
+    Returns:
+        tdml.DataFrame: The resulting dataframe sorted by START_DATETIME (descending).
+
+    Raises:
+        ValueError: If `filtermanager` is provided but is missing required attributes
+            or has an empty `col_names` list.
+        RuntimeError: If the query fails.
+    """
+    logger_safe("debug", "follow_up_report called with process_id=%s, filtermanager=%s",
+                process_id, type(filtermanager).__name__ if filtermanager else None)
+
+    table_fqn = f"{tdfs4ds.SCHEMA}.{tdfs4ds.FOLLOW_UP_NAME}"
+
+    # Case 1: No filters at all -> return full table
+    if process_id is None and filtermanager is None:
+        logger_safe("info", "Returning all follow-up rows (no filters).")
+        try:
+            return tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.FOLLOW_UP_NAME)) \
+                .sort('START_DATETIME', ascending=False)
+        except Exception as e:
+            logger_safe("error", "Failed to fetch all follow-up rows: %s", e)
+            raise RuntimeError("Database query failed while fetching follow-up report.") from e
+
+    # Helper to build the FILTER_MANAGER scalar subquery when filtermanager is provided
+    def _build_filter_manager_subquery(fm: object) -> str:
+        required_attrs = ("col_names", "schema_name", "view_name")
+        if not all(hasattr(fm, a) for a in required_attrs):
+            raise ValueError("filtermanager must have col_names, schema_name, and view_name.")
+        if not getattr(fm, "col_names", None):
+            raise ValueError("filtermanager.col_names must be a non-empty list.")
+
+        json_cols = ",".join(fm.col_names)
+        subq = f"""
+        (
+            SELECT JSON_AGG({json_cols}) AS APPLIED_FILTER
+            FROM {fm.schema_name}.{fm.view_name}
+        ) FILTER_MANAGER
+        """
+        logger_safe("debug", "Constructed FILTER_MANAGER subquery with columns: %s", json_cols)
+        return subq
+
+    # Defensive escaping for process_id if used in a literal (prefer bind params if available)
+    def _escape_literal(val: str) -> str:
+        return val.replace("'", "''")
+
+    try:
+        # Case 2: process_id only
+        if process_id is not None and filtermanager is None:
+            pid = _escape_literal(process_id)
+            query = f"""
+            SELECT *
+            FROM {table_fqn}
+            WHERE PROCESS_ID = '{pid}'
+            """
+            logger_safe("info", "Fetching follow-up rows filtered by PROCESS_ID only.")
+            return tdml.DataFrame.from_query(query).sort('START_DATETIME', ascending=False)
+
+        # Case 3: filtermanager only
+        if process_id is None and filtermanager is not None:
+            subq = _build_filter_manager_subquery(filtermanager)
+            query = f"""
+            SELECT A.*
+            FROM {table_fqn} A,
+            {subq}
+            WHERE CAST(A.APPLIED_FILTER AS VARCHAR(20000)) =
+                  CAST(FILTER_MANAGER.APPLIED_FILTER AS VARCHAR(20000))
+            """
+            logger_safe("info", "Fetching follow-up rows filtered by FILTER_MANAGER only.")
+            return tdml.DataFrame.from_query(query).sort('START_DATETIME', ascending=False)
+
+        # Case 4: both process_id and filtermanager
+        pid = _escape_literal(process_id)  # type: ignore[arg-type]
+        subq = _build_filter_manager_subquery(filtermanager)  # type: ignore[arg-type]
+        query = f"""
+        SELECT A.*
+        FROM {table_fqn} A,
+        {subq}
+        WHERE A.PROCESS_ID = '{pid}'
+          AND CAST(A.APPLIED_FILTER AS VARCHAR(20000)) =
+              CAST(FILTER_MANAGER.APPLIED_FILTER AS VARCHAR(20000))
+        """
+        logger_safe("info", "Fetching follow-up rows filtered by PROCESS_ID and FILTER_MANAGER.")
+        return tdml.DataFrame.from_query(query).sort('START_DATETIME', ascending=False)

+    except Exception as e:
+        logger_safe("error", "Failed to fetch follow-up report: %s", e)
+        raise RuntimeError("Database query failed while fetching follow-up report.") from e
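
Usage note (not part of the diff): a minimal sketch of calling the new follow_up_report, assuming a connected teradataml session and an existing follow-up table; the _Filter class and the process id below are hypothetical stand-ins for a real FilterManager object and a real PROCESS_ID.

    from tdfs4ds.process_store.process_followup import follow_up_report

    # all runs, newest first
    report = follow_up_report()

    # runs of a single process (hypothetical id)
    report = follow_up_report(process_id='0b3f0c1e-0000-0000-0000-000000000000')

    class _Filter:                      # stand-in exposing the attributes the docstring requires
        col_names = ['BUSINESS_DATE']   # assumed filter column
        schema_name = 'MY_SCHEMA'       # assumed schema
        view_name = 'MY_FILTER_VIEW'    # assumed filter view

    # runs whose APPLIED_FILTER matches the filter view's JSON_AGG
    report = follow_up_report(filtermanager=_Filter(), process_id='0b3f0c1e-0000-0000-0000-000000000000')
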

tdfs4ds/process_store/process_query_administration.py

@@ -28,7 +28,7 @@ def list_processes():
         return tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW))
     except Exception as e:
         print(str(e))
-        print(
+        print(tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW)).show_query())

 def list_processes_feature_split():
     """
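
Side note (not part of the diff): the replaced print now emits the SQL behind the catalog view via teradataml's DataFrame.show_query(); a minimal sketch, assuming an active teradataml connection and the tdfs4ds configuration used above:

    import teradataml as tdml
    import tdfs4ds

    df = tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW))
    print(df.show_query())   # prints the SELECT that backs the process catalog view
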

tdfs4ds/process_store/process_registration_management.py

@@ -3,6 +3,7 @@ import tdfs4ds
 from tdfs4ds.utils.query_management import execute_query_wrapper
 import uuid
 import json
+from tdfs4ds import logger,logger_safe

 @execute_query_wrapper
 def register_process_view(view_name, entity_id, feature_names, metadata={}, entity_null_substitute = {}, **kwargs):

@@ -74,80 +75,91 @@ def _register_process_view_merge(view_name, entity_id, feature_names, metadata={
     - Requires 'tdml' module for DataFrame operations and 'uuid' for generating unique identifiers.
     """

-
-
-    if type(view_name) == tdml.dataframe.dataframe.DataFrame:
+    # Handle teradataml DataFrame input
+    if isinstance(view_name, tdml.dataframe.dataframe.DataFrame):
         try:
             view_name = view_name._table_name
-        except:
-
-
+        except Exception:
+            logger_safe(
+                "error",
+                "Invalid DataFrame for view registration. Use: tdml.DataFrame(<table/view>). Crystallize if needed."
+            )
             raise

+    # Prevent using temporary teradataml views
     if view_name.split('.')[1].startswith('ml__'):
-
-
-
-
+        logger_safe(
+            "error",
+            "Invalid view name '%s': starts with 'ml__'. Please crystallize your view first.",
+            view_name
+        )
+        raise ValueError("Invalid process view name: temporary teradataml views are not allowed.")
+
+    # Get optional arguments
     filtermanager = kwargs.get('filtermanager', None)
-
-
-
-    # Get data distribution related inputs:
-    primary_index = kwargs.get('primary_index', [e for e in entity_id.keys()])
+    query_upsert_filtermanager = None
+    primary_index = kwargs.get('primary_index', list(entity_id.keys()))
     partitioning = kwargs.get('partitioning', '').replace("'", '"')

     if primary_index is None:
-        primary_index =
+        primary_index = list(entity_id.keys())

+    feature_names = ','.join(feature_names)

+    # Validtime period
+    end_period_ = '9999-01-01 00:00:00' if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED' else tdfs4ds.END_PERIOD
+    validtime_statement = (
+        'CURRENT VALIDTIME'
+        if tdfs4ds.FEATURE_STORE_TIME is None
+        else f"VALIDTIME PERIOD '({tdfs4ds.FEATURE_STORE_TIME},{end_period_})'"
+    )

-
-    feature_names = ','.join(feature_names)
+    logger_safe("info", "Registering process view: %s", view_name)

-    #
-
-
-
-
+    # Check if view already exists in catalog
+    query_process_id = f"""
+    SEL PROCESS_ID FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+    WHERE view_name = '{view_name}'
+    """
+    process_id_result = tdml.execute_sql(query_process_id).fetchall()

-    if
-
-
-        validtime_statement = f"VALIDTIME PERIOD '({tdfs4ds.FEATURE_STORE_TIME},{end_period_})'"
+    if process_id_result:
+        process_id = process_id_result[0][0]
+        logger_safe("info", "Updating existing process_id=%s", process_id)

+        query_feature_version = f"""
+        SEL PROCESS_VERSION FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+        WHERE view_name = '{view_name}'
+        """
+        feature_version = tdml.execute_sql(query_feature_version).fetchall()[0][0]

-
-
-
-
-
-
-
-
-        if len(query_primary_index_res)>0:
-            FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING = tdml.execute_sql(query_primary_index).fetchall()[0]
+        query_primary_index = f"""
+        SEL FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING
+        FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME}
+        WHERE process_id = '{process_id}'
+        """
+        dist_res = tdml.execute_sql(query_primary_index).fetchall()
+        if dist_res:
+            FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING = dist_res[0]
         else:
-
-
-
-
-
-            ""
+            logger_safe(
+                "error",
+                "Missing data distribution info for existing process %s. Check distribution table.",
+                process_id
+            )
+            raise ValueError("Missing distribution info.")
     else:
-        # Generating a unique process identifier
         process_id = str(uuid.uuid4())
         feature_version = 1
         FOR_PRIMARY_INDEX = ",".join(primary_index)
         FOR_DATA_PARTITIONING = partitioning
+        logger_safe("info", "Generated new process_id=%s", process_id)

-    #
-
-
-
+    # Build entity_id string
+    ENTITY_ID__ = ','.join(sorted(entity_id.keys()))
+    logger_safe("debug", "Entity IDs: %s", ENTITY_ID__)
+    logger_safe("debug", "Feature names: %s", feature_names)

-    print('feature_version :',feature_version)
-    print('int(feature_version) :', int(feature_version))
     if tdfs4ds.FEATURE_STORE_TIME == None:

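
Illustration (not part of the diff): the validtime handling added above is self-contained; the same logic with plain variables standing in for tdfs4ds.FEATURE_STORE_TIME and tdfs4ds.END_PERIOD, using an assumed example timestamp:

    FEATURE_STORE_TIME = '2024-01-01 00:00:00'   # assumed example; None would select CURRENT VALIDTIME
    END_PERIOD = 'UNTIL_CHANGED'

    end_period_ = '9999-01-01 00:00:00' if END_PERIOD == 'UNTIL_CHANGED' else END_PERIOD
    validtime_statement = (
        'CURRENT VALIDTIME'
        if FEATURE_STORE_TIME is None
        else f"VALIDTIME PERIOD '({FEATURE_STORE_TIME},{end_period_})'"
    )
    print(validtime_statement)   # VALIDTIME PERIOD '(2024-01-01 00:00:00,9999-01-01 00:00:00)'
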

@@ -402,16 +414,16 @@ def _register_process_view_merge(view_name, entity_id, feature_names, metadata={
     """


-
-
-
-    print(f"to update your dataset : dataset = run(process_id='{process_id}',return_dataset=True)")
+    logger_safe("info", "Process registered: process_id=%s", process_id)
+    logger_safe("info", "To rerun: run(process_id='%s')", process_id)
+    logger_safe("info", "To build dataset: dataset = run(process_id='%s', return_dataset=True)", process_id)

-    #
+    # Return queries
     if kwargs.get('with_process_id'):
         return query_upsert, process_id, query_upsert_dist, query_upsert_filtermanager
     else:
         return query_upsert, query_upsert_dist, query_upsert_filtermanager
+
 @execute_query_wrapper
 def _register_process_view_update_insert(view_name, entity_id, feature_names, metadata={}, entity_null_substitute={}, **kwargs):
     """
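
Usage note (not part of the diff): the new log lines point the user at run(); a sketch of the advertised calls, assuming run is the package-level tdfs4ds.run they refer to and using a hypothetical process id:

    import tdfs4ds

    pid = '0b3f0c1e-0000-0000-0000-000000000000'                  # hypothetical
    tdfs4ds.run(process_id=pid)                                   # recompute the registered features
    dataset = tdfs4ds.run(process_id=pid, return_dataset=True)    # also build and return the dataset
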

tdfs4ds/process_store/process_store_catalog_management.py

@@ -1,6 +1,8 @@
 import teradataml as tdml
 import tdfs4ds
 from tdfs4ds.utils.query_management import execute_query,execute_query_wrapper
+from tdfs4ds import logger_safe
+

 def upgrade_process_catalog():


@@ -43,13 +45,13 @@ def upgrade_process_catalog():
     # Step 4: Rename the new table to the old table's name
     query_4 = f"""RENAME TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_NEW TO {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME};"""

-
+    logger_safe('info', f'creation of the {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_NEW table')
     tdml.execute_sql(query_1)
-
+    logger_safe('info', f'insert existing processes from {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} to {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_NEW')
     tdml.execute_sql(query_2)
-
+    logger_safe('info', f'rename {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} to {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_OLD')
     tdml.execute_sql(query_3)
-
+    logger_safe('info', f'rename {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_NEW to {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}')
     tdml.execute_sql(query_4)

 @execute_query_wrapper

@@ -208,7 +210,7 @@ def process_store_catalog_creation(if_exists='replace', comment='this table is a
     (
         PROCESS_ID VARCHAR(36) NOT NULL,
         FOR_PRIMARY_INDEX VARCHAR(2048),
-        FOR_DATA_PARTITIONING VARCHAR(
+        FOR_DATA_PARTITIONING VARCHAR(32000),
         ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
         ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
         PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME

@@ -227,7 +229,7 @@ def process_store_catalog_creation(if_exists='replace', comment='this table is a
     (
         PROCESS_ID VARCHAR(36) NOT NULL,
         FOR_PRIMARY_INDEX VARCHAR(2048),
-        FOR_DATA_PARTITIONING VARCHAR(
+        FOR_DATA_PARTITIONING VARCHAR(32000)
     )
     PRIMARY INDEX (PROCESS_ID);
     """

@@ -263,79 +265,130 @@ def process_store_catalog_creation(if_exists='replace', comment='this table is a
         execute_query(query)
         if tdml.display.print_sqlmr_query:
             print(query)
-
+        logger_safe('error', str(e).split('\n')[0] if False else f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been created') if False else logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been created')
         execute_query(query3)
     except Exception as e:
         # If the table already exists and if_exists is set to 'replace', drop the table and recreate it
-
+        logger_safe('error', str(e).split('\n')[0])
         if str(e).split('\n')[0].endswith('already exists.') and (if_exists == 'replace'):
             execute_query(f'DROP TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}')
-
+            logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been dropped')
             try:
                 # Attempt to recreate the table after dropping it
                 execute_query(query)
-
+                logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been re-created')
                 if tdml.display.print_sqlmr_query:
                     print(query)
                 execute_query(query3)
             except Exception as e:
-
+                logger_safe('error', str(e).split('\n')[0])

     try:
         # Attempt to execute the create table query
         execute_query(query4)
         if tdml.display.print_sqlmr_query:
             print(query4)
-
+        logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} has been created')
         execute_query(query5)
     except Exception as e:
         # If the table already exists and if_exists is set to 'replace', drop the table and recreate it
-
+        logger_safe('error', str(e).split('\n')[0])
         if str(e).split('\n')[0].endswith('already exists.') and (if_exists == 'replace'):
             execute_query(f'DROP TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME}')
-
+            logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} has been dropped')
             try:
                 # Attempt to recreate the table after dropping it
                 execute_query(query4)
-
-                    f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} has been re-created')
+                logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} has been re-created')
                 if tdml.display.print_sqlmr_query:
                     print(query4)
                 execute_query(query5)
             except Exception as e:
-
+                logger_safe('error', str(e).split('\n')[0])

     try:
         # Attempt to execute the create table query
         execute_query(query6)
         if tdml.display.print_sqlmr_query:
             print(query6)
-
+        logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.FILTER_MANAGER_NAME} has been created')
         execute_query(query7)
     except Exception as e:
         # If the table already exists and if_exists is set to 'replace', drop the table and recreate it
-
+        logger_safe('error', str(e).split('\n')[0])
         if str(e).split('\n')[0].endswith('already exists.') and (if_exists == 'replace'):
             execute_query(f'DROP TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.FILTER_MANAGER_NAME}')
-
+            logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.FILTER_MANAGER_NAME} has been dropped')
             try:
                 # Attempt to recreate the table after dropping it
                 execute_query(query6)
-
-                    f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.FILTER_MANAGER_NAME} has been re-created')
+                logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.FILTER_MANAGER_NAME} has been re-created')
                 if tdml.display.print_sqlmr_query:
                     print(query6)
                 execute_query(query7)
             except Exception as e:
-
+                logger_safe('error', str(e).split('\n')[0])

     try:
         # Attempt to create the secondary index
         execute_query(query2)
         if tdml.display.print_sqlmr_query:
             print(query)
-
+        logger_safe('info', f'SECONDARY INDEX ON TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been created')
+    except Exception as e:
+        logger_safe('error', str(e).split('\n')[0])
+
+    return tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME, tdfs4ds.FILTER_MANAGER_NAME
+
+def get_process_info(process_id: str) -> dict:
+    """
+    Retrieve process information including SQL and columns from the process catalog.
+
+    Args:
+        process_id (str): The unique identifier of the process.
+    Returns:
+        dict: A dictionary containing process SQL and columns.
+    """
+    # Retrieve process SQL and columns
+    process_catalog = tdfs4ds.process_catalog()
+    try:
+        process_info = process_catalog[process_catalog['PROCESS_ID'] == process_id].to_pandas().to_dict(orient='records')[0]
+        process_info['ENTITY_COLUMNS'] = process_info['ENTITY_ID'].split(',')
+        process_info['FEATURE_COLUMNS'] = process_info['FEATURE_NAMES'].split(',')
+    except Exception as e:
+        logger_safe('error', f"Error retrieving process info: {e}")
+        return None
+
+    # get the SQL query:
+    if process_info:
+        process_sql = tdfs4ds.utils.lineage.get_ddl(
+            view_name = process_info['VIEW_NAME'].split('.')[1],
+            schema_name = process_info['VIEW_NAME'].split('.')[0],
+            object_type='view'
+        )
+        process_info['PROCESS_SQL'] = process_sql
+
+        # retrieve feature documentation
+        from tdfs4ds.genai.documentation import retrieve_documentation, retrieve_explain_documentation
+
+        try:
+            documentation = retrieve_documentation(process_id)
+            process_info['DOCUMENTED_SQL'] = documentation['DOCUMENTED_SQL']
+            process_info['DOCUMENTED_ENTITY_COLUMNS'] = documentation['DOCUMENTED_ENTITY_COLUMNS']
+            process_info['DOCUMENTED_FEATURE_COLUMNS'] = documentation['DOCUMENTED_FEATURE_COLUMNS']
+            process_info['ENTITY_DESCRIPTION'] = documentation['ENTITY_DESCRIPTION']
+            logger_safe('info', f"Successfully retrieved documentation for process_id {process_id}")
+        except Exception as e:
+            logger_safe('error', f"Error retrieving documentation: {e}")
+
+        try:
+            explain_documentation = retrieve_explain_documentation(process_id)
+            process_info['EXPLAIN_ANALYSIS'] = explain_documentation['EXPLAIN_ANALYSIS']
+            process_info['OPTIMIZATION_SCORE'] = explain_documentation['OPTIMIZATION_SCORE']
+            process_info['EXPLAIN_WARNINGS'] = explain_documentation['EXPLAIN_WARNINGS']
+            process_info['EXPLAIN_RECOMMENDATIONS'] = explain_documentation['EXPLAIN_RECOMMENDATIONS']
+            logger_safe('info', f"Successfully retrieved explanation for process_id {process_id}")
         except Exception as e:
-
+            logger_safe('error', f"Error retrieving explanation: {e}")

-    return
+    return process_info