tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
  import tdfs4ds
  from tdfs4ds.utils.query_management import execute_query_wrapper
  import teradataml as tdml
+ from tdfs4ds import logger_safe, logger

  @execute_query_wrapper
  def follow_up_table_creation():
@@ -194,5 +195,115 @@ def followup_close(run_id, process_type, process_id, status='COMPLETED', filterm
          raise
      return query

- def follow_up_report():
-     return tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.FOLLOW_UP_NAME.replace('FS_', 'FS_V_'))).sort('START_DATETIME',ascending=False)
+ from typing import Optional
+
+ def follow_up_report(filtermanager: Optional[object] = None, process_id: Optional[str] = None):
+     """
+     Return a follow-up report as a `tdml.DataFrame`, optionally filtered by
+     `process_id` and/or a `filtermanager`'s applied filter.
+
+     Behavior by arguments:
+     - process_id is None and filtermanager is None:
+         Return all rows from SCHEMA.FOLLOW_UP_NAME, sorted by START_DATETIME desc.
+     - process_id is not None and filtermanager is None:
+         Return rows for the given PROCESS_ID.
+     - process_id is not None and filtermanager is not None:
+         Return rows for the given PROCESS_ID whose APPLIED_FILTER matches the
+         JSON_AGG of `filtermanager`'s columns coming from its schema/view.
+     - process_id is None and filtermanager is not None:
+         Return rows whose APPLIED_FILTER matches the JSON_AGG of `filtermanager`
+         (no PROCESS_ID constraint).
+
+     Args:
+         filtermanager: An object exposing `col_names`, `schema_name`, and `view_name`.
+             Its columns are aggregated via `JSON_AGG(col1, col2, ...)` to compare
+             against A.APPLIED_FILTER.
+         process_id: Optional process identifier used to filter by PROCESS_ID.
+
+     Returns:
+         tdml.DataFrame: The resulting dataframe sorted by START_DATETIME (descending).
+
+     Raises:
+         ValueError: If `filtermanager` is provided but is missing required attributes
+             or has an empty `col_names` list.
+         RuntimeError: If the query fails.
+     """
+     logger_safe("debug", "follow_up_report called with process_id=%s, filtermanager=%s",
+                 process_id, type(filtermanager).__name__ if filtermanager else None)
+
+     table_fqn = f"{tdfs4ds.SCHEMA}.{tdfs4ds.FOLLOW_UP_NAME}"
+
+     # Case 1: No filters at all -> return full table
+     if process_id is None and filtermanager is None:
+         logger_safe("info", "Returning all follow-up rows (no filters).")
+         try:
+             return tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.FOLLOW_UP_NAME)) \
+                 .sort('START_DATETIME', ascending=False)
+         except Exception as e:
+             logger_safe("error", "Failed to fetch all follow-up rows: %s", e)
+             raise RuntimeError("Database query failed while fetching follow-up report.") from e
+
+     # Helper to build the FILTER_MANAGER scalar subquery when filtermanager is provided
+     def _build_filter_manager_subquery(fm: object) -> str:
+         required_attrs = ("col_names", "schema_name", "view_name")
+         if not all(hasattr(fm, a) for a in required_attrs):
+             raise ValueError("filtermanager must have col_names, schema_name, and view_name.")
+         if not getattr(fm, "col_names", None):
+             raise ValueError("filtermanager.col_names must be a non-empty list.")
+
+         json_cols = ",".join(fm.col_names)
+         subq = f"""
+         (
+             SELECT JSON_AGG({json_cols}) AS APPLIED_FILTER
+             FROM {fm.schema_name}.{fm.view_name}
+         ) FILTER_MANAGER
+         """
+         logger_safe("debug", "Constructed FILTER_MANAGER subquery with columns: %s", json_cols)
+         return subq
+
+     # Defensive escaping for process_id if used in a literal (prefer bind params if available)
+     def _escape_literal(val: str) -> str:
+         return val.replace("'", "''")
+
+     try:
+         # Case 2: process_id only
+         if process_id is not None and filtermanager is None:
+             pid = _escape_literal(process_id)
+             query = f"""
+             SELECT *
+             FROM {table_fqn}
+             WHERE PROCESS_ID = '{pid}'
+             """
+             logger_safe("info", "Fetching follow-up rows filtered by PROCESS_ID only.")
+             return tdml.DataFrame.from_query(query).sort('START_DATETIME', ascending=False)
+
+         # Case 3: filtermanager only
+         if process_id is None and filtermanager is not None:
+             subq = _build_filter_manager_subquery(filtermanager)
+             query = f"""
+             SELECT A.*
+             FROM {table_fqn} A,
+             {subq}
+             WHERE CAST(A.APPLIED_FILTER AS VARCHAR(20000)) =
+                   CAST(FILTER_MANAGER.APPLIED_FILTER AS VARCHAR(20000))
+             """
+             logger_safe("info", "Fetching follow-up rows filtered by FILTER_MANAGER only.")
+             return tdml.DataFrame.from_query(query).sort('START_DATETIME', ascending=False)

+         # Case 4: both process_id and filtermanager
+         pid = _escape_literal(process_id)  # type: ignore[arg-type]
+         subq = _build_filter_manager_subquery(filtermanager)  # type: ignore[arg-type]
+         query = f"""
+         SELECT A.*
+         FROM {table_fqn} A,
+         {subq}
+         WHERE A.PROCESS_ID = '{pid}'
+           AND CAST(A.APPLIED_FILTER AS VARCHAR(20000)) =
+               CAST(FILTER_MANAGER.APPLIED_FILTER AS VARCHAR(20000))
+         """
+         logger_safe("info", "Fetching follow-up rows filtered by PROCESS_ID and FILTER_MANAGER.")
+         return tdml.DataFrame.from_query(query).sort('START_DATETIME', ascending=False)
+
+     except Exception as e:
+         logger_safe("error", "Failed to fetch follow-up report: %s", e)
+         raise RuntimeError("Database query failed while fetching follow-up report.") from e
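
For orientation, a minimal usage sketch of the new `follow_up_report` API, assuming an active teradataml connection and a configured tdfs4ds feature store. The filter manager `fm` and the process id are placeholders, not values from this diff:

```python
from types import SimpleNamespace

# Hypothetical filter manager: any object exposing these three attributes works.
fm = SimpleNamespace(col_names=['CUSTOMER_ID'], schema_name='MY_DB', view_name='MY_FILTER_VIEW')

report_all      = follow_up_report()                              # every run, newest first
report_one      = follow_up_report(process_id='my-process-uuid')  # placeholder id
report_filtered = follow_up_report(filtermanager=fm)              # rows matching fm's applied filter
report_both     = follow_up_report(filtermanager=fm, process_id='my-process-uuid')
```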
@@ -28,7 +28,7 @@ def list_processes():
          return tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW))
      except Exception as e:
          print(str(e))
-         print(query)
+         print(tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW)).show_query())

  def list_processes_feature_split():
      """
@@ -3,6 +3,7 @@ import tdfs4ds
  from tdfs4ds.utils.query_management import execute_query_wrapper
  import uuid
  import json
+ from tdfs4ds import logger, logger_safe

  @execute_query_wrapper
  def register_process_view(view_name, entity_id, feature_names, metadata={}, entity_null_substitute = {}, **kwargs):
@@ -74,80 +75,91 @@ def _register_process_view_merge(view_name, entity_id, feature_names, metadata={
      - Requires 'tdml' module for DataFrame operations and 'uuid' for generating unique identifiers.
      """

-
-     # Handling the case where the view name is provided as a DataFrame
-     if type(view_name) == tdml.dataframe.dataframe.DataFrame:
+     # Handle teradataml DataFrame input
+     if isinstance(view_name, tdml.dataframe.dataframe.DataFrame):
          try:
              view_name = view_name._table_name
-         except:
-             print(
-                 'create your teradata dataframe using tdml.DataFrame(<view name>). Crystallize your view if needed')
+         except Exception:
+             logger_safe(
+                 "error",
+                 "Invalid DataFrame for view registration. Use: tdml.DataFrame(<table/view>). Crystallize if needed."
+             )
              raise

+     # Prevent using temporary teradataml views
      if view_name.split('.')[1].startswith('ml__'):
-         tdfs4ds.logger.error('Your dataframe is a temporary teradataml dataframe. Please crystallize your view first.')
-         raise ValueError("Invalid process view name: it starts with 'ml__'. Please consider view crystallization")
-
-     # Get filter manager:
+         logger_safe(
+             "error",
+             "Invalid view name '%s': starts with 'ml__'. Please crystallize your view first.",
+             view_name
+         )
+         raise ValueError("Invalid process view name: temporary teradataml views are not allowed.")
+
+     # Get optional arguments
      filtermanager = kwargs.get('filtermanager', None)
-     if filtermanager is None:
-         query_upsert_filtermanager = None
-
-     # Get data distribution related inputs:
-     primary_index = kwargs.get('primary_index', [e for e in entity_id.keys()])
+     query_upsert_filtermanager = None
+     primary_index = kwargs.get('primary_index', list(entity_id.keys()))
      partitioning = kwargs.get('partitioning', '').replace("'", '"')

      if primary_index is None:
-         primary_index = [e for e in entity_id.keys()]
+         primary_index = list(entity_id.keys())

+     feature_names = ','.join(feature_names)

+     # VALIDTIME period
+     end_period_ = '9999-01-01 00:00:00' if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED' else tdfs4ds.END_PERIOD
+     validtime_statement = (
+         'CURRENT VALIDTIME'
+         if tdfs4ds.FEATURE_STORE_TIME is None
+         else f"VALIDTIME PERIOD '({tdfs4ds.FEATURE_STORE_TIME},{end_period_})'"
+     )

-     # Joining the feature names into a comma-separated string
-     feature_names = ','.join(feature_names)
+     logger_safe("info", "Registering process view: %s", view_name)

-     # Setting the end period for the view
-     if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED':
-         end_period_ = '9999-01-01 00:00:00'
-     else:
-         end_period_ = tdfs4ds.END_PERIOD
+     # Check if view already exists in catalog
+     query_process_id = f"""
+     SEL PROCESS_ID FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+     WHERE view_name = '{view_name}'
+     """
+     process_id_result = tdml.execute_sql(query_process_id).fetchall()

-     if tdfs4ds.FEATURE_STORE_TIME == None:
-         validtime_statement = 'CURRENT VALIDTIME'
-     else:
-         validtime_statement = f"VALIDTIME PERIOD '({tdfs4ds.FEATURE_STORE_TIME},{end_period_})'"
+     if process_id_result:
+         process_id = process_id_result[0][0]
+         logger_safe("info", "Updating existing process_id=%s", process_id)

+         query_feature_version = f"""
+         SEL PROCESS_VERSION FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+         WHERE view_name = '{view_name}'
+         """
+         feature_version = tdml.execute_sql(query_feature_version).fetchall()[0][0]

-     query_process_id = f"SEL PROCESS_ID FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW} WHERE view_name = '{view_name}'"
-     process_id = tdml.execute_sql(query_process_id).fetchall()
-     if len(process_id)>0:
-         process_id = process_id[0][0]
-         query_feature_version = f"SEL PROCESS_VERSION FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW} WHERE view_name = '{view_name}'"
-         feature_version = tdml.execute_sql(query_feature_version).fetchall()[0][0]
-         query_primary_index = f"SEL FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} WHERE process_id = '{process_id}'"
-         query_primary_index_res = tdml.execute_sql(query_primary_index).fetchall()
-         if len(query_primary_index_res)>0:
-             FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING = tdml.execute_sql(query_primary_index).fetchall()[0]
+         query_primary_index = f"""
+         SEL FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING
+         FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME}
+         WHERE process_id = '{process_id}'
+         """
+         dist_res = tdml.execute_sql(query_primary_index).fetchall()
+         if dist_res:
+             FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING = dist_res[0]
          else:
-             raise ValueError(f"""
-             There is not information on primary index and partitioning for process: {process_id}.
-             The working date is: {validtime_statement}
-             The content of the distribution table is:
-             {print(tdml.DataFrame.from_query(f"SEL * FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} WHERE process_id = '{process_id}'"))}
-             """)
+             logger_safe(
+                 "error",
+                 "Missing data distribution info for existing process %s. Check distribution table.",
+                 process_id
+             )
+             raise ValueError("Missing distribution info.")
      else:
-         # Generating a unique process identifier
          process_id = str(uuid.uuid4())
          feature_version = 1
          FOR_PRIMARY_INDEX = ",".join(primary_index)
          FOR_DATA_PARTITIONING = partitioning
+         logger_safe("info", "Generated new process_id=%s", process_id)

-     # Create a comma-separated string of entity IDs
-     entity_id_list = list(entity_id.keys())
-     entity_id_list.sort()
-     ENTITY_ID__ = ','.join([k for k in entity_id_list])
+     # Build entity_id string
+     ENTITY_ID__ = ','.join(sorted(entity_id.keys()))
+     logger_safe("debug", "Entity IDs: %s", ENTITY_ID__)
+     logger_safe("debug", "Feature names: %s", feature_names)

-     print('feature_version :',feature_version)
-     print('int(feature_version) :', int(feature_version))
      if tdfs4ds.FEATURE_STORE_TIME == None:


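As a reading aid, the condensed VALIDTIME logic added above resolves as in this standalone sketch (the tdfs4ds module globals are mocked as plain variables here):

```python
# Standalone sketch of the VALIDTIME logic added in this hunk.
FEATURE_STORE_TIME = '2024-01-01 00:00:00'  # None would select CURRENT VALIDTIME
END_PERIOD = 'UNTIL_CHANGED'                # or an explicit end timestamp

end_period_ = '9999-01-01 00:00:00' if END_PERIOD == 'UNTIL_CHANGED' else END_PERIOD
validtime_statement = (
    'CURRENT VALIDTIME'
    if FEATURE_STORE_TIME is None
    else f"VALIDTIME PERIOD '({FEATURE_STORE_TIME},{end_period_})'"
)
print(validtime_statement)
# VALIDTIME PERIOD '(2024-01-01 00:00:00,9999-01-01 00:00:00)'
```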
@@ -402,16 +414,16 @@ def _register_process_view_merge(view_name, entity_id, feature_names, metadata={
      """


-     # Logging the process registration
-     print(f'register process with id : {process_id}')
-     print(f"to run the process again just type : run(process_id='{process_id}')")
-     print(f"to update your dataset : dataset = run(process_id='{process_id}',return_dataset=True)")
+     logger_safe("info", "Process registered: process_id=%s", process_id)
+     logger_safe("info", "To rerun: run(process_id='%s')", process_id)
+     logger_safe("info", "To build dataset: dataset = run(process_id='%s', return_dataset=True)", process_id)

-     #print('query_insert_dist', query_upsert_dist)
+     # Return queries
      if kwargs.get('with_process_id'):
          return query_upsert, process_id, query_upsert_dist, query_upsert_filtermanager
      else:
          return query_upsert, query_upsert_dist, query_upsert_filtermanager
+
  @execute_query_wrapper
  def _register_process_view_update_insert(view_name, entity_id, feature_names, metadata={}, entity_null_substitute={}, **kwargs):
      """
@@ -1,6 +1,8 @@
  import teradataml as tdml
  import tdfs4ds
  from tdfs4ds.utils.query_management import execute_query,execute_query_wrapper
+ from tdfs4ds import logger_safe
+

  def upgrade_process_catalog():

@@ -43,13 +45,13 @@ def upgrade_process_catalog():
      # Step 4: Rename the new table to the old table's name
      query_4 = f"""RENAME TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_NEW TO {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME};"""

-     print('creation of the ', f"{tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_NEW","table" )
+     logger_safe('info', f'creation of the {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_NEW table')
      tdml.execute_sql(query_1)
-     print('insert existing processes from',f"{tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}", 'to',f"{tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_NEW")
+     logger_safe('info', f'insert existing processes from {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} to {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_NEW')
      tdml.execute_sql(query_2)
-     print('rename ',f"{tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}",'to',f"{tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_OLD")
+     logger_safe('info', f'rename {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} to {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_OLD')
      tdml.execute_sql(query_3)
-     print('rename ,',f"{tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_NEW",'to',f"{tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}")
+     logger_safe('info', f'rename {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}_NEW to {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}')
      tdml.execute_sql(query_4)

  @execute_query_wrapper
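
For context, the upgrade path logged above is a standard create/copy/rename table swap. A schematic sketch with placeholder names (the query_1 and query_2 bodies are defined above this hunk and are not shown in this diff):

```python
# Schematic only: the real query_1/query_2 definitions live earlier in this file.
schema, table = 'MY_DB', 'FS_PROCESS_CATALOG'

query_3 = f"RENAME TABLE {schema}.{table} TO {schema}.{table}_OLD"  # keep the old catalog as a backup
query_4 = f"RENAME TABLE {schema}.{table}_NEW TO {schema}.{table}"  # swap the rebuilt catalog in
# Sequence: create {table}_NEW (query_1), copy rows into it (query_2), then the two renames.
```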
@@ -208,7 +210,7 @@ def process_store_catalog_creation(if_exists='replace', comment='this table is a
      (
          PROCESS_ID VARCHAR(36) NOT NULL,
          FOR_PRIMARY_INDEX VARCHAR(2048),
-         FOR_DATA_PARTITIONING VARCHAR(2048),
+         FOR_DATA_PARTITIONING VARCHAR(32000),
          ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
          ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
          PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
@@ -227,7 +229,7 @@ def process_store_catalog_creation(if_exists='replace', comment='this table is a
      (
          PROCESS_ID VARCHAR(36) NOT NULL,
          FOR_PRIMARY_INDEX VARCHAR(2048),
-         FOR_DATA_PARTITIONING VARCHAR(2048)
+         FOR_DATA_PARTITIONING VARCHAR(32000)
      )
      PRIMARY INDEX (PROCESS_ID);
      """
@@ -263,79 +265,130 @@ def process_store_catalog_creation(if_exists='replace', comment='this table is a
          execute_query(query)
          if tdml.display.print_sqlmr_query:
              print(query)
-         if tdfs4ds.DISPLAY_LOGS: print(f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been created')
+         logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been created')
          execute_query(query3)
      except Exception as e:
          # If the table already exists and if_exists is set to 'replace', drop the table and recreate it
-         if tdfs4ds.DISPLAY_LOGS: print(str(e).split('\n')[0])
+         logger_safe('error', str(e).split('\n')[0])
          if str(e).split('\n')[0].endswith('already exists.') and (if_exists == 'replace'):
              execute_query(f'DROP TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME}')
-             print(f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been dropped')
+             logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been dropped')
              try:
                  # Attempt to recreate the table after dropping it
                  execute_query(query)
-                 if tdfs4ds.DISPLAY_LOGS: print(f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been re-created')
+                 logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been re-created')
                  if tdml.display.print_sqlmr_query:
                      print(query)
                  execute_query(query3)
              except Exception as e:
-                 print(str(e).split('\n')[0])
+                 logger_safe('error', str(e).split('\n')[0])

      try:
          # Attempt to execute the create table query
          execute_query(query4)
          if tdml.display.print_sqlmr_query:
              print(query4)
-         if tdfs4ds.DISPLAY_LOGS: print(f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} has been created')
+         logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} has been created')
          execute_query(query5)
      except Exception as e:
          # If the table already exists and if_exists is set to 'replace', drop the table and recreate it
-         if tdfs4ds.DISPLAY_LOGS: print(str(e).split('\n')[0])
+         logger_safe('error', str(e).split('\n')[0])
          if str(e).split('\n')[0].endswith('already exists.') and (if_exists == 'replace'):
              execute_query(f'DROP TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME}')
-             print(f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} has been dropped')
+             logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} has been dropped')
              try:
                  # Attempt to recreate the table after dropping it
                  execute_query(query4)
-                 if tdfs4ds.DISPLAY_LOGS: print(
-                     f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} has been re-created')
+                 logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} has been re-created')
                  if tdml.display.print_sqlmr_query:
                      print(query4)
                  execute_query(query5)
              except Exception as e:
-                 print(str(e).split('\n')[0])
+                 logger_safe('error', str(e).split('\n')[0])

      try:
          # Attempt to execute the create table query
          execute_query(query6)
          if tdml.display.print_sqlmr_query:
              print(query6)
-         if tdfs4ds.DISPLAY_LOGS: print(f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.FILTER_MANAGER_NAME} has been created')
+         logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.FILTER_MANAGER_NAME} has been created')
          execute_query(query7)
      except Exception as e:
          # If the table already exists and if_exists is set to 'replace', drop the table and recreate it
-         if tdfs4ds.DISPLAY_LOGS: print(str(e).split('\n')[0])
+         logger_safe('error', str(e).split('\n')[0])
          if str(e).split('\n')[0].endswith('already exists.') and (if_exists == 'replace'):
              execute_query(f'DROP TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.FILTER_MANAGER_NAME}')
-             print(f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.FILTER_MANAGER_NAME} has been dropped')
+             logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.FILTER_MANAGER_NAME} has been dropped')
              try:
                  # Attempt to recreate the table after dropping it
                  execute_query(query6)
-                 if tdfs4ds.DISPLAY_LOGS: print(
-                     f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.FILTER_MANAGER_NAME} has been re-created')
+                 logger_safe('info', f'TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.FILTER_MANAGER_NAME} has been re-created')
                  if tdml.display.print_sqlmr_query:
                      print(query6)
                  execute_query(query7)
              except Exception as e:
-                 print(str(e).split('\n')[0])
+                 logger_safe('error', str(e).split('\n')[0])

      try:
          # Attempt to create the secondary index
          execute_query(query2)
          if tdml.display.print_sqlmr_query:
              print(query)
-         if tdfs4ds.DISPLAY_LOGS: print(f'SECONDARY INDEX ON TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been created')
+         logger_safe('info', f'SECONDARY INDEX ON TABLE {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME} has been created')
+     except Exception as e:
+         logger_safe('error', str(e).split('\n')[0])
+
+     return tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME, tdfs4ds.FILTER_MANAGER_NAME
+
+ def get_process_info(process_id: str) -> dict:
+     """
+     Retrieve process information including SQL and columns from the process catalog.
+
+     Args:
+         process_id (str): The unique identifier of the process.
+     Returns:
+         dict: A dictionary containing process SQL and columns.
+     """
+     # Retrieve process SQL and columns
+     process_catalog = tdfs4ds.process_catalog()
+     try:
+         process_info = process_catalog[process_catalog['PROCESS_ID'] == process_id].to_pandas().to_dict(orient='records')[0]
+         process_info['ENTITY_COLUMNS'] = process_info['ENTITY_ID'].split(',')
+         process_info['FEATURE_COLUMNS'] = process_info['FEATURE_NAMES'].split(',')
+     except Exception as e:
+         logger_safe('error', f"Error retrieving process info: {e}")
+         return None
+
+     # get the SQL query:
+     if process_info:
+         process_sql = tdfs4ds.utils.lineage.get_ddl(
+             view_name = process_info['VIEW_NAME'].split('.')[1],
+             schema_name = process_info['VIEW_NAME'].split('.')[0],
+             object_type='view'
+         )
+         process_info['PROCESS_SQL'] = process_sql
+
+     # retrieve feature documentation
+     from tdfs4ds.genai.documentation import retrieve_documentation, retrieve_explain_documentation
+
+     try:
+         documentation = retrieve_documentation(process_id)
+         process_info['DOCUMENTED_SQL'] = documentation['DOCUMENTED_SQL']
+         process_info['DOCUMENTED_ENTITY_COLUMNS'] = documentation['DOCUMENTED_ENTITY_COLUMNS']
+         process_info['DOCUMENTED_FEATURE_COLUMNS'] = documentation['DOCUMENTED_FEATURE_COLUMNS']
+         process_info['ENTITY_DESCRIPTION'] = documentation['ENTITY_DESCRIPTION']
+         logger_safe('info', f"Successfully retrieved documentation for process_id {process_id}")
+     except Exception as e:
+         logger_safe('error', f"Error retrieving documentation: {e}")
+
+     try:
+         explain_documentation = retrieve_explain_documentation(process_id)
+         process_info['EXPLAIN_ANALYSIS'] = explain_documentation['EXPLAIN_ANALYSIS']
+         process_info['OPTIMIZATION_SCORE'] = explain_documentation['OPTIMIZATION_SCORE']
+         process_info['EXPLAIN_WARNINGS'] = explain_documentation['EXPLAIN_WARNINGS']
+         process_info['EXPLAIN_RECOMMENDATIONS'] = explain_documentation['EXPLAIN_RECOMMENDATIONS']
+         logger_safe('info', f"Successfully retrieved explanation for process_id {process_id}")
      except Exception as e:
-         print(str(e).split('\n')[0])
+         logger_safe('error', f"Error retrieving explanation: {e}")

-     return tdfs4ds.PROCESS_CATALOG_NAME, tdfs4ds.DATA_DISTRIBUTION_NAME, tdfs4ds.FILTER_MANAGER_NAME
+     return process_info
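
Closing with a small usage sketch of the new `get_process_info` helper. The process id is a placeholder, and the documentation/explain keys only appear when the optional genai lookups succeed:

```python
info = get_process_info('my-process-uuid')  # placeholder process id
if info is not None:
    print(info['VIEW_NAME'])        # registered view behind the process
    print(info['ENTITY_COLUMNS'])   # parsed from the catalog's ENTITY_ID
    print(info['FEATURE_COLUMNS'])  # parsed from FEATURE_NAMES
    print(info.get('PROCESS_SQL'))  # view DDL from the lineage lookup, if present
```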