tdfs4ds 0.2.4.26__py3-none-any.whl → 0.2.4.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
  import tdfs4ds
  from tdfs4ds.utils.query_management import execute_query_wrapper
  import teradataml as tdml
+ from tdfs4ds import logger_safe, logger
 
  @execute_query_wrapper
  def follow_up_table_creation():
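
This hunk introduces the `logger_safe` import used throughout the release, but its implementation is not part of this diff. A minimal sketch consistent with the call sites below (level name first, printf-style arguments), assuming it is a never-raising wrapper around the package logger — the real tdfs4ds helper may differ:

import logging

logger = logging.getLogger("tdfs4ds")

def logger_safe(level, msg, *args):
    # Dispatch to the named logger method; swallow logging failures so a
    # broken logging setup cannot break feature-store operations.
    # (Sketch only -- inferred from call sites, not from tdfs4ds source.)
    try:
        getattr(logger, level, logger.info)(msg, *args)
    except Exception:
        pass
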
@@ -194,5 +195,115 @@ def followup_close(run_id, process_type, process_id, status='COMPLETED', filterm
          raise
      return query
 
- def follow_up_report():
-     return tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.FOLLOW_UP_NAME.replace('FS_', 'FS_V_'))).sort('START_DATETIME',ascending=False)
+ from typing import Optional
+
+ def follow_up_report(filtermanager: Optional[object] = None, process_id: Optional[str] = None):
+     """
+     Return a follow-up report as a `tdml.DataFrame`, optionally filtered by
+     `process_id` and/or a `filtermanager`'s applied filter.
+
+     Behavior by arguments:
+     - process_id is None and filtermanager is None:
+         Return all rows from SCHEMA.FOLLOW_UP_NAME, sorted by START_DATETIME desc.
+     - process_id is not None and filtermanager is None:
+         Return rows for the given PROCESS_ID.
+     - process_id is not None and filtermanager is not None:
+         Return rows for the given PROCESS_ID whose APPLIED_FILTER matches the
+         JSON_AGG of `filtermanager`'s columns coming from its schema/view.
+     - process_id is None and filtermanager is not None:
+         Return rows whose APPLIED_FILTER matches the JSON_AGG of `filtermanager`
+         (no PROCESS_ID constraint).
+
+     Args:
+         filtermanager: An object exposing `col_names`, `schema_name`, and `view_name`.
+             Its columns are aggregated via `JSON_AGG(col1, col2, ...)` to compare
+             against A.APPLIED_FILTER.
+         process_id: Optional process identifier used to filter by PROCESS_ID.
+
+     Returns:
+         tdml.DataFrame: The resulting dataframe sorted by START_DATETIME (descending).
+
+     Raises:
+         ValueError: If `filtermanager` is provided but is missing required attributes
+             or has an empty `col_names` list.
+         RuntimeError: If the query fails.
+     """
+     logger_safe("debug", "follow_up_report called with process_id=%s, filtermanager=%s",
+                 process_id, type(filtermanager).__name__ if filtermanager else None)
+
+     table_fqn = f"{tdfs4ds.SCHEMA}.{tdfs4ds.FOLLOW_UP_NAME}"
+
+     # Case 1: No filters at all -> return full table
+     if process_id is None and filtermanager is None:
+         logger_safe("info", "Returning all follow-up rows (no filters).")
+         try:
+             return tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.FOLLOW_UP_NAME)) \
+                 .sort('START_DATETIME', ascending=False)
+         except Exception as e:
+             logger_safe("error", "Failed to fetch all follow-up rows: %s", e)
+             raise RuntimeError("Database query failed while fetching follow-up report.") from e
+
+     # Helper to build the FILTER_MANAGER scalar subquery when filtermanager is provided
+     def _build_filter_manager_subquery(fm: object) -> str:
+         required_attrs = ("col_names", "schema_name", "view_name")
+         if not all(hasattr(fm, a) for a in required_attrs):
+             raise ValueError("filtermanager must have col_names, schema_name, and view_name.")
+         if not getattr(fm, "col_names", None):
+             raise ValueError("filtermanager.col_names must be a non-empty list.")
+
+         json_cols = ",".join(fm.col_names)
+         subq = f"""
+         (
+             SELECT JSON_AGG({json_cols}) AS APPLIED_FILTER
+             FROM {fm.schema_name}.{fm.view_name}
+         ) FILTER_MANAGER
+         """
+         logger_safe("debug", "Constructed FILTER_MANAGER subquery with columns: %s", json_cols)
+         return subq
+
+     # Defensive escaping for process_id if used in a literal (prefer bind params if available)
+     def _escape_literal(val: str) -> str:
+         return val.replace("'", "''")
+
+     try:
+         # Case 2: process_id only
+         if process_id is not None and filtermanager is None:
+             pid = _escape_literal(process_id)
+             query = f"""
+             SELECT *
+             FROM {table_fqn}
+             WHERE PROCESS_ID = '{pid}'
+             """
+             logger_safe("info", "Fetching follow-up rows filtered by PROCESS_ID only.")
+             return tdml.DataFrame.from_query(query).sort('START_DATETIME', ascending=False)
+
+         # Case 3: filtermanager only
+         if process_id is None and filtermanager is not None:
+             subq = _build_filter_manager_subquery(filtermanager)
+             query = f"""
+             SELECT A.*
+             FROM {table_fqn} A,
+             {subq}
+             WHERE CAST(A.APPLIED_FILTER AS VARCHAR(20000)) =
+                   CAST(FILTER_MANAGER.APPLIED_FILTER AS VARCHAR(20000))
+             """
+             logger_safe("info", "Fetching follow-up rows filtered by FILTER_MANAGER only.")
+             return tdml.DataFrame.from_query(query).sort('START_DATETIME', ascending=False)
+
+         # Case 4: both process_id and filtermanager
+         pid = _escape_literal(process_id)  # type: ignore[arg-type]
+         subq = _build_filter_manager_subquery(filtermanager)  # type: ignore[arg-type]
+         query = f"""
+         SELECT A.*
+         FROM {table_fqn} A,
+         {subq}
+         WHERE A.PROCESS_ID = '{pid}'
+           AND CAST(A.APPLIED_FILTER AS VARCHAR(20000)) =
+               CAST(FILTER_MANAGER.APPLIED_FILTER AS VARCHAR(20000))
+         """
+         logger_safe("info", "Fetching follow-up rows filtered by PROCESS_ID and FILTER_MANAGER.")
+         return tdml.DataFrame.from_query(query).sort('START_DATETIME', ascending=False)
+
+     except Exception as e:
+         logger_safe("error", "Failed to fetch follow-up report: %s", e)
+         raise RuntimeError("Database query failed while fetching follow-up report.") from e
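
The four dispatch branches above map one-to-one to four call shapes. A usage sketch — `fm` stands in for any filter-manager object carrying `col_names`, `schema_name`, and `view_name`, and the process id is illustrative:

df_all  = follow_up_report()                                  # full history
df_pid  = follow_up_report(process_id="my-process-uuid")      # one process
df_fm   = follow_up_report(filtermanager=fm)                  # by applied filter
df_both = follow_up_report(process_id="my-process-uuid",
                           filtermanager=fm)                  # both constraints
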
@@ -28,7 +28,7 @@ def list_processes():
          return tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW))
      except Exception as e:
          print(str(e))
-         print(query)
+         print(tdml.DataFrame(tdml.in_schema(tdfs4ds.SCHEMA, tdfs4ds.PROCESS_CATALOG_NAME_VIEW)).show_query())
 
  def list_processes_feature_split():
      """
@@ -3,6 +3,7 @@ import tdfs4ds
  from tdfs4ds.utils.query_management import execute_query_wrapper
  import uuid
  import json
+ from tdfs4ds import logger,logger_safe
 
  @execute_query_wrapper
  def register_process_view(view_name, entity_id, feature_names, metadata={}, entity_null_substitute = {}, **kwargs):
@@ -74,80 +75,91 @@ def _register_process_view_merge(view_name, entity_id, feature_names, metadata={
      - Requires 'tdml' module for DataFrame operations and 'uuid' for generating unique identifiers.
      """
 
-
-     # Handling the case where the view name is provided as a DataFrame
-     if type(view_name) == tdml.dataframe.dataframe.DataFrame:
+     # Handle teradataml DataFrame input
+     if isinstance(view_name, tdml.dataframe.dataframe.DataFrame):
          try:
              view_name = view_name._table_name
-         except:
-             print(
-                 'create your teradata dataframe using tdml.DataFrame(<view name>). Crystallize your view if needed')
+         except Exception:
+             logger_safe(
+                 "error",
+                 "Invalid DataFrame for view registration. Use: tdml.DataFrame(<table/view>). Crystallize if needed."
+             )
              raise
 
+     # Prevent using temporary teradataml views
      if view_name.split('.')[1].startswith('ml__'):
-         tdfs4ds.logger.error('Your dataframe is a temporary teradataml dataframe. Please crystallize your view first.')
-         raise ValueError("Invalid process view name: it starts with 'ml__'. Please consider view crystallization")
-
-     # Get filter manager:
+         logger_safe(
+             "error",
+             "Invalid view name '%s': starts with 'ml__'. Please crystallize your view first.",
+             view_name
+         )
+         raise ValueError("Invalid process view name: temporary teradataml views are not allowed.")
+
+     # Get optional arguments
      filtermanager = kwargs.get('filtermanager', None)
-     if filtermanager is None:
-         query_upsert_filtermanager = None
-
-     # Get data distribution related inputs:
-     primary_index = kwargs.get('primary_index', [e for e in entity_id.keys()])
+     query_upsert_filtermanager = None
+     primary_index = kwargs.get('primary_index', list(entity_id.keys()))
      partitioning = kwargs.get('partitioning', '').replace("'", '"')
 
      if primary_index is None:
-         primary_index = [e for e in entity_id.keys()]
+         primary_index = list(entity_id.keys())
 
+     feature_names = ','.join(feature_names)
 
+     # Validtime period
+     end_period_ = '9999-01-01 00:00:00' if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED' else tdfs4ds.END_PERIOD
+     validtime_statement = (
+         'CURRENT VALIDTIME'
+         if tdfs4ds.FEATURE_STORE_TIME is None
+         else f"VALIDTIME PERIOD '({tdfs4ds.FEATURE_STORE_TIME},{end_period_})'"
+     )
 
-     # Joining the feature names into a comma-separated string
-     feature_names = ','.join(feature_names)
+     logger_safe("info", "Registering process view: %s", view_name)
 
-     # Setting the end period for the view
-     if tdfs4ds.END_PERIOD == 'UNTIL_CHANGED':
-         end_period_ = '9999-01-01 00:00:00'
-     else:
-         end_period_ = tdfs4ds.END_PERIOD
+     # Check if view already exists in catalog
+     query_process_id = f"""
+     SEL PROCESS_ID FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+     WHERE view_name = '{view_name}'
+     """
+     process_id_result = tdml.execute_sql(query_process_id).fetchall()
 
-     if tdfs4ds.FEATURE_STORE_TIME == None:
-         validtime_statement = 'CURRENT VALIDTIME'
-     else:
-         validtime_statement = f"VALIDTIME PERIOD '({tdfs4ds.FEATURE_STORE_TIME},{end_period_})'"
+     if process_id_result:
+         process_id = process_id_result[0][0]
+         logger_safe("info", "Updating existing process_id=%s", process_id)
 
+         query_feature_version = f"""
+         SEL PROCESS_VERSION FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW}
+         WHERE view_name = '{view_name}'
+         """
+         feature_version = tdml.execute_sql(query_feature_version).fetchall()[0][0]
 
-     query_process_id = f"SEL PROCESS_ID FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW} WHERE view_name = '{view_name}'"
-     process_id = tdml.execute_sql(query_process_id).fetchall()
-     if len(process_id)>0:
-         process_id = process_id[0][0]
-         query_feature_version = f"SEL PROCESS_VERSION FROM {tdfs4ds.SCHEMA}.{tdfs4ds.PROCESS_CATALOG_NAME_VIEW} WHERE view_name = '{view_name}'"
-         feature_version = tdml.execute_sql(query_feature_version).fetchall()[0][0]
-         query_primary_index = f"SEL FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} WHERE process_id = '{process_id}'"
-         query_primary_index_res = tdml.execute_sql(query_primary_index).fetchall()
-         if len(query_primary_index_res)>0:
-             FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING = tdml.execute_sql(query_primary_index).fetchall()[0]
+         query_primary_index = f"""
+         SEL FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING
+         FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME}
+         WHERE process_id = '{process_id}'
+         """
+         dist_res = tdml.execute_sql(query_primary_index).fetchall()
+         if dist_res:
+             FOR_PRIMARY_INDEX, FOR_DATA_PARTITIONING = dist_res[0]
          else:
-             raise ValueError(f"""
-             There is not information on primary index and partitioning for process: {process_id}.
-             The working date is: {validtime_statement}
-             The content of the distribution table is:
-             {print(tdml.DataFrame.from_query(f"SEL * FROM {tdfs4ds.SCHEMA}.{tdfs4ds.DATA_DISTRIBUTION_NAME} WHERE process_id = '{process_id}'"))}
-             """)
+             logger_safe(
+                 "error",
+                 "Missing data distribution info for existing process %s. Check distribution table.",
+                 process_id
+             )
+             raise ValueError("Missing distribution info.")
      else:
-         # Generating a unique process identifier
          process_id = str(uuid.uuid4())
          feature_version = 1
          FOR_PRIMARY_INDEX = ",".join(primary_index)
          FOR_DATA_PARTITIONING = partitioning
+         logger_safe("info", "Generated new process_id=%s", process_id)
 
-     # Create a comma-separated string of entity IDs
-     entity_id_list = list(entity_id.keys())
-     entity_id_list.sort()
-     ENTITY_ID__ = ','.join([k for k in entity_id_list])
+     # Build entity_id string
+     ENTITY_ID__ = ','.join(sorted(entity_id.keys()))
+     logger_safe("debug", "Entity IDs: %s", ENTITY_ID__)
+     logger_safe("debug", "Feature names: %s", feature_names)
 
-     print('feature_version :',feature_version)
-     print('int(feature_version) :', int(feature_version))
      if tdfs4ds.FEATURE_STORE_TIME == None:
 
 
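
The two removed if/else blocks collapse into conditional expressions; the resulting values can be checked in isolation. A standalone sketch of the new logic, with the settings below invented for illustration (stand-ins for tdfs4ds.END_PERIOD and tdfs4ds.FEATURE_STORE_TIME):

END_PERIOD = 'UNTIL_CHANGED'
FEATURE_STORE_TIME = None

end_period_ = '9999-01-01 00:00:00' if END_PERIOD == 'UNTIL_CHANGED' else END_PERIOD
validtime_statement = (
    'CURRENT VALIDTIME'
    if FEATURE_STORE_TIME is None
    else f"VALIDTIME PERIOD '({FEATURE_STORE_TIME},{end_period_})'"
)
assert validtime_statement == 'CURRENT VALIDTIME'

FEATURE_STORE_TIME = '2024-01-01 00:00:00'   # hypothetical pinned time
validtime_statement = (
    'CURRENT VALIDTIME'
    if FEATURE_STORE_TIME is None
    else f"VALIDTIME PERIOD '({FEATURE_STORE_TIME},{end_period_})'"
)
assert validtime_statement == "VALIDTIME PERIOD '(2024-01-01 00:00:00,9999-01-01 00:00:00)'"
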
@@ -402,16 +414,16 @@ def _register_process_view_merge(view_name, entity_id, feature_names, metadata={
      """
 
 
-     # Logging the process registration
-     print(f'register process with id : {process_id}')
-     print(f"to run the process again just type : run(process_id='{process_id}')")
-     print(f"to update your dataset : dataset = run(process_id='{process_id}',return_dataset=True)")
+     logger_safe("info", "Process registered: process_id=%s", process_id)
+     logger_safe("info", "To rerun: run(process_id='%s')", process_id)
+     logger_safe("info", "To build dataset: dataset = run(process_id='%s', return_dataset=True)", process_id)
 
-     #print('query_insert_dist', query_upsert_dist)
+     # Return queries
      if kwargs.get('with_process_id'):
          return query_upsert, process_id, query_upsert_dist, query_upsert_filtermanager
      else:
          return query_upsert, query_upsert_dist, query_upsert_filtermanager
+
  @execute_query_wrapper
  def _register_process_view_update_insert(view_name, entity_id, feature_names, metadata={}, entity_null_substitute={}, **kwargs):
      """
@@ -208,7 +208,7 @@ def process_store_catalog_creation(if_exists='replace', comment='this table is a
      (
          PROCESS_ID VARCHAR(36) NOT NULL,
          FOR_PRIMARY_INDEX VARCHAR(2048),
-         FOR_DATA_PARTITIONING VARCHAR(2048),
+         FOR_DATA_PARTITIONING VARCHAR(32000),
          ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
          ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
          PERIOD FOR ValidPeriod (ValidStart, ValidEnd) AS VALIDTIME
@@ -227,7 +227,7 @@ def process_store_catalog_creation(if_exists='replace', comment='this table is a
      (
          PROCESS_ID VARCHAR(36) NOT NULL,
          FOR_PRIMARY_INDEX VARCHAR(2048),
-         FOR_DATA_PARTITIONING VARCHAR(2048)
+         FOR_DATA_PARTITIONING VARCHAR(32000)
      )
      PRIMARY INDEX (PROCESS_ID);
      """