tdfs4ds 0.2.4.41__py3-none-any.whl → 0.2.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/process_store.py DELETED
@@ -1,387 +0,0 @@
1
- import teradataml as tdml
2
- from tdfs4ds import feature_store
3
- from tdfs4ds.utils import execute_query_wrapper,execute_query
4
-
5
-
6
- import uuid
7
- import json
8
-
9
- process_catalog_name = 'FS_PROCESS_CATALOG'
10
-
11
def process_store_catalog_creation(if_exists='replace', comment='this table is a process catalog'):
    """
    Create the process catalog table in the feature store schema.

    The table is created as ``{feature_store.schema}.{process_catalog_name}``.
    It is a valid-time temporal table with one row per registered process
    (id, type, view name, entity id, feature names, version, data domain,
    metadata). A table comment is attached and a secondary index on
    PROCESS_TYPE is created afterwards.

    Errors are reported on stdout rather than raised (best-effort behaviour).

    Parameters:
    - if_exists (optional): Behaviour when the table already exists. With the
      default 'replace', the existing table is dropped and re-created. Any
      other value leaves the existing table untouched.
    - comment (optional): Text attached to the table with COMMENT ON TABLE.

    Returns:
    The name of the catalog table (``process_catalog_name``, without schema).
    """

    table = f"{feature_store.schema}.{process_catalog_name}"

    # SQL query to create the catalog table
    query = f"""
    CREATE MULTISET TABLE {table},
            FALLBACK,
            NO BEFORE JOURNAL,
            NO AFTER JOURNAL,
            CHECKSUM = DEFAULT,
            DEFAULT MERGEBLOCKRATIO,
            MAP = TD_MAP1
            (

                PROCESS_ID VARCHAR(36) NOT NULL,
                PROCESS_TYPE VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
                VIEW_NAME   VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
                ENTITY_ID JSON(32000),
                FEATURE_NAMES VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
                FEATURE_VERSION VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC,
                DATA_DOMAIN VARCHAR(255) CHARACTER SET LATIN NOT CASESPECIFIC NOT NULL,
                METADATA JSON(32000),
                ValidStart TIMESTAMP(0) WITH TIME ZONE NOT NULL,
                ValidEnd TIMESTAMP(0) WITH TIME ZONE NOT NULL,
                PERIOD FOR ValidPeriod  (ValidStart, ValidEnd) AS VALIDTIME
            )
            PRIMARY INDEX (PROCESS_ID);
    """

    # SQL query to create a secondary index on the process type
    query2 = f"CREATE INDEX (PROCESS_TYPE) ON {table};"

    # SQL query to comment the table; apostrophes are doubled so that a
    # comment containing a quote does not terminate the SQL literal early
    escaped_comment = str(comment).replace("'", "''")
    query3 = f"COMMENT ON TABLE {table} IS '{escaped_comment}'"

    try:
        # Attempt to execute the create table query
        execute_query(query)
        if tdml.display.print_sqlmr_query:
            print(query)
        if feature_store.display_logs:
            print(f'TABLE {table} has been created')
        execute_query(query3)
    except Exception as create_error:
        # Only the first line of the database error message is inspected/reported
        first_line = str(create_error).split('\n')[0]
        if feature_store.display_logs:
            print(first_line)
        # If the table already exists and if_exists is 'replace', drop and recreate it
        if first_line.endswith('already exists.') and (if_exists == 'replace'):
            execute_query(f'DROP TABLE {table}')
            print(f'TABLE {table} has been dropped')
            try:
                # Attempt to recreate the table after dropping it
                execute_query(query)
                if feature_store.display_logs:
                    print(f'TABLE {table} has been re-created')
                if tdml.display.print_sqlmr_query:
                    print(query)
                execute_query(query3)
            except Exception as recreate_error:
                print(str(recreate_error).split('\n')[0])

    try:
        # Attempt to create the secondary index
        execute_query(query2)
        if tdml.display.print_sqlmr_query:
            # Echo the statement that was actually executed (the index DDL)
            print(query2)
        if feature_store.display_logs:
            print(f'SECONDARY INDEX ON TABLE {table} has been created')
    except Exception as index_error:
        print(str(index_error).split('\n')[0])

    return process_catalog_name
91
-
92
@execute_query_wrapper
def register_process_view(view_name, entity_id, feature_names, metadata=None, **kwargs):
    """
    Build the SQL statement that registers a 'denormalized view' process in the
    process catalog, either as a new row or as an update of an existing one.

    The catalog is first queried for a row with the same VIEW_NAME, at the
    current valid time or as of ``feature_store.date_in_the_past`` when set.
    If none exists, an INSERT with a fresh UUID and FEATURE_VERSION '1' is
    built. Otherwise an UPDATE that increments FEATURE_VERSION is built and the
    PROCESS_ID of the existing row is reused.

    NOTE(review): the function is decorated with ``execute_query_wrapper``,
    which presumably executes the returned statement - confirm against the
    decorator's implementation.

    Parameters:
    view_name (str or DataFrame): The name of the view, or a teradataml
        DataFrame from which the table name is taken (``_table_name``).
    entity_id: JSON-serialisable description of the entity; stored as JSON.
    feature_names (list): Feature names; stored as a comma-separated string.
    metadata (dict, optional): Additional metadata, stored as JSON.
        Defaults to an empty dictionary.
    **kwargs: Only ``with_process_id`` is read here; when truthy the process id
        is returned alongside the query.

    Returns:
    str: The INSERT or UPDATE statement; or
    tuple: ``(statement, process_id)`` when ``with_process_id`` is truthy; or
    list: an empty list when the table name cannot be read from the DataFrame.
    """

    # Avoid a shared mutable default argument
    if metadata is None:
        metadata = {}

    # Handling the case where the view name is provided as a DataFrame
    if isinstance(view_name, tdml.dataframe.dataframe.DataFrame):
        try:
            view_name = view_name._table_name
        except Exception:
            print('create your teradata dataframe using tdml.DataFrame(<view name>). Crystallize your view if needed')
            return []

    # Generating a unique process identifier (replaced below when the view is already registered)
    process_id = str(uuid.uuid4())

    # Joining the feature names into a comma-separated string
    feature_names = ','.join(feature_names)

    catalog = f"{feature_store.schema}.{process_catalog_name}"

    # JSON payloads are embedded in SQL string literals: double any apostrophe
    # (SQL escaping) instead of rewriting it, which would corrupt the JSON text
    entity_id_json = json.dumps(entity_id).replace("'", "''")
    metadata_json = json.dumps(metadata).replace("'", "''")

    # Setting the end period for the view
    if feature_store.end_period == 'UNTIL_CHANGED':
        end_period_ = '9999-01-01 00:00:00'
    else:
        end_period_ = feature_store.end_period

    # Temporal qualifiers: one for reading the catalog, one for updating it
    past_date = feature_store.date_in_the_past
    if past_date is None:
        select_qualifier = 'CURRENT VALIDTIME'
        update_qualifier = 'CURRENT VALIDTIME'
    else:
        select_qualifier = f"VALIDTIME AS OF TIMESTAMP '{past_date}'"
        update_qualifier = f"VALIDTIME PERIOD '({past_date},{end_period_})'"

    # Checking if the view already exists in the process catalog
    existing = tdml.DataFrame.from_query(
        f"{select_qualifier} SEL * FROM {catalog} WHERE view_name = '{view_name}'"
    )

    if existing.shape[0] == 0:
        if past_date is None:
            # New view, current valid time: the validity period is set by the database
            query_insert = f"""
            CURRENT VALIDTIME INSERT INTO {catalog} (PROCESS_ID, PROCESS_TYPE, VIEW_NAME, ENTITY_ID, FEATURE_NAMES, FEATURE_VERSION, METADATA, DATA_DOMAIN)
                VALUES ('{process_id}',
                'denormalized view',
                '{view_name}',
                '{entity_id_json}',
                '{feature_names}',
                '1',
                '{metadata_json}',
                '{feature_store.data_domain}'
                )
            """
        else:
            # New view with a past date: the validity period is given explicitly
            query_insert = f"""
            INSERT INTO {catalog} (PROCESS_ID, PROCESS_TYPE, VIEW_NAME, ENTITY_ID, FEATURE_NAMES, FEATURE_VERSION, METADATA, DATA_DOMAIN,ValidStart, ValidEnd)
                VALUES ('{process_id}',
                'denormalized view',
                '{view_name}',
                '{entity_id_json}',
                '{feature_names}',
                '1',
                '{metadata_json}',
                '{feature_store.data_domain}',
                TIMESTAMP '{past_date}',
                TIMESTAMP '{end_period_}'
                )
            """
    else:
        # Existing view: update its description and increment the version
        query_insert = f"""{update_qualifier}
            UPDATE {catalog}
            SET
                PROCESS_TYPE = 'denormalized view'
            ,   ENTITY_ID = '{entity_id_json}'
            ,   FEATURE_NAMES = '{feature_names}'
            ,   FEATURE_VERSION = CAST((CAST(FEATURE_VERSION AS INTEGER) +1) AS VARCHAR(4))
            ,   METADATA = '{metadata_json}'
            ,   DATA_DOMAIN = '{feature_store.data_domain}'
            WHERE VIEW_NAME = '{view_name}'
            """
        # Reuse the identifier of the already registered process
        process_id = tdml.DataFrame.from_query(
            f"{select_qualifier} SEL PROCESS_ID FROM {catalog} WHERE VIEW_NAME = '{view_name}'"
        ).to_pandas().PROCESS_ID.values[0]

    # Logging the process registration; the id is quoted so the hints can be pasted as-is
    print(f'register process with id : {process_id}')
    print(f"to run the process again just type : run(process_id='{process_id}')")
    print(f"to update your dataset : dataset = run(process_id='{process_id}',return_dataset=True)")

    if kwargs.get('with_process_id'):
        return query_insert, process_id
    return query_insert
216
-
217
@execute_query_wrapper
def register_process_tdstone(model, metadata=None):
    """
    Build the INSERT statement that registers a 'tdstone2 view' process in the
    process catalog.

    When ``feature_store.date_in_the_past`` is None the row is inserted at the
    current valid time. Otherwise the validity period is written explicitly,
    from that date to the configured end period ('9999-01-01 00:00:00' when
    ``feature_store.end_period`` is 'UNTIL_CHANGED').

    NOTE(review): the function is decorated with ``execute_query_wrapper``,
    which presumably executes the returned statement - confirm against the
    decorator's implementation.

    Parameters:
    model (Model Object): Object exposing ``mapper_scoring.id_row`` (stored in
        ENTITY_ID) and ``id`` (stored in FEATURE_VERSION).
    metadata (dict, optional): Additional metadata, stored as JSON.
        Defaults to an empty dictionary.

    Returns:
    str: The INSERT statement for the process catalog.
    """

    # Avoid a shared mutable default argument
    if metadata is None:
        metadata = {}

    # Generating a unique process identifier
    process_id = str(uuid.uuid4())

    catalog = f"{feature_store.schema}.{process_catalog_name}"

    # The JSON payload is embedded in a SQL string literal: double any
    # apostrophe (SQL escaping) instead of rewriting it, which would corrupt the JSON
    metadata_json = json.dumps(metadata).replace("'", "''")

    if feature_store.date_in_the_past is None:
        # Insertion at the current valid time
        query_insert = f"""
        CURRENT VALIDTIME INSERT INTO {catalog} (PROCESS_ID, PROCESS_TYPE, ENTITY_ID, FEATURE_VERSION, METADATA, DATA_DOMAIN)
            VALUES ('{process_id}',
            'tdstone2 view',
            '{model.mapper_scoring.id_row}',
            '{model.id}',
            '{metadata_json}',
            '{feature_store.data_domain}'
            )
        """
    else:
        # Determining the end period based on feature store configuration
        if feature_store.end_period == 'UNTIL_CHANGED':
            end_period_ = '9999-01-01 00:00:00'
        else:
            end_period_ = feature_store.end_period

        # Insertion with an explicit validity period starting at the past date
        query_insert = f"""
        INSERT INTO {catalog} (PROCESS_ID, PROCESS_TYPE, ENTITY_ID, FEATURE_VERSION, METADATA, DATA_DOMAIN, ValidStart, ValidEnd)
            VALUES ('{process_id}',
            'tdstone2 view',
            '{model.mapper_scoring.id_row}',
            '{model.id}',
            '{metadata_json}',
            '{feature_store.data_domain}',
            TIMESTAMP '{feature_store.date_in_the_past}',
            TIMESTAMP '{end_period_}')
        """

    # Logging the process registration
    print(f'register process with id : {process_id}')

    return query_insert
268
-
269
-
270
def list_processes():
    """
    Return the processes currently registered in the process catalog.

    The catalog table ``{feature_store.schema}.{process_catalog_name}`` is read
    at the current valid time. FEATURE_VERSION is exposed under the name
    PROCESS_VERSION.

    Returns:
    DataFrame: A teradataml DataFrame built from the query, or None when the
    DataFrame could not be created (the error and the query are printed).
    """

    catalog_table = f"{feature_store.schema}.{process_catalog_name}"

    # Statement listing every column of interest of the catalog
    sql = f"""
    CURRENT VALIDTIME
    SELECT
        PROCESS_ID ,
        PROCESS_TYPE ,
        VIEW_NAME ,
        ENTITY_ID ,
        FEATURE_NAMES ,
        FEATURE_VERSION AS PROCESS_VERSION,
        DATA_DOMAIN,
        METADATA
    FROM {catalog_table}
    """

    # Echo the statement when teradataml is configured to display queries
    if tdml.display.print_sqlmr_query:
        print(sql)

    try:
        processes = tdml.DataFrame.from_query(sql)
    except Exception as err:
        # Best effort: report the failure and the offending statement
        print(str(err))
        print(sql)
        return None
    return processes
305
-
306
-
307
def run(process_id, return_dataset=False):
    """
    Execute the process registered in the process catalog under ``process_id``.

    The process row is read at the current valid time, or as of
    ``feature_store.date_in_the_past`` when that is set. As a side effect,
    ``feature_store.data_domain`` is overwritten with the DATA_DOMAIN of the
    process.

    For a 'denormalized view' process, the view is loaded and its features are
    uploaded through ``feature_store._upload_features``, using ``process_id``
    as the feature version. The 'tdstone2 view' process type is not implemented
    yet and only prints a message.

    Args:
        process_id (str): The unique identifier of the process to run.
        return_dataset (bool, optional): When True, return the result of the
            feature upload. Defaults to False.

    Returns:
        The value returned by ``feature_store._upload_features`` when
        ``return_dataset`` is True and the process is a 'denormalized view';
        None otherwise, including when the process id does not match exactly
        one catalog row.
    """

    if feature_store.date_in_the_past is None:
        validtime_statement = 'CURRENT VALIDTIME'
    else:
        validtime_statement = f"VALIDTIME AS OF TIMESTAMP '{feature_store.date_in_the_past}'"

    # Construct SQL query to retrieve process details by process ID
    query = f"""
    {validtime_statement}
    SEL * FROM {feature_store.schema}.{process_catalog_name}
    WHERE PROCESS_ID = '{process_id}'
    """

    # Executing the query and converting the result to Pandas DataFrame
    df = tdml.DataFrame.from_query(query).to_pandas()

    # Check if exactly one record is returned, else print an error
    if df.shape[0] != 1:
        print('error - there is ', df.shape[0], f' records. Check table {feature_store.schema}.{process_catalog_name}')
        return None

    # Fetching the process type from the query result
    process_type = df['PROCESS_TYPE'].values[0]

    # Fetching the data domain from the query result (module-level side effect)
    feature_store.data_domain = df['DATA_DOMAIN'].values[0]

    # Defined up-front so that process types producing no dataset return None
    # instead of failing on an unbound local when return_dataset is True
    dataset = None

    # Handling 'denormalized view' process type
    if process_type == 'denormalized view':
        # Extracting necessary details for this process type
        view_name = df['VIEW_NAME'].values[0]
        # NOTE(review): eval() executes whatever text is stored in ENTITY_ID.
        # The value is written with json.dumps at registration time, so
        # json.loads looks like the safer inverse - confirm the stored format
        # before switching.
        entity_id = eval(df['ENTITY_ID'].values[0])
        feature_names = df['FEATURE_NAMES'].values[0].split(',')

        # VIEW_NAME is expected as '<database>.<view>'
        name_parts = view_name.split('.')

        # Fetching data and uploading features to the feature store
        df_data = tdml.DataFrame(tdml.in_schema(name_parts[0], name_parts[1]))

        dataset = feature_store._upload_features(
            df_data,
            entity_id,
            feature_names,
            feature_versions=process_id)

    # Handling 'tdstone2 view' process type
    elif process_type == 'tdstone2 view':
        print('not implemented yet')

    return dataset if return_dataset else None
370
-
371
@execute_query_wrapper
def remove_process(process_id):
    """
    Build the DELETE statement that removes a process from the process catalog.

    NOTE(review): the function is decorated with ``execute_query_wrapper``,
    which presumably executes the returned statement - confirm against the
    decorator's implementation.

    Args:
        process_id (str): The unique identifier of the process to be removed.

    Returns:
        str: SQL statement deleting the rows of the process catalog whose
        process_id equals the given identifier.
    """

    # Fully qualified name of the catalog table
    catalog_table = f"{feature_store.schema}.{process_catalog_name}"

    return f"DELETE FROM {catalog_table} WHERE process_id = '{process_id}'"