tdfs4ds 0.2.4.47__py3-none-any.whl → 0.2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tdfs4ds/utils.py DELETED
@@ -1,579 +0,0 @@
1
- import functools
2
- import teradataml as tdml
3
- import re
4
- import pandas as pd
5
- import sqlparse
6
- import plotly.graph_objects as go
7
- import os
8
- from packaging import version
9
- import datetime
10
-
11
def is_version_greater_than(tested_version, base_version="17.20.00.03"):
    """
    Tell whether ``tested_version`` is strictly newer than ``base_version``.

    Args:
        tested_version (str): Version number to be tested.
        base_version (str, optional): Reference version to compare against.
            Defaults to "17.20.00.03".

    Returns:
        bool: True if the tested version is strictly greater, False otherwise.
    """
    parsed_tested = version.parse(tested_version)
    parsed_base = version.parse(base_version)
    return parsed_tested > parsed_base
23
def execute_query_wrapper(f):
    """
    Decorator that executes the SQL query (or list of queries) returned by
    the wrapped function.

    The wrapped function is expected to *build* SQL; this decorator then runs
    it through teradataml, selecting the execution API that matches the
    installed teradataml version. Execution errors are caught and reported
    (first line of the error message plus the offending query) instead of
    being raised, mirroring ``execute_query``.

    Args:
        f (function): Function returning a SQL string or a list of SQL strings.

    Returns:
        function: Decorated function. It always returns None.
    """
    @functools.wraps(f)
    def wrapped_f(*args, **kwargs):
        query = f(*args, **kwargs)

        # Pick the execution entry point supported by this teradataml version:
        # execute_sql() exists only after 17.20.00.03.
        if is_version_greater_than(tdml.__version__, base_version="17.20.00.03"):
            run = tdml.execute_sql
        else:
            run = tdml.get_context().execute

        # Normalize to a list so single queries and batches share one code path
        # (the original duplicated the try/except four times).
        queries = query if isinstance(query, list) else [query]
        for q in queries:
            try:
                run(q)
            except Exception as e:
                # Report only the first line of the (often very long) error.
                print(str(e).split('\n')[0])
                print(q)
        return

    return wrapped_f
67
-
68
-
69
def execute_query(query):
    """
    Execute a SQL query (or a list of SQL queries) with teradataml.

    The execution API is chosen according to the installed teradataml
    version. Execution errors are caught and reported (first line of the
    error message plus the offending query) instead of being raised.

    Args:
        query (str or list): A SQL statement, or a list of SQL statements.

    Returns:
        The result of the execution for a single query, or None for a list
        of queries (matching the historical behavior of this helper).
    """
    # execute_sql() exists only after teradataml 17.20.00.03.
    if is_version_greater_than(tdml.__version__, base_version="17.20.00.03"):
        run = tdml.execute_sql
    else:
        run = tdml.get_context().execute

    if isinstance(query, list):
        # Batch mode: run everything, report failures, return nothing.
        for q in query:
            try:
                run(q)
            except Exception as e:
                print(str(e).split('\n')[0])
                print(q)
        return

    try:
        return run(query)
    except Exception as e:
        print(str(e).split('\n')[0])
        print(query)
    return
99
-
100
-
101
- def _analyze_sql_query(sql_query):
102
- """
103
- Analyze a SQL query to extract the source tables, target tables, and views.
104
-
105
- The function uses regular expressions to search for patterns indicative
106
- of source tables, target tables, and views in the given SQL query.
107
-
108
- :param sql_query: str
109
- A string containing a SQL query to be analyzed.
110
-
111
- :return: dict
112
- A dictionary containing lists of source tables, target tables, and views.
113
- Format: {
114
- 'source': [source_tables],
115
- 'target': [target_tables]
116
- }
117
- """
118
-
119
- def find_in_with_statement(sql_text):
120
- """
121
- Extracts terms from a SQL text that are followed by 'AS ('.
122
-
123
- Args:
124
- sql_text (str): The SQL text to be searched.
125
-
126
- Returns:
127
- list: A list of terms that are followed by 'AS ('
128
- """
129
- # Regex pattern to find ', term AS ('
130
- # It looks for a comma, optional whitespace, captures a word (term), followed by optional whitespace, 'AS', whitespace, and an opening parenthesis
131
- pattern = r'WITH\s*(\w+)\s+AS\s+\('
132
-
133
- # Find all occurrences of the pattern
134
- terms = re.findall(pattern, sql_text, re.IGNORECASE)
135
-
136
- pattern = r',\s*(\w+)\s+AS\s+\('
137
-
138
- # Find all occurrences of the pattern
139
- terms = terms + re.findall(pattern, sql_text, re.IGNORECASE)
140
-
141
- terms = [t.split(' ')[0] for t in terms]
142
- return terms
143
-
144
- # Regular expression patterns for different SQL components
145
- create_table_pattern = r'CREATE\s+TABLE\s+([\w\s\.\"]+?)\s+AS'
146
- insert_into_pattern = r'INSERT\s+INTO\s+([\w\s\.\"]+?)'
147
- create_view_pattern = r'(CREATE|REPLACE)\s+VIEW\s+([\w\s\.\"]+?)\s+AS'
148
- #select_pattern = r'(FROM|JOIN|LEFT\sJOIN|RIGHT\sJOIN)\s+([\w\s\.\"]+?)(?=\s*(,|\s+GROUP|$|WHERE|PIVOT|UNPIVOT|UNION|ON|\)|\s+AS))'
149
- select_pattern = r'(FROM|JOIN|LEFT\s+JOIN|RIGHT\s+JOIN)\s+([\w\s\.\"]+?)(?=\s*(,|\bGROUP\b|\bWHERE\b|\bPIVOT\b|\bUNPIVOT\b|\bUNION\b|\bON\b|\bAS\b|$|\)))'
150
- # select_pattern2 = r'(FROM|JOIN)\s+([\w\s\.\"]+?)(?=\s*(,|group|$|where|pivot|unpivot|\)|AS))'
151
-
152
- # Find all matches in the SQL query for each pattern
153
- create_table_matches = re.findall(create_table_pattern, sql_query, re.IGNORECASE)
154
- insert_into_matches = re.findall(insert_into_pattern, sql_query, re.IGNORECASE)
155
- create_view_matches = re.findall(create_view_pattern, sql_query, re.IGNORECASE)
156
- select_matches = re.findall(select_pattern, sql_query, re.IGNORECASE)
157
-
158
- # select_matches2 = re.findall(select_pattern2, sql_query, re.IGNORECASE)
159
- # print(select_matches2)
160
- # Extract the actual table or view name from the match tuples
161
- create_table_matches = [match[0] if match[0] else match[1] for match in create_table_matches]
162
- insert_into_matches = [match[0] if match[0] else match[1] for match in insert_into_matches]
163
- create_view_matches = [match[1] if match[0] else match[1] for match in create_view_matches]
164
-
165
- with_matches = [x.lower() for x in find_in_with_statement(sql_query)]
166
- select_matches = [match[1] for match in select_matches]
167
-
168
- # select_matches2 = [match[0] for match in select_matches2]
169
-
170
- table_names = {
171
- 'source': [],
172
- 'target': []
173
- }
174
-
175
- # Categorize the matched tables and views into 'source' or 'target'
176
- table_names['target'].extend(create_table_matches)
177
- table_names['target'].extend(insert_into_matches)
178
- table_names['target'].extend(create_view_matches)
179
- table_names['source'].extend(select_matches)
180
- # table_names['source'].extend(select_matches2)
181
-
182
- # Remove duplicate table and view names
183
- table_names['source'] = list(set(table_names['source']))
184
- table_names['target'] = list(set(table_names['target']))
185
-
186
- correct_source = []
187
- for target in table_names['source']:
188
- if '"' not in target:
189
- if ' ' in target:
190
- target = target.split(' ')[0]
191
- if target.lower() not in with_matches:
192
- correct_source.append('.'.join(['"' + t + '"' for t in target.split('.')]))
193
- else:
194
- if target.lower() not in with_matches:
195
- correct_source.append(target)
196
-
197
- correct_target = []
198
- for target in table_names['target']:
199
- if '"' not in target:
200
- if ' ' in target:
201
- target = target.split(' ')[0]
202
- if target.lower() not in with_matches:
203
- correct_target.append('.'.join(['"' + t + '"' for t in target.split('.')]))
204
- else:
205
- if target.lower() not in with_matches:
206
- correct_target.append(target)
207
-
208
- table_names['source'] = correct_source
209
- table_names['target'] = correct_target
210
-
211
- return table_names
212
-
213
def analyze_sql_query(sql_query, df=None, target=None, root_name='ml__', node_info=None):
    """
    Analyzes the provided SQL query to determine source and target tables/views relationships.
    It then captures these relationships in a pandas DataFrame and recursively descends into
    source views whose name contains ``root_name``.

    :param sql_query: str
        A string containing the SQL query to be analyzed.
    :param df: pd.DataFrame, optional
        An existing DataFrame to append the relationships to. If not provided, a new DataFrame is created.
    :param target: str, optional
        Name of the target table/view. If not provided, it's deduced from the SQL query.
    :param root_name: str, optional
        Substring that marks a source object as a teradataml-generated view worth
        recursing into. If None, every source object is inspected.
    :param node_info: list of dict, optional
        Accumulated per-node metadata ({'target', 'columns', 'query'}); extended on each call.

    :return: tuple (pd.DataFrame, list)
        A DataFrame with two columns: 'source' and 'target', representing the relationships,
        and the accumulated node_info list.

    :Note: This function is specifically tailored for Teradata and makes use of teradataml (tdml) for certain operations.
    """

    # Extract source and potential target tables/views from the provided SQL query
    table_name = _analyze_sql_query(sql_query)
    # print(table_name)
    # print(sql_query)
    # print('-----')

    # Extract node informations.
    # First call with no explicit target: derive the column list by running the query.
    # NOTE(review): in this branch the node is recorded with 'target': None — presumably
    # the caller always supplies target on the first call; verify against callers.
    if node_info is None and target is None:
        node_info = [{'target': target, 'columns': tdml.DataFrame.from_query(sql_query).columns, 'query': sql_query}]
    elif node_info is None:
        # First call with an explicit target: quote it if needed and read its columns.
        if '"' not in target:
            target = '.'.join(['"' + t + '"' for t in target.split('.')])
        #print(target)
        node_info = [{'target': target, 'columns': tdml.DataFrame(target).columns, 'query': sql_query}]
    else:
        # Recursive call: append this node to the accumulated metadata.
        if '"' not in target:
            target = '.'.join(['"' + t + '"' for t in target.split('.')])
        #print(target)
        node_info = node_info + [{'target': target, 'columns': tdml.DataFrame(target).columns, 'query': sql_query}]

    # If df is not provided, initialize it; else append to the existing df.
    # Every source row of this query points at the same (current) target.
    table_name['target'] = [target] * len(table_name['source'])
    if df is None:
        df = pd.DataFrame(table_name)
    else:
        df = pd.concat([df, pd.DataFrame(table_name)], ignore_index=True)

    # Check for teradataml views in the source and recursively analyze them
    for obj in table_name['source']:
        if root_name == None or root_name.lower() in obj.lower():
            #print(obj)
            # It's a teradataml view. Fetch its definition.
            # NOTE(review): if SHOW VIEW fails, sql_query_ may be undefined (or stale
            # from a previous iteration); the resulting NameError is swallowed by the
            # bare except below and reported as a "root" object — confirm intended.
            try:
                sql_query_ = tdml.execute_sql(f"SHOW VIEW {obj}").fetchall()[0][0].replace('\r', '\n').replace('\t', '\n')
            except Exception as e:
                print(str(e).split("\n")[0])
            try:
                # Recursively analyze the view definition to get its relationships
                df, node_info = analyze_sql_query(sql_query_, df, target=obj, node_info=node_info, root_name=root_name)
            except:
                print(f"{obj} is a root, outside of the current database or a view directly connected to a table")

        else:
            # Source object does not look like a teradataml view: stop descending here.
            print(root_name.lower(), ' not in ', obj.lower(), 'then excluded')

    return df, node_info
277
def plot_graph(tddf, root_name='ml__'):
    """
    Visualizes a teradataml DataFrame's source-target view lineage using a Sankey diagram.

    :param tddf: teradataml.DataFrame
        The teradataml DataFrame whose SQL lineage is analyzed and displayed.
    :param root_name: str, optional
        Substring marking teradataml-generated views to recurse into
        (forwarded to analyze_sql_query). Defaults to 'ml__'.

    :Note: This function makes use of Plotly's Sankey diagram representation for visualization.

    :return: pd.DataFrame
        The source/target relationship DataFrame built by analyze_sql_query
        (the diagram itself is only displayed, not returned).
    """

    # Materialize the DataFrame's node so that _table_name is populated.
    tddf._DataFrame__execute_node_and_set_table_name(tddf._nodeid, tddf._metaexpr)

    # Build the lineage edges (df) and per-node metadata (node_info).
    df, node_info = analyze_sql_query(tddf.show_query(), df=None, target=tddf._table_name, root_name=root_name)

    # Drop the first row when it is a self-loop (source == target).
    if df['source'].values[0].lower() == df['target'].values[0].lower():
        df = df.iloc[1::, :]

    # Create a list of unique labels combining sources and targets from the dataframe
    labels = list(pd.concat([df['source'], df['target']]).unique())

    # Creating a mapping of node labels to additional information
    node_info_dict = pd.DataFrame(node_info).set_index('target').T.to_dict()

    # Create hovertext for each label using the node_info_map.
    # The chained .replace() calls reshape the pretty-printed SQL into
    # HTML line breaks for Plotly tooltips.
    hovertexts = [
        f"Columns:<br> {','.join(node_info_dict[label]['columns'])}<br> Query: {sqlparse.format(node_info_dict[label]['query'], reindent=True, keyword_case='upper')}".replace(
            '\n', '<br>').replace('PARTITION BY', '<br>PARTITION BY').replace('USING', '<br>USING').replace(' ON',
            '<br>ON').replace(') ',')<br>').replace(')<br>AS',') AS').replace(', ',',<br>')

        if label in node_info_dict else '' for label in labels]

    # Use the length of 'columns' for the value (thickness) of each link
    values = df['source'].apply(lambda x: len(node_info_dict[x]['columns']) if x in node_info_dict else 1)

    # Convert source and target names to indices based on their position in the labels list
    source_indices = df['source'].apply(lambda x: labels.index(x))
    target_indices = df['target'].apply(lambda x: labels.index(x))

    # Construct the Sankey diagram with nodes (sources & targets) and links (relationships)
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,  # Space between the nodes
            thickness=20,  # Node thickness
            line=dict(color="black", width=0.5),  # Node border properties
            label=labels,  # Labels for nodes
            color="blue",  # Node color
            # hovertext=link_hovertexts # set hover text for nodes
            customdata=hovertexts,
            hovertemplate=' %{customdata}<extra></extra>',
        ),
        link=dict(
            source=source_indices,  # Link sources
            target=target_indices,  # Link targets
            value=values  # [1] * len(df) # Assuming equal "flow" for each link. Can be modified if needed.
        )
    )])

    # Customize the layout, such as setting the title and font size
    fig.update_layout(title_text="Hierarchical Data Visualization", font_size=10)

    # Display the Sankey diagram
    fig.show()

    return df
345
def crystallize_view(tddf, view_name, schema_name):
    """
    Materializes a given teradataml DataFrame as a view in the database with sub-views, if needed. This function
    helps in creating nested views, where complex views are broken down into simpler sub-views to simplify debugging
    and optimization. Each sub-view is named based on the main view's name with an additional suffix.

    Parameters:
    :param tddf: teradataml.DataFrame
        The teradataml dataframe whose view needs to be materialized.
    :param view_name: str
        The name of the main view to be created.
    :param schema_name: str
        The schema in which the view should be created.

    Returns:
    :return: teradataml.DataFrame
        A teradataml DataFrame representation of the created view.

    Notes:
    This function is specific to the teradataml library, and assumes the existence of certain attributes in the input DataFrame.
    """

    # Create the _table_name attribute for the teradataml DataFrame if it doesn't exist
    tddf._DataFrame__execute_node_and_set_table_name(tddf._nodeid, tddf._metaexpr)

    # Generate the dependency graph for the input DataFrame's SQL representation
    tddf_graph, _ = analyze_sql_query(tddf.show_query(), target=tddf._table_name)

    # Generate new names for sub-views based on the main view's name and store in a mapping dictionary
    if len(tddf_graph['target'].values)>1:
        mapping = {n: schema_name + '.' + view_name + '_sub_' + str(i) for i, n in enumerate(tddf_graph['target'].values)}
    else:
        mapping = {tddf_graph['target'].values[0] : schema_name + '.' + view_name}

    # Replace or create the sub-views with their new names in the database.
    # NOTE(review): reversed(dict.items()) requires Python 3.8+; on older
    # interpreters this raises TypeError — confirm the supported runtime.
    for old_name, new_name in reversed(mapping.items()):
        query = tdml.execute_sql(f"SHOW VIEW {old_name}").fetchall()[0][0].replace('\r','\n').lower()
        query = query.replace('create', 'replace')
        for old_sub_name, new_sub_name in mapping.items():
            query = query.upper().replace(old_sub_name.upper(), new_sub_name)
        #print(query)
        print('REPLACE VIEW ', new_name)
        tdml.execute_sql(query)

    # Construct the final view by replacing the old names with new ones in the SQL representation.
    # NOTE(review): `new_name` and `query` here are leftovers from the last loop
    # iteration above — the code depends on that loop running at least once; verify.
    mapping[new_name] = view_name

    #query = tdml.execute_sql(f"SHOW VIEW {tddf._table_name}").fetchall()[0][0].replace('\r','\n').lower()
    #query = f'replace view {schema_name}.{view_name} AS \n' + query
    for old_name, new_name in mapping.items():
        query = query.upper().replace(old_name.upper(), new_name)

    # Execute the final query to create the main view
    #print(query)
    print('REPLACE VIEW ', schema_name,'.',view_name)
    tdml.execute_sql(query)


    # Return a teradataml DataFrame representation of the created view
    return tdml.DataFrame(tdml.in_schema(schema_name, view_name))
405
-
406
def display_table(df, max_widths=[25, 55, 10], header=["Feature Database", "Feature Table", "# rows"]):
    """
    Pretty-print a pandas DataFrame as a fixed-width, pipe-separated text table.

    Args:
        df (pd.DataFrame): Data to print; one table column per DataFrame column.
        max_widths (list of int, optional): Column widths (values are both
            padded and truncated to these widths).
        header (list of str, optional): Column headings.

    Returns:
        None
    """
    # One '{:<W.W}' placeholder per column: left-aligned, truncated at width W.
    column_formats = ["{:<" + str(width) + "." + str(width) + "}" for width in max_widths]
    row_format = " | ".join(column_formats)

    # Header block: blank line, headings, separator rule.
    print('\n')
    print(row_format.format(*header))
    rule_length = sum(max_widths) + 2 * (len(max_widths) - 1)  # account for separators
    print("-" * rule_length)

    # Body: one formatted line per DataFrame row.
    for _, row in df.iterrows():
        cells = [str(row[col]) for col in df.columns]
        print(row_format.format(*cells))

    print('\n')
    return
421
-
422
-
423
def get_column_types(df, columns):
    """
    Return the Teradata types of the requested columns of a teradataml DataFrame.

    VARCHAR columns are expanded into a full type specification that includes
    their character set (e.g. 'VARCHAR(255) CHARACTER SET LATIN').

    Args:
        df: teradataml DataFrame exposing _td_column_names_and_types and
            _td_column_names_and_sqlalchemy_types.
        columns (str or list): Column name(s) to look up.

    Returns:
        dict: Mapping of column name to its type string.
    """
    if type(columns) != list:
        columns = [columns]

    col_type = {name: dtype for name, dtype in df._td_column_names_and_types if name in columns}

    for name, dtype in col_type.items():
        if dtype == 'VARCHAR':
            # Pull the sqlalchemy type to recover length and character set.
            sqlalchemy_type = df._td_column_names_and_sqlalchemy_types[name.lower()]
            col_type[name] = f"{sqlalchemy_type.compile()} CHARACTER SET {sqlalchemy_type.charset}"
    return col_type
434
-
435
-
436
def get_column_types_simple(df, columns):
    """
    Return simplified Python-style types for the requested columns.

    Teradata integer types (INTEGER, BYTEINT, BIGINT) are reported as 'int'
    and FLOAT as 'float'; any other type is returned unchanged.

    Args:
        df: teradataml DataFrame exposing _td_column_names_and_types.
        columns (str or list): Column name(s) to look up.

    Returns:
        dict: Mapping of column name to its simplified type string.
    """
    if type(columns) != list:
        columns = [columns]

    col_type = {name: dtype for name, dtype in df._td_column_names_and_types if name in columns}

    # Teradata type -> simplified name; anything absent stays as-is.
    simple_names = {
        'INTEGER': 'int',
        'BYTEINT': 'int',
        'BIGINT': 'int',
        'FLOAT': 'float',
    }

    return {name: simple_names.get(dtype, dtype) for name, dtype in col_type.items()}
453
-
454
-
455
class TimeManager:
    """
    A class to manage time-related operations in a database table.

    The managed table holds a single BUSINESS_DATE column whose value can be
    created, updated and read back through the methods below.

    Attributes:
        schema_name (str): Name of the schema in the database.
        table_name (str): Name of the table in the schema.
        data_type (str): Type of the date/time data, defaults to 'DATE'.
    """

    def __init__(self, table_name, schema_name, data_type='DATE'):
        """
        Initializes the TimeManager with a table name, schema name, and optionally a data type.

        If the table doesn't exist, it creates one with a BUSINESS_DATE column of the specified data type.
        If it does exist, data_type is ignored and the existing column type is used instead.

        Args:
            table_name (str): Name of the table.
            schema_name (str): Name of the schema.
            data_type (str, optional): Type of the date/time data. Defaults to 'DATE'.
        """
        self.schema_name = schema_name
        self.table_name = table_name
        if not self._exists():
            # No table yet: create it with the requested BUSINESS_DATE type.
            self.data_type = data_type
            self._create_table()
        else:
            # Table already present: inherit the BUSINESS_DATE type from it.
            df = tdml.DataFrame(tdml.in_schema(self.schema_name, self.table_name))
            d_ = {x[0]: x[1] for x in df._td_column_names_and_types}
            self.data_type = d_['BUSINESS_DATE']

    def _create_table(self):
        """
        Creates a table in the database with a BUSINESS_DATE column and seeds
        it with the current date (DATE types) or the current time (otherwise).
        """
        query = f"""
        CREATE TABLE {self.schema_name}.{self.table_name}
        (
            BUSINESS_DATE {self.data_type}
        )
        """
        tdml.execute_sql(query)

        # Seed the single row; the literal depends on the column type.
        if 'date' in self.data_type.lower():
            query = f"""
            INSERT INTO {self.schema_name}.{self.table_name} VALUES (CURRENT_DATE)
            """
        else:
            query = f"""
            INSERT INTO {self.schema_name}.{self.table_name} VALUES (CURRENT_TIME)
            """
        tdml.execute_sql(query)

    def _exists(self):
        """
        Checks if the table exists in the database.

        Returns:
            bool: True if the table exists, False otherwise.
        """
        # Compare case-insensitively, ignoring any quoting in the listed names.
        return len([x for x in tdml.db_list_tables(schema_name=self.schema_name).TableName.values if
                    x.lower().replace('"', '') == self.table_name.lower()]) > 0

    def _drop(self):
        """
        Drops the table if it exists.
        """
        # Drop the table if it exists
        if self._exists():
            tdml.db_drop_table(schema_name=self.schema_name, table_name=self.table_name)

    def update(self, new_time=None):
        """
        Updates the BUSINESS_DATE in the table.

        Args:
            new_time (str, optional): The new time to update. If None, current date or time is used depending on the data type.
        """
        # Silently does nothing when the table does not exist.
        if self._exists():
            if new_time is None and 'date' in self.data_type.lower():
                query = f"""
                UPDATE {self.schema_name}.{self.table_name}
                SET BUSINESS_DATE = CURRENT_DATE
                """
            elif new_time is None:
                query = f"""
                UPDATE {self.schema_name}.{self.table_name}
                SET BUSINESS_DATE = CURRENT_TIME
                """
            else:
                # Explicit value: rendered as a typed literal, e.g. DATE '2024-01-01'.
                query = f"""
                UPDATE {self.schema_name}.{self.table_name}
                SET BUSINESS_DATE = {self.data_type} '{new_time}'
                """
            tdml.execute_sql(query)

    def display(self):
        """
        Displays the table.

        Returns:
            DataFrame: The table data as a teradataml DataFrame.
        """
        return tdml.DataFrame(tdml.in_schema(self.schema_name, self.table_name))

    def get_date_in_the_past(self):
        """
        Return the stored BUSINESS_DATE as a 'YYYY-MM-DD HH:MM:SS' string.

        A plain date is promoted to a datetime at midnight before formatting.

        Returns:
            str or None: Formatted timestamp, or None (with a message printed)
            when the stored value is neither a date nor a datetime.
        """
        # '9999-01-01 00:00:00'
        date_obj = self.display().to_pandas().reset_index().iloc[0,0]

        if isinstance(date_obj, datetime.datetime):
            # print("temp is a datetime.datetime object")
            datetime_obj = date_obj
        elif isinstance(date_obj, datetime.date):
            # print("temp is a datetime.date object")
            # Convert date object to a datetime object at midnight (00:00:00)
            datetime_obj = datetime.datetime.combine(date_obj, datetime.time.min)
        else:
            print("temp is neither a datetime.date nor a datetime.datetime object")
            return

        # Convert datetime object to string
        output_string = datetime_obj.strftime("%Y-%m-%d %H:%M:%S")

        return output_string