tdfs4ds 0.2.4.46__py3-none-any.whl → 0.2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tdfs4ds/__init__.py +214 -38
- tdfs4ds/feature_store/feature_data_processing.py +3 -1
- tdfs4ds/feature_store/feature_store_management.py +1 -1
- tdfs4ds/genai/__init__.py +27 -0
- tdfs4ds/genai/documentation.py +1877 -0
- tdfs4ds/process_store/process_store_catalog_management.py +77 -24
- tdfs4ds/utils/filter_management.py +21 -12
- tdfs4ds/utils/time_management.py +22 -12
- {tdfs4ds-0.2.4.46.dist-info → tdfs4ds-0.2.5.0.dist-info}/METADATA +1 -1
- {tdfs4ds-0.2.4.46.dist-info → tdfs4ds-0.2.5.0.dist-info}/RECORD +12 -18
- tdfs/__init__.py +0 -1
- tdfs/data/curves.csv +0 -5086
- tdfs/datasets.py +0 -27
- tdfs/feature_store.py +0 -723
- tdfs4ds/feature_engineering.py +0 -152
- tdfs4ds/feature_store.py +0 -1529
- tdfs4ds/process_store.py +0 -387
- tdfs4ds/utils.py +0 -579
- {tdfs4ds-0.2.4.46.dist-info → tdfs4ds-0.2.5.0.dist-info}/WHEEL +0 -0
- {tdfs4ds-0.2.4.46.dist-info → tdfs4ds-0.2.5.0.dist-info}/top_level.txt +0 -0
tdfs4ds/feature_engineering.py
DELETED
|
@@ -1,152 +0,0 @@
|
|
|
1
|
-
import teradataml as tdml
|
|
2
|
-
|
|
3
|
-
def plot_graph(tddf, root_name='ml__'):
|
|
4
|
-
"""
|
|
5
|
-
Visualizes a given dataframe's source-target relationships using a Sankey diagram.
|
|
6
|
-
|
|
7
|
-
:param df: pd.DataFrame
|
|
8
|
-
The input dataframe should have two columns: 'source' and 'target'.
|
|
9
|
-
Each row represents a relationship between a source and a target.
|
|
10
|
-
|
|
11
|
-
:Note: This function makes use of Plotly's Sankey diagram representation for visualization.
|
|
12
|
-
|
|
13
|
-
:return: None
|
|
14
|
-
The function outputs the Sankey diagram and doesn't return anything.
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
tddf._DataFrame__execute_node_and_set_table_name(tddf._nodeid, tddf._metaexpr)
|
|
18
|
-
|
|
19
|
-
df, node_info = analyze_sql_query(tddf.show_query(), df=None, target=tddf._table_name, root_name=root_name)
|
|
20
|
-
|
|
21
|
-
if df['source'].values[0].lower() == df['target'].values[0].lower():
|
|
22
|
-
df = df.iloc[1::, :]
|
|
23
|
-
|
|
24
|
-
# Create a list of unique labels combining sources and targets from the dataframe
|
|
25
|
-
labels = list(pd.concat([df['source'], df['target']]).unique())
|
|
26
|
-
|
|
27
|
-
# Creating a mapping of node labels to additional information
|
|
28
|
-
node_info_dict = pd.DataFrame(node_info).set_index('target').T.to_dict()
|
|
29
|
-
|
|
30
|
-
# Create hovertext for each label using the node_info_map
|
|
31
|
-
hovertexts = [
|
|
32
|
-
f"Columns:<br> {','.join(node_info_dict[label]['columns'])}<br> Query: {sqlparse.format(node_info_dict[label]['query'], reindent=True, keyword_case='upper')}".replace(
|
|
33
|
-
'\n', '<br>').replace('PARTITION BY', '<br>PARTITION BY').replace('USING', '<br>USING').replace(' ON',
|
|
34
|
-
'<br>ON').replace(') ',')<br>').replace(')<br>AS',') AS').replace(', ',',<br>')
|
|
35
|
-
|
|
36
|
-
if label in node_info_dict else '' for label in labels]
|
|
37
|
-
|
|
38
|
-
# Use the length of 'columns' for the value (thickness) of each link
|
|
39
|
-
values = df['source'].apply(lambda x: len(node_info_dict[x]['columns']) if x in node_info_dict else 1)
|
|
40
|
-
|
|
41
|
-
# Convert source and target names to indices based on their position in the labels list
|
|
42
|
-
source_indices = df['source'].apply(lambda x: labels.index(x))
|
|
43
|
-
target_indices = df['target'].apply(lambda x: labels.index(x))
|
|
44
|
-
|
|
45
|
-
# Construct the Sankey diagram with nodes (sources & targets) and links (relationships)
|
|
46
|
-
fig = go.Figure(data=[go.Sankey(
|
|
47
|
-
node=dict(
|
|
48
|
-
pad=15, # Space between the nodes
|
|
49
|
-
thickness=20, # Node thickness
|
|
50
|
-
line=dict(color="black", width=0.5), # Node border properties
|
|
51
|
-
label=labels, # Labels for nodes
|
|
52
|
-
color="blue", # Node color
|
|
53
|
-
# hovertext=link_hovertexts # set hover text for nodes
|
|
54
|
-
customdata=hovertexts,
|
|
55
|
-
hovertemplate=' %{customdata}<extra></extra>',
|
|
56
|
-
),
|
|
57
|
-
link=dict(
|
|
58
|
-
source=source_indices, # Link sources
|
|
59
|
-
target=target_indices, # Link targets
|
|
60
|
-
value=values # [1] * len(df) # Assuming equal "flow" for each link. Can be modified if needed.
|
|
61
|
-
)
|
|
62
|
-
)])
|
|
63
|
-
|
|
64
|
-
# Customize the layout, such as setting the title and font size
|
|
65
|
-
fig.update_layout(title_text="Hierarchical Data Visualization", font_size=10)
|
|
66
|
-
|
|
67
|
-
# Display the Sankey diagram
|
|
68
|
-
fig.show()
|
|
69
|
-
|
|
70
|
-
return df
|
|
71
|
-
def materialize_view(tddf, view_name, schema_name):
|
|
72
|
-
"""
|
|
73
|
-
Materializes a given teradataml DataFrame as a view in the database with sub-views, if needed. This function
|
|
74
|
-
helps in creating nested views, where complex views are broken down into simpler sub-views to simplify debugging
|
|
75
|
-
and optimization. Each sub-view is named based on the main view's name with an additional suffix.
|
|
76
|
-
|
|
77
|
-
Parameters:
|
|
78
|
-
:param tddf: teradataml.DataFrame
|
|
79
|
-
The teradataml dataframe whose view needs to be materialized.
|
|
80
|
-
:param view_name: str
|
|
81
|
-
The name of the main view to be created.
|
|
82
|
-
:param schema_name: str
|
|
83
|
-
The schema in which the view should be created.
|
|
84
|
-
|
|
85
|
-
Returns:
|
|
86
|
-
:return: teradataml.DataFrame
|
|
87
|
-
A teradataml DataFrame representation of the created view.
|
|
88
|
-
|
|
89
|
-
Notes:
|
|
90
|
-
This function is specific to the teradataml library, and assumes the existence of certain attributes in the input DataFrame.
|
|
91
|
-
"""
|
|
92
|
-
|
|
93
|
-
# Create the _table_name attribute for the teradataml DataFrame if it doesn't exist
|
|
94
|
-
tddf._DataFrame__execute_node_and_set_table_name(tddf._nodeid, tddf._metaexpr)
|
|
95
|
-
|
|
96
|
-
# Generate the dependency graph for the input DataFrame's SQL representation
|
|
97
|
-
tddf_graph, _ = analyze_sql_query(tddf.show_query(), target=tddf._table_name)
|
|
98
|
-
|
|
99
|
-
# Generate new names for sub-views based on the main view's name and store in a mapping dictionary
|
|
100
|
-
mapping = {n: schema_name + '.' + view_name + '_sub_' + str(i) for i, n in enumerate(tddf_graph['target'].values)}
|
|
101
|
-
|
|
102
|
-
# Replace or create the sub-views with their new names in the database
|
|
103
|
-
for old_name, new_name in reversed(mapping.items()):
|
|
104
|
-
query = tdml.execute_sql(f"SHOW VIEW {old_name}").fetchall()[0][0].replace('\r','\n').lower()
|
|
105
|
-
query = query.replace('create', 'replace')
|
|
106
|
-
for old_sub_name, new_sub_name in mapping.items():
|
|
107
|
-
query = query.replace(old_sub_name.lower(), new_sub_name.lower())
|
|
108
|
-
#print(query)
|
|
109
|
-
print('REPLACE VIEW ', new_name)
|
|
110
|
-
tdml.execute_sql(query)
|
|
111
|
-
|
|
112
|
-
# Construct the final view by replacing the old names with new ones in the SQL representation
|
|
113
|
-
mapping[new_name] = view_name
|
|
114
|
-
#query = tdml.execute_sql(f"SHOW VIEW {tddf._table_name}").fetchall()[0][0].replace('\r','\n').lower()
|
|
115
|
-
#query = f'replace view {schema_name}.{view_name} AS \n' + query
|
|
116
|
-
for old_name, new_name in mapping.items():
|
|
117
|
-
query = query.replace(old_name.lower(), new_name.lower())
|
|
118
|
-
|
|
119
|
-
# Execute the final query to create the main view
|
|
120
|
-
#print(query)
|
|
121
|
-
print('REPLACE VIEW ', view_name)
|
|
122
|
-
tdml.execute_sql(query)
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
# Return a teradataml DataFrame representation of the created view
|
|
126
|
-
return tdml.DataFrame(tdml.in_schema(schema_name, view_name))
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
def crystallize_view(tddf, view_name, schema_name):
|
|
130
|
-
|
|
131
|
-
"""
|
|
132
|
-
Crystallizes a given teradataml DataFrame as a view in the database with sub-views, if needed. This function
|
|
133
|
-
helps in creating nested views, where complex views are broken down into simpler sub-views to simplify debugging
|
|
134
|
-
and optimization. Each sub-view is named based on the main view's name with an additional suffix.
|
|
135
|
-
|
|
136
|
-
Parameters:
|
|
137
|
-
:param tddf: teradataml.DataFrame
|
|
138
|
-
The teradataml dataframe whose view needs to be materialized.
|
|
139
|
-
:param view_name: str
|
|
140
|
-
The name of the main view to be created.
|
|
141
|
-
:param schema_name: str
|
|
142
|
-
The schema in which the view should be created.
|
|
143
|
-
|
|
144
|
-
Returns:
|
|
145
|
-
:return: teradataml.DataFrame
|
|
146
|
-
A teradataml DataFrame representation of the created view.
|
|
147
|
-
|
|
148
|
-
Notes:
|
|
149
|
-
This function is specific to the teradataml library, and assumes the existence of certain attributes in the input DataFrame.
|
|
150
|
-
"""
|
|
151
|
-
|
|
152
|
-
return materialize_view(tddf, view_name, schema_name)
|