semantic_link_labs-0.4.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of semantic-link-labs might be problematic.
- semantic_link_labs-0.4.1.dist-info/LICENSE +21 -0
- semantic_link_labs-0.4.1.dist-info/METADATA +22 -0
- semantic_link_labs-0.4.1.dist-info/RECORD +52 -0
- semantic_link_labs-0.4.1.dist-info/WHEEL +5 -0
- semantic_link_labs-0.4.1.dist-info/top_level.txt +1 -0
- sempy_labs/__init__.py +154 -0
- sempy_labs/_ai.py +496 -0
- sempy_labs/_clear_cache.py +39 -0
- sempy_labs/_connections.py +234 -0
- sempy_labs/_dax.py +70 -0
- sempy_labs/_generate_semantic_model.py +280 -0
- sempy_labs/_helper_functions.py +506 -0
- sempy_labs/_icons.py +4 -0
- sempy_labs/_list_functions.py +1372 -0
- sempy_labs/_model_auto_build.py +143 -0
- sempy_labs/_model_bpa.py +1354 -0
- sempy_labs/_model_dependencies.py +341 -0
- sempy_labs/_one_lake_integration.py +155 -0
- sempy_labs/_query_scale_out.py +447 -0
- sempy_labs/_refresh_semantic_model.py +184 -0
- sempy_labs/_tom.py +3766 -0
- sempy_labs/_translations.py +378 -0
- sempy_labs/_vertipaq.py +893 -0
- sempy_labs/directlake/__init__.py +45 -0
- sempy_labs/directlake/_directlake_schema_compare.py +110 -0
- sempy_labs/directlake/_directlake_schema_sync.py +128 -0
- sempy_labs/directlake/_fallback.py +62 -0
- sempy_labs/directlake/_get_directlake_lakehouse.py +69 -0
- sempy_labs/directlake/_get_shared_expression.py +59 -0
- sempy_labs/directlake/_guardrails.py +84 -0
- sempy_labs/directlake/_list_directlake_model_calc_tables.py +54 -0
- sempy_labs/directlake/_show_unsupported_directlake_objects.py +89 -0
- sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py +81 -0
- sempy_labs/directlake/_update_directlake_partition_entity.py +64 -0
- sempy_labs/directlake/_warm_cache.py +210 -0
- sempy_labs/lakehouse/__init__.py +24 -0
- sempy_labs/lakehouse/_get_lakehouse_columns.py +81 -0
- sempy_labs/lakehouse/_get_lakehouse_tables.py +250 -0
- sempy_labs/lakehouse/_lakehouse.py +85 -0
- sempy_labs/lakehouse/_shortcuts.py +296 -0
- sempy_labs/migration/__init__.py +29 -0
- sempy_labs/migration/_create_pqt_file.py +239 -0
- sempy_labs/migration/_migrate_calctables_to_lakehouse.py +429 -0
- sempy_labs/migration/_migrate_calctables_to_semantic_model.py +150 -0
- sempy_labs/migration/_migrate_model_objects_to_semantic_model.py +524 -0
- sempy_labs/migration/_migrate_tables_columns_to_semantic_model.py +165 -0
- sempy_labs/migration/_migration_validation.py +227 -0
- sempy_labs/migration/_refresh_calc_tables.py +129 -0
- sempy_labs/report/__init__.py +35 -0
- sempy_labs/report/_generate_report.py +253 -0
- sempy_labs/report/_report_functions.py +855 -0
- sempy_labs/report/_report_rebind.py +131 -0
sempy_labs/_vertipaq.py
ADDED
@@ -0,0 +1,893 @@
import sempy
import sempy.fabric as fabric
import pandas as pd
from IPython.display import display, HTML
import zipfile, os, shutil, datetime, warnings
from pyspark.sql import SparkSession
from sempy_labs._helper_functions import (
    format_dax_object_name,
    get_direct_lake_sql_endpoint,
    resolve_lakehouse_name,
)
from sempy_labs._list_functions import list_relationships
from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
from sempy_labs.lakehouse._lakehouse import lakehouse_attached
from typing import List, Optional, Union
from sempy._utils._log import log


@log
def vertipaq_analyzer(
    dataset: str,
    workspace: Optional[str] = None,
    export: Optional[str] = None,
    lakehouse_workspace: Optional[str] = None,
    read_stats_from_data: Optional[bool] = False,
):
    """
    Displays an HTML visualization of the Vertipaq Analyzer statistics from a semantic model.

    Parameters
    ----------
    dataset : str
        Name of the semantic model.
    workspace : str, default=None
        The Fabric workspace name in which the semantic model exists.
        Defaults to None which resolves to the workspace of the attached lakehouse
        or if no lakehouse attached, resolves to the workspace of the notebook.
    export : str, default=None
        Specifying 'zip' will export the results to a zip file in your lakehouse (which can be imported using the import_vertipaq_analyzer function.
        Specifying 'table' will export the results to delta tables (appended) in your lakehouse.
        Default value: None.
    lakehouse_workspace : str, default=None
        The Fabric workspace used by the lakehouse (for Direct Lake semantic models).
        Defaults to None which resolves to the workspace of the attached lakehouse
        or if no lakehouse attached, resolves to the workspace of the notebook.
    read_stats_from_data : bool, default=False
        Setting this parameter to true has the function get Column Cardinality and Missing Rows using DAX (Direct Lake semantic models achieve this using a Spark query to the lakehouse).

    Returns
    -------

    """

    pd.options.mode.copy_on_write = True
    warnings.filterwarnings(
        "ignore", message="createDataFrame attempted Arrow optimization*"
    )

    if workspace == None:
        workspace_id = fabric.get_workspace_id()
        workspace = fabric.resolve_workspace_name(workspace_id)

    if lakehouse_workspace == None:
        lakehouse_workspace = workspace

    dfT = fabric.list_tables(dataset=dataset, extended=True, workspace=workspace)
    dfT.rename(columns={"Name": "Table Name"}, inplace=True)
    dfC = fabric.list_columns(dataset=dataset, extended=True, workspace=workspace)
    dfC["Column Object"] = format_dax_object_name(dfC["Table Name"], dfC["Column Name"])
    dfC.rename(columns={"Column Cardinality": "Cardinality"}, inplace=True)
    dfH = fabric.list_hierarchies(dataset=dataset, extended=True, workspace=workspace)
    dfR = list_relationships(dataset=dataset, extended=True, workspace=workspace)
    dfR["From Object"] = format_dax_object_name(dfR["From Table"], dfR["From Column"])
    dfR["To Object"] = format_dax_object_name(dfR["To Table"], dfR["To Column"])
    dfP = fabric.list_partitions(dataset=dataset, extended=True, workspace=workspace)
    dfD = fabric.list_datasets(
        workspace=workspace,
        additional_xmla_properties=["CompatibilityLevel", "Model.DefaultMode"],
    )
    dfD = dfD[dfD["Dataset Name"] == dataset]
    dfD["Compatibility Level"] = dfD["Compatibility Level"].astype(int)
    isDirectLake = any(r["Mode"] == "DirectLake" for i, r in dfP.iterrows())
    dfR["Missing Rows"] = None

    # Direct Lake
    if read_stats_from_data:
        if isDirectLake:
            dfC = pd.merge(
                dfC,
                dfP[["Table Name", "Query", "Source Type"]],
                on="Table Name",
                how="left",
            )
            dfC_flt = dfC[
                (dfC["Source Type"] == "Entity")
                & (~dfC["Column Name"].str.startswith("RowNumber-"))
            ]
            sqlEndpointId = get_direct_lake_sql_endpoint(dataset, workspace)

            # Get lakehouse name from SQL Endpoint ID
            dfI = fabric.list_items(workspace=lakehouse_workspace, type="SQLEndpoint")
            dfI_filt = dfI[(dfI["Id"] == sqlEndpointId)]

            if len(dfI_filt) == 0:
                print(
                    f"The lakehouse (SQL Endpoint) used by the '{dataset}' semantic model does not reside in the '{lakehouse_workspace}' workspace. Please update the lakehouse_workspace parameter."
                )
            else:
                lakehouseName = dfI_filt["Display Name"].iloc[0]

                current_workspace_id = fabric.get_workspace_id()
                current_workspace = fabric.resolve_workspace_name(current_workspace_id)
                if current_workspace != lakehouse_workspace:
                    lakeTables = get_lakehouse_tables(
                        lakehouse=lakehouseName, workspace=lakehouse_workspace
                    )

                sql_statements = []
                spark = SparkSession.builder.getOrCreate()
                # Loop through tables
                for lakeTName in dfC_flt["Query"].unique():
                    query = "SELECT "
                    columns_in_table = dfC_flt.loc[
                        dfC_flt["Query"] == lakeTName, "Source"
                    ].unique()

                    # Loop through columns within those tables
                    for scName in columns_in_table:
                        query = query + f"COUNT(DISTINCT({scName})) AS {scName}, "

                    query = query[:-2]
                    if lakehouse_workspace == current_workspace:
                        query = query + f" FROM {lakehouseName}.{lakeTName}"
                    else:
                        lakeTables_filt = lakeTables[
                            lakeTables["Table Name"] == lakeTName
                        ]
                        tPath = lakeTables_filt["Location"].iloc[0]

                        df = spark.read.format("delta").load(tPath)
                        tempTableName = "delta_table_" + lakeTName
                        df.createOrReplaceTempView(tempTableName)
                        query = query + f" FROM {tempTableName}"
                    sql_statements.append((lakeTName, query))

                for o in sql_statements:
                    tName = o[0]
                    query = o[1]

                    df = spark.sql(query)

                    for column in df.columns:
                        x = df.collect()[0][column]
                        for i, r in dfC.iterrows():
                            if r["Query"] == tName and r["Source"] == column:
                                dfC.at[i, "Cardinality"] = x

                # Remove column added temporarily
                dfC.drop(columns=["Query", "Source Type"], inplace=True)

                # Direct Lake missing rows
                dfR = pd.merge(
                    dfR,
                    dfP[["Table Name", "Query"]],
                    left_on="From Table",
                    right_on="Table Name",
                    how="left",
                )
                dfR.rename(columns={"Query": "From Lake Table"}, inplace=True)
                dfR.drop(columns=["Table Name"], inplace=True)
                dfR = pd.merge(
                    dfR,
                    dfP[["Table Name", "Query"]],
                    left_on="To Table",
                    right_on="Table Name",
                    how="left",
                )
                dfR.rename(columns={"Query": "To Lake Table"}, inplace=True)
                dfR.drop(columns=["Table Name"], inplace=True)
                dfR = pd.merge(
                    dfR,
                    dfC[["Column Object", "Source"]],
                    left_on="From Object",
                    right_on="Column Object",
                    how="left",
                )
                dfR.rename(columns={"Source": "From Lake Column"}, inplace=True)
                dfR.drop(columns=["Column Object"], inplace=True)
                dfR = pd.merge(
                    dfR,
                    dfC[["Column Object", "Source"]],
                    left_on="To Object",
                    right_on="Column Object",
                    how="left",
                )
                dfR.rename(columns={"Source": "To Lake Column"}, inplace=True)
                dfR.drop(columns=["Column Object"], inplace=True)

                spark = SparkSession.builder.getOrCreate()
                for i, r in dfR.iterrows():
                    fromTable = r["From Lake Table"]
                    fromColumn = r["From Lake Column"]
                    toTable = r["To Lake Table"]
                    toColumn = r["To Lake Column"]

                    if lakehouse_workspace == current_workspace:
                        query = f"select count(f.{fromColumn}) as {fromColumn}\nfrom {fromTable} as f\nleft join {toTable} as c on f.{fromColumn} = c.{toColumn}\nwhere c.{toColumn} is null"
                    else:
                        tempTableFrom = "delta_table_" + fromTable
                        tempTableTo = "delta_table_" + toTable

                        query = f"select count(f.{fromColumn}) as {fromColumn}\nfrom {tempTableFrom} as f\nleft join {tempTableTo} as c on f.{fromColumn} = c.{toColumn}\nwhere c.{toColumn} is null"

                    # query = f"select count(f.{fromColumn}) as {fromColumn}\nfrom {fromTable} as f\nleft join {toTable} as c on f.{fromColumn} = c.{toColumn}\nwhere c.{toColumn} is null"

                    df = spark.sql(query)
                    missingRows = df.collect()[0][0]
                    dfR.at[i, "Missing Rows"] = missingRows

                dfR["Missing Rows"] = dfR["Missing Rows"].astype(int)
        else:
            # Calculate missing rows using DAX for non-direct lake
            for i, r in dfR.iterrows():
                fromTable = r["From Table"]
                fromColumn = r["From Column"]
                toTable = r["To Table"]
                toColumn = r["To Column"]
                isActive = bool(r["Active"])
                fromObject = format_dax_object_name(fromTable, fromColumn)
                toObject = format_dax_object_name(toTable, toColumn)

                missingRows = 0

                query = f"evaluate\nsummarizecolumns(\n\"1\",calculate(countrows('{fromTable}'),isblank({toObject}))\n)"

                if isActive == False:  # add userelationship
                    query = f"evaluate\nsummarizecolumns(\n\"1\",calculate(countrows('{fromTable}'),userelationship({fromObject},{toObject}),isblank({toObject}))\n)"

                result = fabric.evaluate_dax(
                    dataset=dataset, dax_string=query, workspace=workspace
                )

                try:
                    missingRows = result.iloc[0, 0]
                except:
                    pass

                dfR.at[i, "Missing Rows"] = missingRows
            dfR["Missing Rows"] = dfR["Missing Rows"].astype(int)

    dfTP = dfP.groupby("Table Name")["Partition Name"].count().reset_index()
    dfTP.rename(columns={"Partition Name": "Partitions"}, inplace=True)
    dfTC = dfC.groupby("Table Name")["Column Name"].count().reset_index()
    dfTC.rename(columns={"Column Name": "Columns"}, inplace=True)

    total_size = dfC["Total Size"].sum()
    table_sizes = dfC.groupby("Table Name")["Total Size"].sum().reset_index()
    table_sizes.rename(columns={"Total Size": "Table Size"}, inplace=True)

    # Columns
    dfC_filt = dfC[~dfC["Column Name"].str.startswith("RowNumber-")]
    dfC_filt["% DB"] = round((dfC_filt["Total Size"] / total_size) * 100, 2)
    dfC_filt = pd.merge(dfC_filt, table_sizes, on="Table Name", how="left")
    dfC_filt["% Table"] = round(
        (dfC_filt["Total Size"] / dfC_filt["Table Size"]) * 100, 2
    )
    columnList = [
        "Table Name",
        "Column Name",
        "Type",
        "Cardinality",
        "Total Size",
        "Data Size",
        "Dictionary Size",
        "Hierarchy Size",
        "% Table",
        "% DB",
        "Data Type",
        "Encoding",
        "Is Resident",
        "Temperature",
        "Last Accessed",
    ]

    colSize = dfC_filt[columnList].sort_values(by="Total Size", ascending=False)
    temp = dfC_filt[columnList].sort_values(by="Temperature", ascending=False)
    colSize.reset_index(drop=True, inplace=True)
    temp.reset_index(drop=True, inplace=True)

    export_Col = colSize.copy()

    intList = [
        "Cardinality",
        "Total Size",
        "Data Size",
        "Dictionary Size",
        "Hierarchy Size",
    ]
    pctList = ["% Table", "% DB"]
    colSize[intList] = colSize[intList].applymap("{:,}".format)
    temp[intList] = temp[intList].applymap("{:,}".format)
    colSize[pctList] = colSize[pctList].applymap("{:.2f}%".format)
    temp[pctList] = temp[pctList].applymap("{:.2f}%".format)

    # Tables
    intList = ["Total Size", "Data Size", "Dictionary Size", "Hierarchy Size"]
    dfCSum = dfC.groupby(["Table Name"])[intList].sum().reset_index()
    dfCSum["% DB"] = round((dfCSum["Total Size"] / total_size) * 100, 2)

    dfTable = pd.merge(
        dfT[["Table Name", "Type", "Row Count"]], dfCSum, on="Table Name", how="inner"
    )
    dfTable = pd.merge(dfTable, dfTP, on="Table Name", how="left")
    dfTable = pd.merge(dfTable, dfTC, on="Table Name", how="left")
    dfTable = dfTable.drop_duplicates()  # Drop duplicates (temporary)
    dfTable = dfTable.sort_values(by="Total Size", ascending=False)
    dfTable.reset_index(drop=True, inplace=True)
    export_Table = dfTable.copy()

    intList.extend(["Row Count", "Partitions", "Columns"])
    dfTable[intList] = dfTable[intList].applymap("{:,}".format)
    pctList = ["% DB"]
    dfTable[pctList] = dfTable[pctList].applymap("{:.2f}%".format)

    ## Relationships
    # dfR.drop(columns=['Max From Cardinality', 'Max To Cardinality'], inplace=True)
    dfR = pd.merge(
        dfR,
        dfC[["Column Object", "Cardinality"]],
        left_on="From Object",
        right_on="Column Object",
        how="left",
    )
    dfR.rename(columns={"Cardinality": "Max From Cardinality"}, inplace=True)
    dfR = pd.merge(
        dfR,
        dfC[["Column Object", "Cardinality"]],
        left_on="To Object",
        right_on="Column Object",
        how="left",
    )
    dfR.rename(columns={"Cardinality": "Max To Cardinality"}, inplace=True)
    dfR = dfR[
        [
            "From Object",
            "To Object",
            "Multiplicity",
            "Used Size",
            "Max From Cardinality",
            "Max To Cardinality",
            "Missing Rows",
        ]
    ].sort_values(by="Used Size", ascending=False)
    dfR.reset_index(drop=True, inplace=True)
    export_Rel = dfR.copy()
    intList = [
        "Used Size",
        "Max From Cardinality",
        "Max To Cardinality",
        "Missing Rows",
    ]
    if read_stats_from_data == False:
        intList.remove("Missing Rows")
    dfR[intList] = dfR[intList].applymap("{:,}".format)

    ## Partitions
    dfP = dfP[
        ["Table Name", "Partition Name", "Mode", "Record Count", "Segment Count"]
    ].sort_values(
        by="Record Count", ascending=False
    )  # , 'Records per Segment'
    dfP["Records per Segment"] = round(
        dfP["Record Count"] / dfP["Segment Count"], 2
    )  # Remove after records per segment is fixed
    dfP.reset_index(drop=True, inplace=True)
    export_Part = dfP.copy()
    intList = ["Record Count", "Segment Count", "Records per Segment"]
    dfP[intList] = dfP[intList].applymap("{:,}".format)

    ## Hierarchies
    dfH_filt = dfH[dfH["Level Ordinal"] == 0]
    dfH_filt = dfH_filt[["Table Name", "Hierarchy Name", "Used Size"]].sort_values(
        by="Used Size", ascending=False
    )
    dfH_filt.reset_index(drop=True, inplace=True)
    export_Hier = dfH_filt.copy()
    intList = ["Used Size"]
    dfH_filt[intList] = dfH_filt[intList].applymap("{:,}".format)

    ## Model
    if total_size >= 1000000000:
        y = total_size / (1024**3) * 1000000000
    elif total_size >= 1000000:
        y = total_size / (1024**2) * 1000000
    elif total_size >= 1000:
        y = total_size / (1024) * 1000
    y = round(y)

    tblCount = len(dfT)
    colCount = len(dfC_filt)
    compatLevel = dfD["Compatibility Level"].iloc[0]
    defMode = dfD["Model Default Mode"].iloc[0]

    dfModel = pd.DataFrame(
        {
            "Dataset Name": dataset,
            "Total Size": y,
            "Table Count": tblCount,
            "Column Count": colCount,
            "Compatibility Level": compatLevel,
            "Default Mode": defMode,
        },
        index=[0],
    )
    dfModel.reset_index(drop=True, inplace=True)
    export_Model = dfModel.copy()
    intList = ["Total Size", "Table Count", "Column Count"]
    dfModel[intList] = dfModel[intList].applymap("{:,}".format)

    dataFrames = {
        "dfModel": dfModel,
        "dfTable": dfTable,
        "dfP": dfP,
        "colSize": colSize,
        "temp": temp,
        "dfR": dfR,
        "dfH_filt": dfH_filt,
    }

    dfs = {}
    for fileName, df in dataFrames.items():
        dfs[fileName] = df

    visualize_vertipaq(dfs)

    ### Export vertipaq to delta tables in lakehouse
    if export in ["table", "zip"]:
        lakeAttach = lakehouse_attached()
        if lakeAttach == False:
            print(
                f"In order to save the Vertipaq Analyzer results, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook."
            )
            return

    if export == "table":
        spark = SparkSession.builder.getOrCreate()

        lakehouse_id = fabric.get_lakehouse_id()
        lakehouse = resolve_lakehouse_name(
            lakehouse_id=lakehouse_id, workspace=workspace
        )
        lakeTName = "vertipaq_analyzer_model"

        lakeT = get_lakehouse_tables(lakehouse=lakehouse, workspace=workspace)
        lakeT_filt = lakeT[lakeT["Table Name"] == lakeTName]

        query = f"SELECT MAX(RunId) FROM {lakehouse}.{lakeTName}"

        if len(lakeT_filt) == 0:
            runId = 1
        else:
            dfSpark = spark.sql(query)
            maxRunId = dfSpark.collect()[0][0]
            runId = maxRunId + 1

        dfMap = {
            "export_Col": ["Columns", export_Col],
            "export_Table": ["Tables", export_Table],
            "export_Part": ["Partitions", export_Part],
            "export_Rel": ["Relationships", export_Rel],
            "export_Hier": ["Hierarchies", export_Hier],
            "export_Model": ["Model", export_Model],
        }

        print(f"Saving Vertipaq Analyzer to delta tables in the lakehouse...\n")
        now = datetime.datetime.now()
        for key, (obj, df) in dfMap.items():
            df["Timestamp"] = now
            df["Workspace Name"] = workspace
            df["Dataset Name"] = dataset
            df["RunId"] = runId

            colName = "Workspace Name"
            df.insert(0, colName, df.pop(colName))
            colName = "Dataset Name"
            df.insert(1, colName, df.pop(colName))

            df.columns = df.columns.str.replace(" ", "_")

            delta_table_name = f"VertipaqAnalyzer_{obj}".lower()
            spark_df = spark.createDataFrame(df)
            spark_df.write.mode("append").format("delta").saveAsTable(delta_table_name)
            print(
                f"\u2022 Vertipaq Analyzer results for '{obj}' have been appended to the '{delta_table_name}' delta table."
            )

    ### Export vertipaq to zip file within the lakehouse
    if export == "zip":
        dataFrames = {
            "dfModel": dfModel,
            "dfTable": dfTable,
            "dfP": dfP,
            "colSize": colSize,
            "temp": temp,
            "dfR": dfR,
            "dfH_filt": dfH_filt,
        }

        zipFileName = f"{workspace}.{dataset}.zip"

        folderPath = "/lakehouse/default/Files"
        subFolderPath = os.path.join(folderPath, "VertipaqAnalyzer")
        ext = ".csv"
        if not os.path.exists(subFolderPath):
            os.makedirs(subFolderPath, exist_ok=True)
        zipFilePath = os.path.join(subFolderPath, zipFileName)

        # Create CSV files based on dataframes
        for fileName, df in dataFrames.items():
            filePath = os.path.join(subFolderPath, fileName + ext)
            df.to_csv(filePath, index=False)

        # Create a zip file and add CSV files to it
        with zipfile.ZipFile(zipFilePath, "w") as zipf:
            for fileName in dataFrames:
                filePath = os.path.join(subFolderPath, fileName + ext)
                zipf.write(filePath, os.path.basename(filePath))

        # Clean up: remove the individual CSV files
        for fileName, df in dataFrames.items():
            filePath = os.path.join(subFolderPath, fileName) + ext
            if os.path.exists(filePath):
                os.remove(filePath)
        print(
            f"The Vertipaq Analyzer info for the '{dataset}' semantic model in the '{workspace}' workspace has been saved to the 'Vertipaq Analyzer/{zipFileName}' in the default lakehouse attached to this notebook."
        )

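# --- Editor's note: example usage (not part of the package source) ---
# A minimal sketch of calling the function above from a Fabric notebook;
# the dataset name below is a placeholder.
#
# vertipaq_analyzer(
#     dataset="AdventureWorks",
#     workspace=None,              # defaults to the attached lakehouse / notebook workspace
#     export="zip",                # None (display only), "zip", or "table"
#     read_stats_from_data=False,
# )

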
def visualize_vertipaq(dataframes):

    # Tooltips for columns within the visual
    data = [
        {
            "ViewName": "Model",
            "ColumnName": "Dataset Name",
            "Tooltip": "The name of the semantic model",
        },
        {
            "ViewName": "Model",
            "ColumnName": "Total Size",
            "Tooltip": "The size of the model (in bytes)",
        },
        {
            "ViewName": "Model",
            "ColumnName": "Table Count",
            "Tooltip": "The number of tables in the semantic model",
        },
        {
            "ViewName": "Model",
            "ColumnName": "Column Count",
            "Tooltip": "The number of columns in the semantic model",
        },
        {
            "ViewName": "Model",
            "ColumnName": "Compatibility Level",
            "Tooltip": "The compatibility level of the semantic model",
        },
        {
            "ViewName": "Model",
            "ColumnName": "Default Mode",
            "Tooltip": "The default query mode of the semantic model",
        },
        {
            "ViewName": "Table",
            "ColumnName": "Table Name",
            "Tooltip": "The name of the table",
        },
        {"ViewName": "Table", "ColumnName": "Type", "Tooltip": "The type of table"},
        {
            "ViewName": "Table",
            "ColumnName": "Row Count",
            "Tooltip": "The number of rows in the table",
        },
        {
            "ViewName": "Table",
            "ColumnName": "Total Size",
            "Tooltip": "Data Size + Dictionary Size + Hierarchy Size (in bytes)",
        },
        {
            "ViewName": "Table",
            "ColumnName": "Data Size",
            "Tooltip": "The size of the data for all the columns in this table (in bytes)",
        },
        {
            "ViewName": "Table",
            "ColumnName": "Dictionary Size",
            "Tooltip": "The size of the column's dictionary for all columns in this table (in bytes)",
        },
        {
            "ViewName": "Table",
            "ColumnName": "Hierarchy Size",
            "Tooltip": "The size of hierarchy structures for all columns in this table (in bytes)",
        },
        {
            "ViewName": "Table",
            "ColumnName": "% DB",
            "Tooltip": "The size of the table relative to the size of the semantic model",
        },
        {
            "ViewName": "Table",
            "ColumnName": "Partitions",
            "Tooltip": "The number of partitions in the table",
        },
        {
            "ViewName": "Table",
            "ColumnName": "Columns",
            "Tooltip": "The number of columns in the table",
        },
        {
            "ViewName": "Partition",
            "ColumnName": "Table Name",
            "Tooltip": "The name of the table",
        },
        {
            "ViewName": "Partition",
            "ColumnName": "Partition Name",
            "Tooltip": "The name of the partition within the table",
        },
        {
            "ViewName": "Partition",
            "ColumnName": "Mode",
            "Tooltip": "The query mode of the partition",
        },
        {
            "ViewName": "Partition",
            "ColumnName": "Record Count",
            "Tooltip": "The number of rows in the partition",
        },
        {
            "ViewName": "Partition",
            "ColumnName": "Segment Count",
            "Tooltip": "The number of segments within the partition",
        },
        {
            "ViewName": "Partition",
            "ColumnName": "Records per Segment",
            "Tooltip": "The number of rows per segment",
        },
        {
            "ViewName": "Column",
            "ColumnName": "Table Name",
            "Tooltip": "The name of the table",
        },
        {
            "ViewName": "Column",
            "ColumnName": "Column Name",
            "Tooltip": "The name of the column",
        },
        {"ViewName": "Column", "ColumnName": "Type", "Tooltip": "The type of column"},
        {
            "ViewName": "Column",
            "ColumnName": "Cardinality",
            "Tooltip": "The number of unique rows in the column",
        },
        {
            "ViewName": "Column",
            "ColumnName": "Total Size",
            "Tooltip": "Data Size + Dictionary Size + Hierarchy Size (in bytes)",
        },
        {
            "ViewName": "Column",
            "ColumnName": "Data Size",
            "Tooltip": "The size of the data for the column (in bytes)",
        },
        {
            "ViewName": "Column",
            "ColumnName": "Dictionary Size",
            "Tooltip": "The size of the column's dictionary (in bytes)",
        },
        {
            "ViewName": "Column",
            "ColumnName": "Hierarchy Size",
            "Tooltip": "The size of hierarchy structures (in bytes)",
        },
        {
            "ViewName": "Column",
            "ColumnName": "% Table",
            "Tooltip": "The size of the column relative to the size of the table",
        },
        {
            "ViewName": "Column",
            "ColumnName": "% DB",
            "Tooltip": "The size of the column relative to the size of the semantic model",
        },
        {
            "ViewName": "Column",
            "ColumnName": "Data Type",
            "Tooltip": "The data type of the column",
        },
        {
            "ViewName": "Column",
            "ColumnName": "Encoding",
            "Tooltip": "The encoding type for the column",
        },
        {
            "ViewName": "Column",
            "ColumnName": "Is Resident",
            "Tooltip": "Indicates whether the column is in memory or not",
        },
        {
            "ViewName": "Column",
            "ColumnName": "Temperature",
            "Tooltip": "A decimal indicating the frequency and recency of queries against the column",
        },
        {
            "ViewName": "Column",
            "ColumnName": "Last Accessed",
            "Tooltip": "The time the column was last queried",
        },
        {
            "ViewName": "Hierarchy",
            "ColumnName": "Table Name",
            "Tooltip": "The name of the table",
        },
        {
            "ViewName": "Hierarchy",
            "ColumnName": "Hierarchy Name",
            "Tooltip": "The name of the hierarchy",
        },
        {
            "ViewName": "Hierarchy",
            "ColumnName": "Used Size",
            "Tooltip": "The size of user hierarchy structures (in bytes)",
        },
        {
            "ViewName": "Relationship",
            "ColumnName": "From Object",
            "Tooltip": "The from table/column in the relationship",
        },
        {
            "ViewName": "Relationship",
            "ColumnName": "To Object",
            "Tooltip": "The to table/column in the relationship",
        },
        {
            "ViewName": "Relationship",
            "ColumnName": "Multiplicity",
            "Tooltip": "The cardinality on each side of the relationship",
        },
        {
            "ViewName": "Relationship",
            "ColumnName": "Used Size",
            "Tooltip": "The size of the relationship (in bytes)",
        },
        {
            "ViewName": "Relationship",
            "ColumnName": "Max From Cardinality",
            "Tooltip": "The number of unique values in the column used in the from side of the relationship",
        },
        {
            "ViewName": "Relationship",
            "ColumnName": "Max To Cardinality",
            "Tooltip": "The number of unique values in the column used in the to side of the relationship",
        },
        {
            "ViewName": "Relationship",
            "ColumnName": "Missing Rows",
            "Tooltip": "The number of rows in the 'from' table which do not map to the key column in the 'to' table",
        },
    ]

    # Create DataFrame
    tooltipDF = pd.DataFrame(data)

    # define the dictionary with {"Tab name":df}
    df_dict = {
        "Model Summary": dataframes["dfModel"],
        "Tables": dataframes["dfTable"],
        "Partitions": dataframes["dfP"],
        "Columns (Total Size)": dataframes["colSize"],
        "Columns (Temperature)": dataframes["temp"],
        "Relationships": dataframes["dfR"],
        "Hierarchies": dataframes["dfH_filt"],
    }

    mapping = {
        "Model Summary": "Model",
        "Tables": "Table",
        "Partitions": "Partition",
        "Columns (Total Size)": "Column",
        "Columns (Temperature)": "Column",
        "Relationships": "Relationship",
        "Hierarchies": "Hierarchy",
    }

    # Basic styles for the tabs and tab content
    styles = """
    <style>
    .tab { overflow: hidden; border: 1px solid #ccc; background-color: #f1f1f1; }
    .tab button { background-color: inherit; float: left; border: none; outline: none; cursor: pointer; padding: 14px 16px; transition: 0.3s; }
    .tab button:hover { background-color: #ddd; }
    .tab button.active { background-color: #ccc; }
    .tabcontent { display: none; padding: 6px 12px; border: 1px solid #ccc; border-top: none; }
    </style>
    """
    # JavaScript for tab functionality
    script = """
    <script>
    function openTab(evt, tabName) {
        var i, tabcontent, tablinks;
        tabcontent = document.getElementsByClassName("tabcontent");
        for (i = 0; i < tabcontent.length; i++) {
            tabcontent[i].style.display = "none";
        }
        tablinks = document.getElementsByClassName("tablinks");
        for (i = 0; i < tablinks.length; i++) {
            tablinks[i].className = tablinks[i].className.replace(" active", "");
        }
        document.getElementById(tabName).style.display = "block";
        evt.currentTarget.className += " active";
    }
    </script>
    """

    # HTML for tabs
    tab_html = '<div class="tab">'
    content_html = ""
    for i, (title, df) in enumerate(df_dict.items()):
        tab_id = f"tab{i}"
        tab_html += f'<button class="tablinks" onclick="openTab(event, \'{tab_id}\')">{title}</button>'

        vw = mapping.get(title)

        df_html = df.to_html()
        for col in df.columns:
            tt = None
            try:
                tooltipDF_filt = tooltipDF[
                    (tooltipDF["ViewName"] == vw) & (tooltipDF["ColumnName"] == col)
                ]
                tt = tooltipDF_filt["Tooltip"].iloc[0]
            except:
                pass
            df_html = df_html.replace(f"<th>{col}</th>", f'<th title="{tt}">{col}</th>')
        content_html += (
            f'<div id="{tab_id}" class="tabcontent"><h3>{title}</h3>{df_html}</div>'
        )
    tab_html += "</div>"

    # Display the tabs, tab contents, and run the script
    display(HTML(styles + tab_html + content_html + script))
    # Default to open the first tab
    display(
        HTML("<script>document.getElementsByClassName('tablinks')[0].click();</script>")
    )


@log
def import_vertipaq_analyzer(folder_path: str, file_name: str):
    """
    Imports and visualizes the vertipaq analyzer info from a saved .zip file in your lakehouse.

    Parameters
    ----------
    folder_path : str
        The folder within your lakehouse in which the .zip file containing the vertipaq analyzer info has been saved.
    file_name : str
        The file name of the file which contains the vertipaq analyzer info.

    Returns
    -------
    str
        A visualization of the Vertipaq Analyzer statistics.
    """

    pd.options.mode.copy_on_write = True

    zipFilePath = os.path.join(folder_path, file_name)
    extracted_dir = os.path.join(folder_path, "extracted_dataframes")

    with zipfile.ZipFile(zipFilePath, "r") as zip_ref:
        zip_ref.extractall(extracted_dir)

        # Read all CSV files into a dictionary of DataFrames
        dfs = {}
        for file_name in zip_ref.namelist():
            df = pd.read_csv(extracted_dir + "/" + file_name)
            dfs[file_name] = df

    visualize_vertipaq(dfs)

    # Clean up: remove the extracted directory
    shutil.rmtree(extracted_dir)
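A zip file written by vertipaq_analyzer(export="zip") can later be re-loaded and re-visualized with import_vertipaq_analyzer. A minimal sketch of that round trip, assuming the default export location used by the code above (the workspace and dataset names in the file name are placeholders):

    from sempy_labs._vertipaq import import_vertipaq_analyzer

    # vertipaq_analyzer saves to Files/VertipaqAnalyzer/{workspace}.{dataset}.zip
    import_vertipaq_analyzer(
        folder_path="/lakehouse/default/Files/VertipaqAnalyzer",
        file_name="MyWorkspace.MyDataset.zip",
    )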