semantic-link-labs 0.4.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of semantic-link-labs might be problematic.

Files changed (52)
  1. semantic_link_labs-0.4.1.dist-info/LICENSE +21 -0
  2. semantic_link_labs-0.4.1.dist-info/METADATA +22 -0
  3. semantic_link_labs-0.4.1.dist-info/RECORD +52 -0
  4. semantic_link_labs-0.4.1.dist-info/WHEEL +5 -0
  5. semantic_link_labs-0.4.1.dist-info/top_level.txt +1 -0
  6. sempy_labs/__init__.py +154 -0
  7. sempy_labs/_ai.py +496 -0
  8. sempy_labs/_clear_cache.py +39 -0
  9. sempy_labs/_connections.py +234 -0
  10. sempy_labs/_dax.py +70 -0
  11. sempy_labs/_generate_semantic_model.py +280 -0
  12. sempy_labs/_helper_functions.py +506 -0
  13. sempy_labs/_icons.py +4 -0
  14. sempy_labs/_list_functions.py +1372 -0
  15. sempy_labs/_model_auto_build.py +143 -0
  16. sempy_labs/_model_bpa.py +1354 -0
  17. sempy_labs/_model_dependencies.py +341 -0
  18. sempy_labs/_one_lake_integration.py +155 -0
  19. sempy_labs/_query_scale_out.py +447 -0
  20. sempy_labs/_refresh_semantic_model.py +184 -0
  21. sempy_labs/_tom.py +3766 -0
  22. sempy_labs/_translations.py +378 -0
  23. sempy_labs/_vertipaq.py +893 -0
  24. sempy_labs/directlake/__init__.py +45 -0
  25. sempy_labs/directlake/_directlake_schema_compare.py +110 -0
  26. sempy_labs/directlake/_directlake_schema_sync.py +128 -0
  27. sempy_labs/directlake/_fallback.py +62 -0
  28. sempy_labs/directlake/_get_directlake_lakehouse.py +69 -0
  29. sempy_labs/directlake/_get_shared_expression.py +59 -0
  30. sempy_labs/directlake/_guardrails.py +84 -0
  31. sempy_labs/directlake/_list_directlake_model_calc_tables.py +54 -0
  32. sempy_labs/directlake/_show_unsupported_directlake_objects.py +89 -0
  33. sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py +81 -0
  34. sempy_labs/directlake/_update_directlake_partition_entity.py +64 -0
  35. sempy_labs/directlake/_warm_cache.py +210 -0
  36. sempy_labs/lakehouse/__init__.py +24 -0
  37. sempy_labs/lakehouse/_get_lakehouse_columns.py +81 -0
  38. sempy_labs/lakehouse/_get_lakehouse_tables.py +250 -0
  39. sempy_labs/lakehouse/_lakehouse.py +85 -0
  40. sempy_labs/lakehouse/_shortcuts.py +296 -0
  41. sempy_labs/migration/__init__.py +29 -0
  42. sempy_labs/migration/_create_pqt_file.py +239 -0
  43. sempy_labs/migration/_migrate_calctables_to_lakehouse.py +429 -0
  44. sempy_labs/migration/_migrate_calctables_to_semantic_model.py +150 -0
  45. sempy_labs/migration/_migrate_model_objects_to_semantic_model.py +524 -0
  46. sempy_labs/migration/_migrate_tables_columns_to_semantic_model.py +165 -0
  47. sempy_labs/migration/_migration_validation.py +227 -0
  48. sempy_labs/migration/_refresh_calc_tables.py +129 -0
  49. sempy_labs/report/__init__.py +35 -0
  50. sempy_labs/report/_generate_report.py +253 -0
  51. sempy_labs/report/_report_functions.py +855 -0
  52. sempy_labs/report/_report_rebind.py +131 -0
sempy_labs/_vertipaq.py
@@ -0,0 +1,893 @@
+ import sempy
+ import sempy.fabric as fabric
+ import pandas as pd
+ from IPython.display import display, HTML
+ import zipfile, os, shutil, datetime, warnings
+ from pyspark.sql import SparkSession
+ from sempy_labs._helper_functions import (
+     format_dax_object_name,
+     get_direct_lake_sql_endpoint,
+     resolve_lakehouse_name,
+ )
+ from sempy_labs._list_functions import list_relationships
+ from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
+ from sempy_labs.lakehouse._lakehouse import lakehouse_attached
+ from typing import List, Optional, Union
+ from sempy._utils._log import log
+
+
+ @log
+ def vertipaq_analyzer(
+     dataset: str,
+     workspace: Optional[str] = None,
+     export: Optional[str] = None,
+     lakehouse_workspace: Optional[str] = None,
+     read_stats_from_data: Optional[bool] = False,
+ ):
+     """
+     Displays an HTML visualization of the Vertipaq Analyzer statistics from a semantic model.
+
+     Parameters
+     ----------
+     dataset : str
+         Name of the semantic model.
+     workspace : str, default=None
+         The Fabric workspace name in which the semantic model exists.
+         Defaults to None which resolves to the workspace of the attached lakehouse
+         or if no lakehouse attached, resolves to the workspace of the notebook.
+     export : str, default=None
+         Specifying 'zip' will export the results to a zip file in your lakehouse (which can be imported using the import_vertipaq_analyzer function).
+         Specifying 'table' will export the results to delta tables (appended) in your lakehouse.
+         Default value: None.
+     lakehouse_workspace : str, default=None
+         The Fabric workspace used by the lakehouse (for Direct Lake semantic models).
+         Defaults to None which resolves to the workspace of the attached lakehouse
+         or if no lakehouse attached, resolves to the workspace of the notebook.
+     read_stats_from_data : bool, default=False
+         Setting this parameter to True retrieves Column Cardinality and Missing Rows via DAX queries (Direct Lake semantic models retrieve these statistics via a Spark query against the lakehouse).
+
+     Returns
+     -------
+
+     """
+
+     pd.options.mode.copy_on_write = True
+     warnings.filterwarnings(
+         "ignore", message="createDataFrame attempted Arrow optimization*"
+     )
+
+     if workspace == None:
+         workspace_id = fabric.get_workspace_id()
+         workspace = fabric.resolve_workspace_name(workspace_id)
+
+     if lakehouse_workspace == None:
+         lakehouse_workspace = workspace
+
+     dfT = fabric.list_tables(dataset=dataset, extended=True, workspace=workspace)
+     dfT.rename(columns={"Name": "Table Name"}, inplace=True)
+     dfC = fabric.list_columns(dataset=dataset, extended=True, workspace=workspace)
+     dfC["Column Object"] = format_dax_object_name(dfC["Table Name"], dfC["Column Name"])
+     dfC.rename(columns={"Column Cardinality": "Cardinality"}, inplace=True)
+     dfH = fabric.list_hierarchies(dataset=dataset, extended=True, workspace=workspace)
+     dfR = list_relationships(dataset=dataset, extended=True, workspace=workspace)
+     dfR["From Object"] = format_dax_object_name(dfR["From Table"], dfR["From Column"])
+     dfR["To Object"] = format_dax_object_name(dfR["To Table"], dfR["To Column"])
+     dfP = fabric.list_partitions(dataset=dataset, extended=True, workspace=workspace)
+     dfD = fabric.list_datasets(
+         workspace=workspace,
+         additional_xmla_properties=["CompatibilityLevel", "Model.DefaultMode"],
+     )
+     dfD = dfD[dfD["Dataset Name"] == dataset]
+     dfD["Compatibility Level"] = dfD["Compatibility Level"].astype(int)
+     isDirectLake = any(r["Mode"] == "DirectLake" for i, r in dfP.iterrows())
+     dfR["Missing Rows"] = None
+
+     # Direct Lake
+     if read_stats_from_data:
+         if isDirectLake:
+             dfC = pd.merge(
+                 dfC,
+                 dfP[["Table Name", "Query", "Source Type"]],
+                 on="Table Name",
+                 how="left",
+             )
+             dfC_flt = dfC[
+                 (dfC["Source Type"] == "Entity")
+                 & (~dfC["Column Name"].str.startswith("RowNumber-"))
+             ]
+             sqlEndpointId = get_direct_lake_sql_endpoint(dataset, workspace)
+
+             # Get lakehouse name from SQL Endpoint ID
+             dfI = fabric.list_items(workspace=lakehouse_workspace, type="SQLEndpoint")
+             dfI_filt = dfI[(dfI["Id"] == sqlEndpointId)]
+
+             if len(dfI_filt) == 0:
+                 print(
+                     f"The lakehouse (SQL Endpoint) used by the '{dataset}' semantic model does not reside in the '{lakehouse_workspace}' workspace. Please update the lakehouse_workspace parameter."
+                 )
+             else:
+                 lakehouseName = dfI_filt["Display Name"].iloc[0]
+
+                 current_workspace_id = fabric.get_workspace_id()
+                 current_workspace = fabric.resolve_workspace_name(current_workspace_id)
+                 if current_workspace != lakehouse_workspace:
+                     lakeTables = get_lakehouse_tables(
+                         lakehouse=lakehouseName, workspace=lakehouse_workspace
+                     )
+
+                 sql_statements = []
+                 spark = SparkSession.builder.getOrCreate()
+                 # Loop through tables
+                 for lakeTName in dfC_flt["Query"].unique():
+                     query = "SELECT "
+                     columns_in_table = dfC_flt.loc[
+                         dfC_flt["Query"] == lakeTName, "Source"
+                     ].unique()
+
+                     # Loop through columns within those tables
+                     for scName in columns_in_table:
+                         query = query + f"COUNT(DISTINCT({scName})) AS {scName}, "
+
+                     query = query[:-2]
+                     if lakehouse_workspace == current_workspace:
+                         query = query + f" FROM {lakehouseName}.{lakeTName}"
+                     else:
+                         lakeTables_filt = lakeTables[
+                             lakeTables["Table Name"] == lakeTName
+                         ]
+                         tPath = lakeTables_filt["Location"].iloc[0]
+
+                         df = spark.read.format("delta").load(tPath)
+                         tempTableName = "delta_table_" + lakeTName
+                         df.createOrReplaceTempView(tempTableName)
+                         query = query + f" FROM {tempTableName}"
+                     sql_statements.append((lakeTName, query))
+
+                 for o in sql_statements:
+                     tName = o[0]
+                     query = o[1]
+
+                     df = spark.sql(query)
+
+                     for column in df.columns:
+                         x = df.collect()[0][column]
+                         for i, r in dfC.iterrows():
+                             if r["Query"] == tName and r["Source"] == column:
+                                 dfC.at[i, "Cardinality"] = x
+
+                 # Remove column added temporarily
+                 dfC.drop(columns=["Query", "Source Type"], inplace=True)
+
+                 # Direct Lake missing rows
+                 dfR = pd.merge(
+                     dfR,
+                     dfP[["Table Name", "Query"]],
+                     left_on="From Table",
+                     right_on="Table Name",
+                     how="left",
+                 )
+                 dfR.rename(columns={"Query": "From Lake Table"}, inplace=True)
+                 dfR.drop(columns=["Table Name"], inplace=True)
+                 dfR = pd.merge(
+                     dfR,
+                     dfP[["Table Name", "Query"]],
+                     left_on="To Table",
+                     right_on="Table Name",
+                     how="left",
+                 )
+                 dfR.rename(columns={"Query": "To Lake Table"}, inplace=True)
+                 dfR.drop(columns=["Table Name"], inplace=True)
+                 dfR = pd.merge(
+                     dfR,
+                     dfC[["Column Object", "Source"]],
+                     left_on="From Object",
+                     right_on="Column Object",
+                     how="left",
+                 )
+                 dfR.rename(columns={"Source": "From Lake Column"}, inplace=True)
+                 dfR.drop(columns=["Column Object"], inplace=True)
+                 dfR = pd.merge(
+                     dfR,
+                     dfC[["Column Object", "Source"]],
+                     left_on="To Object",
+                     right_on="Column Object",
+                     how="left",
+                 )
+                 dfR.rename(columns={"Source": "To Lake Column"}, inplace=True)
+                 dfR.drop(columns=["Column Object"], inplace=True)
+
+                 spark = SparkSession.builder.getOrCreate()
+                 for i, r in dfR.iterrows():
+                     fromTable = r["From Lake Table"]
+                     fromColumn = r["From Lake Column"]
+                     toTable = r["To Lake Table"]
+                     toColumn = r["To Lake Column"]
+
+                     if lakehouse_workspace == current_workspace:
+                         query = f"select count(f.{fromColumn}) as {fromColumn}\nfrom {fromTable} as f\nleft join {toTable} as c on f.{fromColumn} = c.{toColumn}\nwhere c.{toColumn} is null"
+                     else:
+                         tempTableFrom = "delta_table_" + fromTable
+                         tempTableTo = "delta_table_" + toTable
+
+                         query = f"select count(f.{fromColumn}) as {fromColumn}\nfrom {tempTableFrom} as f\nleft join {tempTableTo} as c on f.{fromColumn} = c.{toColumn}\nwhere c.{toColumn} is null"
+
+                     # query = f"select count(f.{fromColumn}) as {fromColumn}\nfrom {fromTable} as f\nleft join {toTable} as c on f.{fromColumn} = c.{toColumn}\nwhere c.{toColumn} is null"
+
+                     df = spark.sql(query)
+                     missingRows = df.collect()[0][0]
+                     dfR.at[i, "Missing Rows"] = missingRows
+
+                 dfR["Missing Rows"] = dfR["Missing Rows"].astype(int)
+         else:
+             # Calculate missing rows using DAX for non-direct lake
+             for i, r in dfR.iterrows():
+                 fromTable = r["From Table"]
+                 fromColumn = r["From Column"]
+                 toTable = r["To Table"]
+                 toColumn = r["To Column"]
+                 isActive = bool(r["Active"])
+                 fromObject = format_dax_object_name(fromTable, fromColumn)
+                 toObject = format_dax_object_name(toTable, toColumn)
+
+                 missingRows = 0
+
+                 query = f"evaluate\nsummarizecolumns(\n\"1\",calculate(countrows('{fromTable}'),isblank({toObject}))\n)"
+
+                 if isActive == False:  # add userelationship
+                     query = f"evaluate\nsummarizecolumns(\n\"1\",calculate(countrows('{fromTable}'),userelationship({fromObject},{toObject}),isblank({toObject}))\n)"
+
+                 result = fabric.evaluate_dax(
+                     dataset=dataset, dax_string=query, workspace=workspace
+                 )
+
+                 try:
+                     missingRows = result.iloc[0, 0]
+                 except:
+                     pass
+
+                 dfR.at[i, "Missing Rows"] = missingRows
+             dfR["Missing Rows"] = dfR["Missing Rows"].astype(int)
+
+     dfTP = dfP.groupby("Table Name")["Partition Name"].count().reset_index()
+     dfTP.rename(columns={"Partition Name": "Partitions"}, inplace=True)
+     dfTC = dfC.groupby("Table Name")["Column Name"].count().reset_index()
+     dfTC.rename(columns={"Column Name": "Columns"}, inplace=True)
+
+     total_size = dfC["Total Size"].sum()
+     table_sizes = dfC.groupby("Table Name")["Total Size"].sum().reset_index()
+     table_sizes.rename(columns={"Total Size": "Table Size"}, inplace=True)
+
+     # Columns
+     dfC_filt = dfC[~dfC["Column Name"].str.startswith("RowNumber-")]
+     dfC_filt["% DB"] = round((dfC_filt["Total Size"] / total_size) * 100, 2)
+     dfC_filt = pd.merge(dfC_filt, table_sizes, on="Table Name", how="left")
+     dfC_filt["% Table"] = round(
+         (dfC_filt["Total Size"] / dfC_filt["Table Size"]) * 100, 2
+     )
+     columnList = [
+         "Table Name",
+         "Column Name",
+         "Type",
+         "Cardinality",
+         "Total Size",
+         "Data Size",
+         "Dictionary Size",
+         "Hierarchy Size",
+         "% Table",
+         "% DB",
+         "Data Type",
+         "Encoding",
+         "Is Resident",
+         "Temperature",
+         "Last Accessed",
+     ]
+
+     colSize = dfC_filt[columnList].sort_values(by="Total Size", ascending=False)
+     temp = dfC_filt[columnList].sort_values(by="Temperature", ascending=False)
+     colSize.reset_index(drop=True, inplace=True)
+     temp.reset_index(drop=True, inplace=True)
+
+     export_Col = colSize.copy()
+
+     intList = [
+         "Cardinality",
+         "Total Size",
+         "Data Size",
+         "Dictionary Size",
+         "Hierarchy Size",
+     ]
+     pctList = ["% Table", "% DB"]
+     colSize[intList] = colSize[intList].applymap("{:,}".format)
+     temp[intList] = temp[intList].applymap("{:,}".format)
+     colSize[pctList] = colSize[pctList].applymap("{:.2f}%".format)
+     temp[pctList] = temp[pctList].applymap("{:.2f}%".format)
+
+     # Tables
+     intList = ["Total Size", "Data Size", "Dictionary Size", "Hierarchy Size"]
+     dfCSum = dfC.groupby(["Table Name"])[intList].sum().reset_index()
+     dfCSum["% DB"] = round((dfCSum["Total Size"] / total_size) * 100, 2)
+
+     dfTable = pd.merge(
+         dfT[["Table Name", "Type", "Row Count"]], dfCSum, on="Table Name", how="inner"
+     )
+     dfTable = pd.merge(dfTable, dfTP, on="Table Name", how="left")
+     dfTable = pd.merge(dfTable, dfTC, on="Table Name", how="left")
+     dfTable = dfTable.drop_duplicates()  # Drop duplicates (temporary)
+     dfTable = dfTable.sort_values(by="Total Size", ascending=False)
+     dfTable.reset_index(drop=True, inplace=True)
+     export_Table = dfTable.copy()
+
+     intList.extend(["Row Count", "Partitions", "Columns"])
+     dfTable[intList] = dfTable[intList].applymap("{:,}".format)
+     pctList = ["% DB"]
+     dfTable[pctList] = dfTable[pctList].applymap("{:.2f}%".format)
+
+     ## Relationships
+     # dfR.drop(columns=['Max From Cardinality', 'Max To Cardinality'], inplace=True)
+     dfR = pd.merge(
+         dfR,
+         dfC[["Column Object", "Cardinality"]],
+         left_on="From Object",
+         right_on="Column Object",
+         how="left",
+     )
+     dfR.rename(columns={"Cardinality": "Max From Cardinality"}, inplace=True)
+     dfR = pd.merge(
+         dfR,
+         dfC[["Column Object", "Cardinality"]],
+         left_on="To Object",
+         right_on="Column Object",
+         how="left",
+     )
+     dfR.rename(columns={"Cardinality": "Max To Cardinality"}, inplace=True)
+     dfR = dfR[
+         [
+             "From Object",
+             "To Object",
+             "Multiplicity",
+             "Used Size",
+             "Max From Cardinality",
+             "Max To Cardinality",
+             "Missing Rows",
+         ]
+     ].sort_values(by="Used Size", ascending=False)
+     dfR.reset_index(drop=True, inplace=True)
+     export_Rel = dfR.copy()
+     intList = [
+         "Used Size",
+         "Max From Cardinality",
+         "Max To Cardinality",
+         "Missing Rows",
+     ]
+     if read_stats_from_data == False:
+         intList.remove("Missing Rows")
+     dfR[intList] = dfR[intList].applymap("{:,}".format)
+
+     ## Partitions
+     dfP = dfP[
+         ["Table Name", "Partition Name", "Mode", "Record Count", "Segment Count"]
+     ].sort_values(
+         by="Record Count", ascending=False
+     )  # , 'Records per Segment'
+     dfP["Records per Segment"] = round(
+         dfP["Record Count"] / dfP["Segment Count"], 2
+     )  # Remove after records per segment is fixed
+     dfP.reset_index(drop=True, inplace=True)
+     export_Part = dfP.copy()
+     intList = ["Record Count", "Segment Count", "Records per Segment"]
+     dfP[intList] = dfP[intList].applymap("{:,}".format)
+
+     ## Hierarchies
+     dfH_filt = dfH[dfH["Level Ordinal"] == 0]
+     dfH_filt = dfH_filt[["Table Name", "Hierarchy Name", "Used Size"]].sort_values(
+         by="Used Size", ascending=False
+     )
+     dfH_filt.reset_index(drop=True, inplace=True)
+     export_Hier = dfH_filt.copy()
+     intList = ["Used Size"]
+     dfH_filt[intList] = dfH_filt[intList].applymap("{:,}".format)
+
+     ## Model
+     if total_size >= 1000000000:
+         y = total_size / (1024**3) * 1000000000
+     elif total_size >= 1000000:
+         y = total_size / (1024**2) * 1000000
+     elif total_size >= 1000:
+         y = total_size / (1024) * 1000
+     y = round(y)
+
+     tblCount = len(dfT)
+     colCount = len(dfC_filt)
+     compatLevel = dfD["Compatibility Level"].iloc[0]
+     defMode = dfD["Model Default Mode"].iloc[0]
+
+     dfModel = pd.DataFrame(
+         {
+             "Dataset Name": dataset,
+             "Total Size": y,
+             "Table Count": tblCount,
+             "Column Count": colCount,
+             "Compatibility Level": compatLevel,
+             "Default Mode": defMode,
+         },
+         index=[0],
+     )
+     dfModel.reset_index(drop=True, inplace=True)
+     export_Model = dfModel.copy()
+     intList = ["Total Size", "Table Count", "Column Count"]
+     dfModel[intList] = dfModel[intList].applymap("{:,}".format)
+
+     dataFrames = {
+         "dfModel": dfModel,
+         "dfTable": dfTable,
+         "dfP": dfP,
+         "colSize": colSize,
+         "temp": temp,
+         "dfR": dfR,
+         "dfH_filt": dfH_filt,
+     }
+
+     dfs = {}
+     for fileName, df in dataFrames.items():
+         dfs[fileName] = df
+
+     visualize_vertipaq(dfs)
+
+     ### Export vertipaq to delta tables in lakehouse
+     if export in ["table", "zip"]:
+         lakeAttach = lakehouse_attached()
+         if lakeAttach == False:
+             print(
+                 f"In order to save the Vertipaq Analyzer results, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook."
+             )
+             return
+
+     if export == "table":
+         spark = SparkSession.builder.getOrCreate()
+
+         lakehouse_id = fabric.get_lakehouse_id()
+         lakehouse = resolve_lakehouse_name(
+             lakehouse_id=lakehouse_id, workspace=workspace
+         )
+         lakeTName = "vertipaq_analyzer_model"
+
+         lakeT = get_lakehouse_tables(lakehouse=lakehouse, workspace=workspace)
+         lakeT_filt = lakeT[lakeT["Table Name"] == lakeTName]
+
+         query = f"SELECT MAX(RunId) FROM {lakehouse}.{lakeTName}"
+
+         if len(lakeT_filt) == 0:
+             runId = 1
+         else:
+             dfSpark = spark.sql(query)
+             maxRunId = dfSpark.collect()[0][0]
+             runId = maxRunId + 1
+
+         dfMap = {
+             "export_Col": ["Columns", export_Col],
+             "export_Table": ["Tables", export_Table],
+             "export_Part": ["Partitions", export_Part],
+             "export_Rel": ["Relationships", export_Rel],
+             "export_Hier": ["Hierarchies", export_Hier],
+             "export_Model": ["Model", export_Model],
+         }
+
+         print(f"Saving Vertipaq Analyzer to delta tables in the lakehouse...\n")
+         now = datetime.datetime.now()
+         for key, (obj, df) in dfMap.items():
+             df["Timestamp"] = now
+             df["Workspace Name"] = workspace
+             df["Dataset Name"] = dataset
+             df["RunId"] = runId
+
+             colName = "Workspace Name"
+             df.insert(0, colName, df.pop(colName))
+             colName = "Dataset Name"
+             df.insert(1, colName, df.pop(colName))
+
+             df.columns = df.columns.str.replace(" ", "_")
+
+             delta_table_name = f"VertipaqAnalyzer_{obj}".lower()
+             spark_df = spark.createDataFrame(df)
+             spark_df.write.mode("append").format("delta").saveAsTable(delta_table_name)
+             print(
+                 f"\u2022 Vertipaq Analyzer results for '{obj}' have been appended to the '{delta_table_name}' delta table."
+             )
+
+     ### Export vertipaq to zip file within the lakehouse
+     if export == "zip":
+         dataFrames = {
+             "dfModel": dfModel,
+             "dfTable": dfTable,
+             "dfP": dfP,
+             "colSize": colSize,
+             "temp": temp,
+             "dfR": dfR,
+             "dfH_filt": dfH_filt,
+         }
+
+         zipFileName = f"{workspace}.{dataset}.zip"
+
+         folderPath = "/lakehouse/default/Files"
+         subFolderPath = os.path.join(folderPath, "VertipaqAnalyzer")
+         ext = ".csv"
+         if not os.path.exists(subFolderPath):
+             os.makedirs(subFolderPath, exist_ok=True)
+         zipFilePath = os.path.join(subFolderPath, zipFileName)
+
+         # Create CSV files based on dataframes
+         for fileName, df in dataFrames.items():
+             filePath = os.path.join(subFolderPath, fileName + ext)
+             df.to_csv(filePath, index=False)
+
+         # Create a zip file and add CSV files to it
+         with zipfile.ZipFile(zipFilePath, "w") as zipf:
+             for fileName in dataFrames:
+                 filePath = os.path.join(subFolderPath, fileName + ext)
+                 zipf.write(filePath, os.path.basename(filePath))
+
+         # Clean up: remove the individual CSV files
+         for fileName, df in dataFrames.items():
+             filePath = os.path.join(subFolderPath, fileName) + ext
+             if os.path.exists(filePath):
+                 os.remove(filePath)
+         print(
+             f"The Vertipaq Analyzer info for the '{dataset}' semantic model in the '{workspace}' workspace has been saved to the 'Vertipaq Analyzer/{zipFileName}' in the default lakehouse attached to this notebook."
+         )
+
+
+ def visualize_vertipaq(dataframes):
+
+     # Tooltips for columns within the visual
+     data = [
+         {
+             "ViewName": "Model",
+             "ColumnName": "Dataset Name",
+             "Tooltip": "The name of the semantic model",
+         },
+         {
+             "ViewName": "Model",
+             "ColumnName": "Total Size",
+             "Tooltip": "The size of the model (in bytes)",
+         },
+         {
+             "ViewName": "Model",
+             "ColumnName": "Table Count",
+             "Tooltip": "The number of tables in the semantic model",
+         },
+         {
+             "ViewName": "Model",
+             "ColumnName": "Column Count",
+             "Tooltip": "The number of columns in the semantic model",
+         },
+         {
+             "ViewName": "Model",
+             "ColumnName": "Compatibility Level",
+             "Tooltip": "The compatibility level of the semantic model",
+         },
+         {
+             "ViewName": "Model",
+             "ColumnName": "Default Mode",
+             "Tooltip": "The default query mode of the semantic model",
+         },
+         {
+             "ViewName": "Table",
+             "ColumnName": "Table Name",
+             "Tooltip": "The name of the table",
+         },
+         {"ViewName": "Table", "ColumnName": "Type", "Tooltip": "The type of table"},
+         {
+             "ViewName": "Table",
+             "ColumnName": "Row Count",
+             "Tooltip": "The number of rows in the table",
+         },
+         {
+             "ViewName": "Table",
+             "ColumnName": "Total Size",
+             "Tooltip": "Data Size + Dictionary Size + Hierarchy Size (in bytes)",
+         },
+         {
+             "ViewName": "Table",
+             "ColumnName": "Data Size",
+             "Tooltip": "The size of the data for all the columns in this table (in bytes)",
+         },
+         {
+             "ViewName": "Table",
+             "ColumnName": "Dictionary Size",
+             "Tooltip": "The size of the column's dictionary for all columns in this table (in bytes)",
+         },
+         {
+             "ViewName": "Table",
+             "ColumnName": "Hierarchy Size",
+             "Tooltip": "The size of hierarchy structures for all columns in this table (in bytes)",
+         },
+         {
+             "ViewName": "Table",
+             "ColumnName": "% DB",
+             "Tooltip": "The size of the table relative to the size of the semantic model",
+         },
+         {
+             "ViewName": "Table",
+             "ColumnName": "Partitions",
+             "Tooltip": "The number of partitions in the table",
+         },
+         {
+             "ViewName": "Table",
+             "ColumnName": "Columns",
+             "Tooltip": "The number of columns in the table",
+         },
+         {
+             "ViewName": "Partition",
+             "ColumnName": "Table Name",
+             "Tooltip": "The name of the table",
+         },
+         {
+             "ViewName": "Partition",
+             "ColumnName": "Partition Name",
+             "Tooltip": "The name of the partition within the table",
+         },
+         {
+             "ViewName": "Partition",
+             "ColumnName": "Mode",
+             "Tooltip": "The query mode of the partition",
+         },
+         {
+             "ViewName": "Partition",
+             "ColumnName": "Record Count",
+             "Tooltip": "The number of rows in the partition",
+         },
+         {
+             "ViewName": "Partition",
+             "ColumnName": "Segment Count",
+             "Tooltip": "The number of segments within the partition",
+         },
+         {
+             "ViewName": "Partition",
+             "ColumnName": "Records per Segment",
+             "Tooltip": "The number of rows per segment",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "Table Name",
+             "Tooltip": "The name of the table",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "Column Name",
+             "Tooltip": "The name of the column",
+         },
+         {"ViewName": "Column", "ColumnName": "Type", "Tooltip": "The type of column"},
+         {
+             "ViewName": "Column",
+             "ColumnName": "Cardinality",
+             "Tooltip": "The number of unique rows in the column",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "Total Size",
+             "Tooltip": "Data Size + Dictionary Size + Hierarchy Size (in bytes)",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "Data Size",
+             "Tooltip": "The size of the data for the column (in bytes)",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "Dictionary Size",
+             "Tooltip": "The size of the column's dictionary (in bytes)",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "Hierarchy Size",
+             "Tooltip": "The size of hierarchy structures (in bytes)",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "% Table",
+             "Tooltip": "The size of the column relative to the size of the table",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "% DB",
+             "Tooltip": "The size of the column relative to the size of the semantic model",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "Data Type",
+             "Tooltip": "The data type of the column",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "Encoding",
+             "Tooltip": "The encoding type for the column",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "Is Resident",
+             "Tooltip": "Indicates whether the column is in memory or not",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "Temperature",
+             "Tooltip": "A decimal indicating the frequency and recency of queries against the column",
+         },
+         {
+             "ViewName": "Column",
+             "ColumnName": "Last Accessed",
+             "Tooltip": "The time the column was last queried",
+         },
+         {
+             "ViewName": "Hierarchy",
+             "ColumnName": "Table Name",
+             "Tooltip": "The name of the table",
+         },
+         {
+             "ViewName": "Hierarchy",
+             "ColumnName": "Hierarchy Name",
+             "Tooltip": "The name of the hierarchy",
+         },
+         {
+             "ViewName": "Hierarchy",
+             "ColumnName": "Used Size",
+             "Tooltip": "The size of user hierarchy structures (in bytes)",
+         },
+         {
+             "ViewName": "Relationship",
+             "ColumnName": "From Object",
+             "Tooltip": "The from table/column in the relationship",
+         },
+         {
+             "ViewName": "Relationship",
+             "ColumnName": "To Object",
+             "Tooltip": "The to table/column in the relationship",
+         },
+         {
+             "ViewName": "Relationship",
+             "ColumnName": "Multiplicity",
+             "Tooltip": "The cardinality on each side of the relationship",
+         },
+         {
+             "ViewName": "Relationship",
+             "ColumnName": "Used Size",
+             "Tooltip": "The size of the relationship (in bytes)",
+         },
+         {
+             "ViewName": "Relationship",
+             "ColumnName": "Max From Cardinality",
+             "Tooltip": "The number of unique values in the column used in the from side of the relationship",
+         },
+         {
+             "ViewName": "Relationship",
+             "ColumnName": "Max To Cardinality",
+             "Tooltip": "The number of unique values in the column used in the to side of the relationship",
+         },
+         {
+             "ViewName": "Relationship",
+             "ColumnName": "Missing Rows",
+             "Tooltip": "The number of rows in the 'from' table which do not map to the key column in the 'to' table",
+         },
+     ]
+
+     # Create DataFrame
+     tooltipDF = pd.DataFrame(data)
+
+     # define the dictionary with {"Tab name":df}
+     df_dict = {
+         "Model Summary": dataframes["dfModel"],
+         "Tables": dataframes["dfTable"],
+         "Partitions": dataframes["dfP"],
+         "Columns (Total Size)": dataframes["colSize"],
+         "Columns (Temperature)": dataframes["temp"],
+         "Relationships": dataframes["dfR"],
+         "Hierarchies": dataframes["dfH_filt"],
+     }
+
+     mapping = {
+         "Model Summary": "Model",
+         "Tables": "Table",
+         "Partitions": "Partition",
+         "Columns (Total Size)": "Column",
+         "Columns (Temperature)": "Column",
+         "Relationships": "Relationship",
+         "Hierarchies": "Hierarchy",
+     }
+
+     # Basic styles for the tabs and tab content
+     styles = """
+     <style>
+     .tab { overflow: hidden; border: 1px solid #ccc; background-color: #f1f1f1; }
+     .tab button { background-color: inherit; float: left; border: none; outline: none; cursor: pointer; padding: 14px 16px; transition: 0.3s; }
+     .tab button:hover { background-color: #ddd; }
+     .tab button.active { background-color: #ccc; }
+     .tabcontent { display: none; padding: 6px 12px; border: 1px solid #ccc; border-top: none; }
+     </style>
+     """
+     # JavaScript for tab functionality
+     script = """
+     <script>
+     function openTab(evt, tabName) {
+         var i, tabcontent, tablinks;
+         tabcontent = document.getElementsByClassName("tabcontent");
+         for (i = 0; i < tabcontent.length; i++) {
+             tabcontent[i].style.display = "none";
+         }
+         tablinks = document.getElementsByClassName("tablinks");
+         for (i = 0; i < tablinks.length; i++) {
+             tablinks[i].className = tablinks[i].className.replace(" active", "");
+         }
+         document.getElementById(tabName).style.display = "block";
+         evt.currentTarget.className += " active";
+     }
+     </script>
+     """
+
+     # HTML for tabs
+     tab_html = '<div class="tab">'
+     content_html = ""
+     for i, (title, df) in enumerate(df_dict.items()):
+         tab_id = f"tab{i}"
+         tab_html += f'<button class="tablinks" onclick="openTab(event, \'{tab_id}\')">{title}</button>'
+
+         vw = mapping.get(title)
+
+         df_html = df.to_html()
+         for col in df.columns:
+             tt = None
+             try:
+                 tooltipDF_filt = tooltipDF[
+                     (tooltipDF["ViewName"] == vw) & (tooltipDF["ColumnName"] == col)
+                 ]
+                 tt = tooltipDF_filt["Tooltip"].iloc[0]
+             except:
+                 pass
+             df_html = df_html.replace(f"<th>{col}</th>", f'<th title="{tt}">{col}</th>')
+         content_html += (
+             f'<div id="{tab_id}" class="tabcontent"><h3>{title}</h3>{df_html}</div>'
+         )
+     tab_html += "</div>"
+
+     # Display the tabs, tab contents, and run the script
+     display(HTML(styles + tab_html + content_html + script))
+     # Default to open the first tab
+     display(
+         HTML("<script>document.getElementsByClassName('tablinks')[0].click();</script>")
+     )
+
+
+ @log
+ def import_vertipaq_analyzer(folder_path: str, file_name: str):
+     """
+     Imports and visualizes the vertipaq analyzer info from a saved .zip file in your lakehouse.
+
+     Parameters
+     ----------
+     folder_path : str
+         The folder within your lakehouse in which the .zip file containing the vertipaq analyzer info has been saved.
+     file_name : str
+         The file name of the file which contains the vertipaq analyzer info.
+
+     Returns
+     -------
+     str
+         A visualization of the Vertipaq Analyzer statistics.
+     """
+
+     pd.options.mode.copy_on_write = True
+
+     zipFilePath = os.path.join(folder_path, file_name)
+     extracted_dir = os.path.join(folder_path, "extracted_dataframes")
+
+     with zipfile.ZipFile(zipFilePath, "r") as zip_ref:
+         zip_ref.extractall(extracted_dir)
+
+     # Read all CSV files into a dictionary of DataFrames
+     dfs = {}
+     for file_name in zip_ref.namelist():
+         df = pd.read_csv(extracted_dir + "/" + file_name)
+         dfs[file_name] = df
+
+     visualize_vertipaq(dfs)
+
+     # Clean up: remove the extracted directory
+     shutil.rmtree(extracted_dir)
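
A minimal usage sketch of the two public functions added in this file. The dataset name and zip file name are placeholders, and the top-level calls assume both functions are re-exported from the package root via sempy_labs/__init__.py; otherwise import them from sempy_labs._vertipaq.

import sempy_labs as labs

# Run Vertipaq Analyzer against a semantic model and export the results
# to a zip file in the lakehouse attached to the notebook.
labs.vertipaq_analyzer(
    dataset="AdventureWorks",   # hypothetical semantic model name
    workspace=None,             # None resolves to the attached lakehouse/notebook workspace
    export="zip",
    read_stats_from_data=False,
)

# Re-import and visualize a previously exported zip file. vertipaq_analyzer
# writes it to Files/VertipaqAnalyzer as '<workspace>.<dataset>.zip'.
labs.import_vertipaq_analyzer(
    folder_path="/lakehouse/default/Files/VertipaqAnalyzer",
    file_name="My Workspace.AdventureWorks.zip",
)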