semantic-link-labs 0.5.0-py3-none-any.whl → 0.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of semantic-link-labs might be problematic; consult the release details for more information.
- semantic_link_labs-0.7.0.dist-info/METADATA +148 -0
- semantic_link_labs-0.7.0.dist-info/RECORD +111 -0
- {semantic_link_labs-0.5.0.dist-info → semantic_link_labs-0.7.0.dist-info}/WHEEL +1 -1
- sempy_labs/__init__.py +45 -15
- sempy_labs/_ai.py +42 -85
- sempy_labs/_bpa_translation/_translations_am-ET.po +828 -0
- sempy_labs/_bpa_translation/_translations_ar-AE.po +860 -0
- sempy_labs/_bpa_translation/_translations_cs-CZ.po +894 -0
- sempy_labs/_bpa_translation/_translations_da-DK.po +894 -0
- sempy_labs/_bpa_translation/_translations_de-DE.po +933 -0
- sempy_labs/_bpa_translation/_translations_el-GR.po +936 -0
- sempy_labs/_bpa_translation/_translations_es-ES.po +915 -0
- sempy_labs/_bpa_translation/_translations_fa-IR.po +883 -0
- sempy_labs/_bpa_translation/_translations_fr-FR.po +938 -0
- sempy_labs/_bpa_translation/_translations_ga-IE.po +912 -0
- sempy_labs/_bpa_translation/_translations_he-IL.po +855 -0
- sempy_labs/_bpa_translation/_translations_hi-IN.po +892 -0
- sempy_labs/_bpa_translation/_translations_hu-HU.po +910 -0
- sempy_labs/_bpa_translation/_translations_is-IS.po +887 -0
- sempy_labs/_bpa_translation/_translations_it-IT.po +931 -0
- sempy_labs/_bpa_translation/_translations_ja-JP.po +805 -0
- sempy_labs/_bpa_translation/_translations_nl-NL.po +924 -0
- sempy_labs/_bpa_translation/_translations_pl-PL.po +913 -0
- sempy_labs/_bpa_translation/_translations_pt-BR.po +909 -0
- sempy_labs/_bpa_translation/_translations_pt-PT.po +904 -0
- sempy_labs/_bpa_translation/_translations_ru-RU.po +909 -0
- sempy_labs/_bpa_translation/_translations_ta-IN.po +922 -0
- sempy_labs/_bpa_translation/_translations_te-IN.po +896 -0
- sempy_labs/_bpa_translation/_translations_th-TH.po +873 -0
- sempy_labs/_bpa_translation/_translations_zh-CN.po +767 -0
- sempy_labs/_bpa_translation/_translations_zu-ZA.po +916 -0
- sempy_labs/_clear_cache.py +12 -8
- sempy_labs/_connections.py +77 -70
- sempy_labs/_dax.py +7 -9
- sempy_labs/_generate_semantic_model.py +75 -90
- sempy_labs/_helper_functions.py +371 -20
- sempy_labs/_icons.py +23 -0
- sempy_labs/_list_functions.py +855 -427
- sempy_labs/_model_auto_build.py +4 -3
- sempy_labs/_model_bpa.py +307 -1118
- sempy_labs/_model_bpa_bulk.py +363 -0
- sempy_labs/_model_bpa_rules.py +831 -0
- sempy_labs/_model_dependencies.py +20 -16
- sempy_labs/_one_lake_integration.py +18 -12
- sempy_labs/_query_scale_out.py +116 -129
- sempy_labs/_refresh_semantic_model.py +23 -10
- sempy_labs/_translations.py +367 -288
- sempy_labs/_vertipaq.py +152 -123
- sempy_labs/directlake/__init__.py +7 -1
- sempy_labs/directlake/_directlake_schema_compare.py +33 -30
- sempy_labs/directlake/_directlake_schema_sync.py +60 -77
- sempy_labs/directlake/_dl_helper.py +233 -0
- sempy_labs/directlake/_get_directlake_lakehouse.py +7 -8
- sempy_labs/directlake/_get_shared_expression.py +5 -3
- sempy_labs/directlake/_guardrails.py +20 -16
- sempy_labs/directlake/_list_directlake_model_calc_tables.py +17 -10
- sempy_labs/directlake/_show_unsupported_directlake_objects.py +3 -2
- sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py +10 -5
- sempy_labs/directlake/_update_directlake_partition_entity.py +169 -22
- sempy_labs/directlake/_warm_cache.py +7 -4
- sempy_labs/lakehouse/_get_lakehouse_columns.py +1 -1
- sempy_labs/lakehouse/_get_lakehouse_tables.py +65 -71
- sempy_labs/lakehouse/_lakehouse.py +5 -3
- sempy_labs/lakehouse/_shortcuts.py +20 -13
- sempy_labs/migration/__init__.py +1 -1
- sempy_labs/migration/_create_pqt_file.py +184 -186
- sempy_labs/migration/_migrate_calctables_to_lakehouse.py +240 -269
- sempy_labs/migration/_migrate_calctables_to_semantic_model.py +78 -77
- sempy_labs/migration/_migrate_model_objects_to_semantic_model.py +444 -425
- sempy_labs/migration/_migrate_tables_columns_to_semantic_model.py +96 -102
- sempy_labs/migration/_migration_validation.py +2 -2
- sempy_labs/migration/_refresh_calc_tables.py +94 -100
- sempy_labs/report/_BPAReportTemplate.json +232 -0
- sempy_labs/report/__init__.py +6 -2
- sempy_labs/report/_bpareporttemplate/.pbi/localSettings.json +9 -0
- sempy_labs/report/_bpareporttemplate/.platform +11 -0
- sempy_labs/report/_bpareporttemplate/StaticResources/SharedResources/BaseThemes/CY24SU06.json +710 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/page.json +11 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/1b08bce3bebabb0a27a8/visual.json +191 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/2f22ddb70c301693c165/visual.json +438 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/3b1182230aa6c600b43a/visual.json +127 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/58577ba6380c69891500/visual.json +576 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/a2a8fa5028b3b776c96c/visual.json +207 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/adfd47ef30652707b987/visual.json +506 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/b6a80ee459e716e170b1/visual.json +127 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/ce3130a721c020cc3d81/visual.json +513 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/92735ae19b31712208ad/page.json +8 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/92735ae19b31712208ad/visuals/66e60dfb526437cd78d1/visual.json +112 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/page.json +11 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/07deb8bce824e1be37d7/visual.json +513 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/0b1c68838818b32ad03b/visual.json +352 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/0c171de9d2683d10b930/visual.json +37 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/0efa01be0510e40a645e/visual.json +542 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/6bf2f0eb830ab53cc668/visual.json +221 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/88d8141cb8500b60030c/visual.json +127 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/a753273590beed656a03/visual.json +576 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/b8fdc82cddd61ac447bc/visual.json +127 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/d37dce724a0ccc30044b/page.json +9 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/d37dce724a0ccc30044b/visuals/ce8532a7e25020271077/visual.json +38 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/pages.json +10 -0
- sempy_labs/report/_bpareporttemplate/definition/report.json +176 -0
- sempy_labs/report/_bpareporttemplate/definition/version.json +4 -0
- sempy_labs/report/_bpareporttemplate/definition.pbir +14 -0
- sempy_labs/report/_generate_report.py +260 -139
- sempy_labs/report/_report_functions.py +90 -59
- sempy_labs/report/_report_rebind.py +40 -34
- sempy_labs/tom/__init__.py +1 -4
- sempy_labs/tom/_model.py +601 -181
- semantic_link_labs-0.5.0.dist-info/METADATA +0 -22
- semantic_link_labs-0.5.0.dist-info/RECORD +0 -53
- sempy_labs/directlake/_fallback.py +0 -58
- {semantic_link_labs-0.5.0.dist-info → semantic_link_labs-0.7.0.dist-info}/LICENSE +0 -0
- {semantic_link_labs-0.5.0.dist-info → semantic_link_labs-0.7.0.dist-info}/top_level.txt +0 -0
sempy_labs/_vertipaq.py
CHANGED

@@ -1,28 +1,34 @@
-import sempy
 import sempy.fabric as fabric
 import pandas as pd
 from IPython.display import display, HTML
-import zipfile
+import zipfile
+import os
+import shutil
+import datetime
+import warnings
 from pyspark.sql import SparkSession
 from sempy_labs._helper_functions import (
     format_dax_object_name,
-    get_direct_lake_sql_endpoint,
     resolve_lakehouse_name,
+    resolve_dataset_id,
+    save_as_delta_table,
+    resolve_workspace_capacity,
 )
 from sempy_labs._list_functions import list_relationships
-from sempy_labs.lakehouse
-from sempy_labs.
-from typing import
+from sempy_labs.lakehouse import lakehouse_attached, get_lakehouse_tables
+from sempy_labs.directlake import get_direct_lake_source
+from typing import Optional
 from sempy._utils._log import log
 import sempy_labs._icons as icons

+
 @log
 def vertipaq_analyzer(
     dataset: str,
     workspace: Optional[str] = None,
     export: Optional[str] = None,
-    lakehouse_workspace: Optional[str] = None,
     read_stats_from_data: Optional[bool] = False,
+    **kwargs,
 ):
     """
     Displays an HTML visualization of the Vertipaq Analyzer statistics from a semantic model.
@@ -39,10 +45,6 @@ def vertipaq_analyzer(
         Specifying 'zip' will export the results to a zip file in your lakehouse (which can be imported using the import_vertipaq_analyzer function.
         Specifying 'table' will export the results to delta tables (appended) in your lakehouse.
         Default value: None.
-    lakehouse_workspace : str, default=None
-        The Fabric workspace used by the lakehouse (for Direct Lake semantic models).
-        Defaults to None which resolves to the workspace of the attached lakehouse
-        or if no lakehouse attached, resolves to the workspace of the notebook.
     read_stats_from_data : bool, default=False
         Setting this parameter to true has the function get Column Cardinality and Missing Rows using DAX (Direct Lake semantic models achieve this using a Spark query to the lakehouse).

@@ -51,6 +53,14 @@ def vertipaq_analyzer(

     """

+    from sempy_labs.tom import connect_semantic_model
+
+    if "lakehouse_workspace" in kwargs:
+        print(
+            f"{icons.info} The 'lakehouse_workspace' parameter has been deprecated as it is no longer necessary. Please remove this parameter from the function going forward."
+        )
+        del kwargs["lakehouse_workspace"]
+
     pd.options.mode.copy_on_write = True
     warnings.filterwarnings(
         "ignore", message="createDataFrame attempted Arrow optimization*"
@@ -58,9 +68,6 @@ def vertipaq_analyzer(

     workspace = fabric.resolve_workspace_name(workspace)

-    if lakehouse_workspace is None:
-        lakehouse_workspace = workspace
     dfT = fabric.list_tables(dataset=dataset, extended=True, workspace=workspace)
     dfT.rename(columns={"Name": "Table Name"}, inplace=True)
     dfC = fabric.list_columns(dataset=dataset, extended=True, workspace=workspace)
@@ -71,18 +78,24 @@ def vertipaq_analyzer(
     dfR["From Object"] = format_dax_object_name(dfR["From Table"], dfR["From Column"])
     dfR["To Object"] = format_dax_object_name(dfR["To Table"], dfR["To Column"])
     dfP = fabric.list_partitions(dataset=dataset, extended=True, workspace=workspace)
-        workspace=workspace
-        additional_xmla_properties=["CompatibilityLevel", "Model.DefaultMode"],
+    artifact_type, lakehouse_name, lakehouse_id, lakehouse_workspace_id = (
+        get_direct_lake_source(dataset=dataset, workspace=workspace)
     )
+
+    with connect_semantic_model(
+        dataset=dataset, readonly=True, workspace=workspace
+    ) as tom:
+        compat_level = tom.model.Model.Database.CompatibilityLevel
+        is_direct_lake = tom.is_direct_lake()
+        def_mode = tom.model.DefaultMode
+        table_count = tom.model.Tables.Count
+        column_count = len(list(tom.all_columns()))
+
     dfR["Missing Rows"] = None

     # Direct Lake
     if read_stats_from_data:
-        if
+        if is_direct_lake and artifact_type == "Lakehouse":
             dfC = pd.merge(
                 dfC,
                 dfP[["Table Name", "Query", "Source Type"]],
@@ -93,66 +106,54 @@ def vertipaq_analyzer(
                 (dfC["Source Type"] == "Entity")
                 & (~dfC["Column Name"].str.startswith("RowNumber-"))
             ]
+
+            object_workspace = fabric.resolve_workspace_name(lakehouse_workspace_id)
+            current_workspace_id = fabric.get_workspace_id()
+            if current_workspace_id != lakehouse_workspace_id:
+                lakeTables = get_lakehouse_tables(
+                    lakehouse=lakehouse_name, workspace=object_workspace
+                )
+
+            sql_statements = []
+            spark = SparkSession.builder.getOrCreate()
+            # Loop through tables
+            for lakeTName in dfC_flt["Query"].unique():
+                query = "SELECT "
+                columns_in_table = dfC_flt.loc[
+                    dfC_flt["Query"] == lakeTName, "Source"
+                ].unique()
+
+                # Loop through columns within those tables
+                for scName in columns_in_table:
+                    query = query + f"COUNT(DISTINCT(`{scName}`)) AS `{scName}`, "
+
+                query = query[:-2]
+                if lakehouse_workspace_id == current_workspace_id:
+                    query = query + f" FROM {lakehouse_name}.{lakeTName}"
+                else:
+                    lakeTables_filt = lakeTables[lakeTables["Table Name"] == lakeTName]
+                    tPath = lakeTables_filt["Location"].iloc[0]
+
+                    df = spark.read.format("delta").load(tPath)
+                    tempTableName = "delta_table_" + lakeTName
+                    df.createOrReplaceTempView(tempTableName)
+                    query = query + f" FROM {tempTableName}"
+                sql_statements.append((lakeTName, query))
+
+            for o in sql_statements:
+                tName = o[0]
+                query = o[1]
+
+                df = spark.sql(query)
+
+                for column in df.columns:
+                    x = df.collect()[0][column]
+                    for i, r in dfC.iterrows():
+                        if r["Query"] == tName and r["Source"] == column:
+                            dfC.at[i, "Cardinality"] = x
+
+            # Remove column added temporarily
+            dfC.drop(columns=["Query", "Source Type"], inplace=True)

             # Direct Lake missing rows
             dfR = pd.merge(
@@ -199,11 +200,11 @@ def vertipaq_analyzer(
                 toTable = r["To Lake Table"]
                 toColumn = r["To Lake Column"]

-                if
+                if lakehouse_workspace_id == current_workspace_id:
                     query = f"select count(f.{fromColumn}) as {fromColumn}\nfrom {fromTable} as f\nleft join {toTable} as c on f.{fromColumn} = c.{toColumn}\nwhere c.{toColumn} is null"
                 else:
-                    tempTableFrom = "delta_table_"
-                    tempTableTo = "delta_table_"
+                    tempTableFrom = f"delta_table_{fromTable}"
+                    tempTableTo = f"delta_table_{toTable}"

                     query = f"select count(f.{fromColumn}) as {fromColumn}\nfrom {tempTableFrom} as f\nleft join {tempTableTo} as c on f.{fromColumn} = c.{toColumn}\nwhere c.{toColumn} is null"

@@ -214,7 +215,7 @@ def vertipaq_analyzer(
                 dfR.at[i, "Missing Rows"] = missingRows

             dfR["Missing Rows"] = dfR["Missing Rows"].astype(int)
+        elif not is_direct_lake:
             # Calculate missing rows using DAX for non-direct lake
             for i, r in dfR.iterrows():
                 fromTable = r["From Table"]
@@ -238,7 +239,7 @@ def vertipaq_analyzer(

                 try:
                     missingRows = result.iloc[0, 0]
-                except:
+                except Exception:
                     pass

                 dfR.at[i, "Missing Rows"] = missingRows
@@ -308,7 +309,6 @@ def vertipaq_analyzer(
     )
     dfTable = pd.merge(dfTable, dfTP, on="Table Name", how="left")
     dfTable = pd.merge(dfTable, dfTC, on="Table Name", how="left")
-    dfTable = dfTable.drop_duplicates()  # Drop duplicates (temporary)
     dfTable = dfTable.sort_values(by="Total Size", ascending=False)
     dfTable.reset_index(drop=True, inplace=True)
     export_Table = dfTable.copy()
@@ -318,7 +318,7 @@ def vertipaq_analyzer(
     pctList = ["% DB"]
     dfTable[pctList] = dfTable[pctList].applymap("{:.2f}%".format)

+    # Relationships
     # dfR.drop(columns=['Max From Cardinality', 'Max To Cardinality'], inplace=True)
     dfR = pd.merge(
         dfR,
@@ -359,12 +359,17 @@ def vertipaq_analyzer(
     intList.remove("Missing Rows")
     dfR[intList] = dfR[intList].applymap("{:,}".format)

+    # Partitions
     dfP = dfP[
-        [
+        [
+            "Table Name",
+            "Partition Name",
+            "Mode",
+            "Record Count",
+            "Segment Count",
+            # "Records per Segment",
+        ]
+    ].sort_values(by="Record Count", ascending=False)
     dfP["Records per Segment"] = round(
         dfP["Record Count"] / dfP["Segment Count"], 2
     )  # Remove after records per segment is fixed
@@ -373,17 +378,19 @@ def vertipaq_analyzer(
     intList = ["Record Count", "Segment Count", "Records per Segment"]
     dfP[intList] = dfP[intList].applymap("{:,}".format)

+    # Hierarchies
     dfH_filt = dfH[dfH["Level Ordinal"] == 0]
     dfH_filt = dfH_filt[["Table Name", "Hierarchy Name", "Used Size"]].sort_values(
         by="Used Size", ascending=False
     )
     dfH_filt.reset_index(drop=True, inplace=True)
+    dfH_filt.fillna({"Used Size": 0}, inplace=True)
+    dfH_filt["Used Size"] = dfH_filt["Used Size"].astype(int)
     export_Hier = dfH_filt.copy()
     intList = ["Used Size"]
     dfH_filt[intList] = dfH_filt[intList].applymap("{:,}".format)

+    # Model
     if total_size >= 1000000000:
         y = total_size / (1024**3) * 1000000000
     elif total_size >= 1000000:
@@ -392,23 +399,19 @@ def vertipaq_analyzer(
         y = total_size / (1024) * 1000
     y = round(y)

-    tblCount = len(dfT)
-    colCount = len(dfC_filt)
-    compatLevel = dfD["Compatibility Level"].iloc[0]
-    defMode = dfD["Model Default Mode"].iloc[0]
     dfModel = pd.DataFrame(
         {
             "Dataset Name": dataset,
             "Total Size": y,
-            "Table Count":
-            "Column Count":
-            "Compatibility Level":
-            "Default Mode":
+            "Table Count": table_count,
+            "Column Count": column_count,
+            "Compatibility Level": compat_level,
+            "Default Mode": def_mode,
         },
         index=[0],
     )
     dfModel.reset_index(drop=True, inplace=True)
+    dfModel["Default Mode"] = dfModel["Default Mode"].astype(str)
     export_Model = dfModel.copy()
     intList = ["Total Size", "Table Count", "Column Count"]
     dfModel[intList] = dfModel[intList].applymap("{:,}".format)
@@ -429,22 +432,24 @@ def vertipaq_analyzer(

     visualize_vertipaq(dfs)

+    # Export vertipaq to delta tables in lakehouse
     if export in ["table", "zip"]:
+        if not lakehouse_attached():
+            raise ValueError(
+                f"{icons.red_dot} In order to save the Vertipaq Analyzer results, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook."
+            )

     if export == "table":
         spark = SparkSession.builder.getOrCreate()

         lakehouse_id = fabric.get_lakehouse_id()
+        lake_workspace = fabric.resolve_workspace_name()
         lakehouse = resolve_lakehouse_name(
-            lakehouse_id=lakehouse_id, workspace=
+            lakehouse_id=lakehouse_id, workspace=lake_workspace
         )
         lakeTName = "vertipaq_analyzer_model"

-        lakeT = get_lakehouse_tables(lakehouse=lakehouse, workspace=
+        lakeT = get_lakehouse_tables(lakehouse=lakehouse, workspace=lake_workspace)
         lakeT_filt = lakeT[lakeT["Table Name"] == lakeTName]

             query = f"SELECT MAX(RunId) FROM {lakehouse}.{lakeTName}"
@@ -465,29 +470,52 @@ def vertipaq_analyzer(
             "export_Model": ["Model", export_Model],
         }

-        print(
+        print(
+            f"{icons.in_progress} Saving Vertipaq Analyzer to delta tables in the lakehouse...\n"
+        )
         now = datetime.datetime.now()
+        dfD = fabric.list_datasets(workspace=workspace, mode="rest")
+        dfD_filt = dfD[dfD["Dataset Name"] == dataset]
+        configured_by = dfD_filt["Configured By"].iloc[0]
+        capacity_id, capacity_name = resolve_workspace_capacity(workspace=workspace)
+
         for key, (obj, df) in dfMap.items():
-            df["
+            df["Capacity Name"] = capacity_name
+            df["Capacity Id"] = capacity_id
+            df["Configured By"] = configured_by
             df["Workspace Name"] = workspace
+            df["Workspace Id"] = fabric.resolve_workspace_id(workspace)
             df["Dataset Name"] = dataset
+            df["Dataset Id"] = resolve_dataset_id(dataset, workspace)
             df["RunId"] = runId
+            df["Timestamp"] = now

-            colName = "
+            colName = "Capacity Name"
             df.insert(0, colName, df.pop(colName))
-            colName = "
+            colName = "Capacity Id"
             df.insert(1, colName, df.pop(colName))
+            colName = "Workspace Name"
+            df.insert(2, colName, df.pop(colName))
+            colName = "Workspace Id"
+            df.insert(3, colName, df.pop(colName))
+            colName = "Dataset Name"
+            df.insert(4, colName, df.pop(colName))
+            colName = "Dataset Id"
+            df.insert(5, colName, df.pop(colName))
+            colName = "Configured By"
+            df.insert(6, colName, df.pop(colName))

             df.columns = df.columns.str.replace(" ", "_")

             delta_table_name = f"VertipaqAnalyzer_{obj}".lower()
+            save_as_delta_table(
+                dataframe=df,
+                delta_table_name=delta_table_name,
+                write_mode="append",
+                merge_schema=True,
             )

+    # Export vertipaq to zip file within the lakehouse
     if export == "zip":
         dataFrames = {
             "dfModel": dfModel,
@@ -510,13 +538,13 @@ def vertipaq_analyzer(

         # Create CSV files based on dataframes
         for fileName, df in dataFrames.items():
-            filePath = os.path.join(subFolderPath, fileName
+            filePath = os.path.join(subFolderPath, f"{fileName}{ext}")
             df.to_csv(filePath, index=False)

         # Create a zip file and add CSV files to it
         with zipfile.ZipFile(zipFilePath, "w") as zipf:
             for fileName in dataFrames:
-                filePath = os.path.join(subFolderPath, fileName
+                filePath = os.path.join(subFolderPath, f"{fileName}{ext}")
                 zipf.write(filePath, os.path.basename(filePath))

         # Clean up: remove the individual CSV files
@@ -525,7 +553,8 @@ def vertipaq_analyzer(
             if os.path.exists(filePath):
                 os.remove(filePath)
         print(
-            f"{icons.green_dot} The Vertipaq Analyzer info for the '{dataset}' semantic model in the '{workspace}' workspace has been saved
+            f"{icons.green_dot} The Vertipaq Analyzer info for the '{dataset}' semantic model in the '{workspace}' workspace has been saved "
+            f"to the 'Vertipaq Analyzer/{zipFileName}' in the default lakehouse attached to this notebook."
         )

@@ -832,7 +861,7 @@ def visualize_vertipaq(dataframes):
                 (tooltipDF["ViewName"] == vw) & (tooltipDF["ColumnName"] == col)
             ]
             tt = tooltipDF_filt["Tooltip"].iloc[0]
-        except:
+        except Exception:
             pass
         df_html = df_html.replace(f"<th>{col}</th>", f'<th title="{tt}">{col}</th>')
         content_html += (
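For orientation, here is a minimal usage sketch of the 0.7.0 signature shown above. It assumes vertipaq_analyzer is re-exported at the package top level and uses a placeholder model name; the lakehouse_workspace argument is gone because the Direct Lake source is now resolved internally via get_direct_lake_source, and passing it only triggers the deprecation notice.

from sempy_labs import vertipaq_analyzer  # top-level re-export assumed

# Analyze a semantic model and append the results to delta tables in the
# lakehouse attached to the notebook; export accepts None, 'zip' or 'table'.
vertipaq_analyzer(
    dataset="Sales Model",      # placeholder semantic model name
    workspace=None,             # None resolves to the current workspace
    export="table",
    read_stats_from_data=True,  # read cardinality/missing rows from the data
)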
sempy_labs/directlake/__init__.py
CHANGED

@@ -1,7 +1,9 @@
 from sempy_labs.directlake._directlake_schema_compare import direct_lake_schema_compare
 from sempy_labs.directlake._directlake_schema_sync import direct_lake_schema_sync
-from sempy_labs.directlake.
+from sempy_labs.directlake._dl_helper import (
     check_fallback_reason,
+    generate_direct_lake_semantic_model,
+    get_direct_lake_source,
 )
 from sempy_labs.directlake._get_directlake_lakehouse import get_direct_lake_lakehouse
 from sempy_labs.directlake._get_shared_expression import get_shared_expression
@@ -21,6 +23,7 @@ from sempy_labs.directlake._update_directlake_model_lakehouse_connection import
 )
 from sempy_labs.directlake._update_directlake_partition_entity import (
     update_direct_lake_partition_entity,
+    add_table_to_direct_lake_semantic_model,
 )
 from sempy_labs.directlake._warm_cache import (
     warm_direct_lake_cache_isresident,
@@ -42,4 +45,7 @@ __all__ = [
     "update_direct_lake_partition_entity",
     "warm_direct_lake_cache_isresident",
     "warm_direct_lake_cache_perspective",
+    "add_table_to_direct_lake_semantic_model",
+    "generate_direct_lake_semantic_model",
+    "get_direct_lake_source",
 ]
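A short sketch of the new exports listed in __all__ above (the dataset name is a placeholder). As used in the _vertipaq.py diff, get_direct_lake_source returns the artifact type plus the name, id and workspace id of the model's Direct Lake source.

from sempy_labs.directlake import (
    add_table_to_direct_lake_semantic_model,
    generate_direct_lake_semantic_model,
    get_direct_lake_source,
)

# Resolve the lakehouse (or warehouse) behind a Direct Lake model.
artifact_type, source_name, source_id, source_workspace_id = get_direct_lake_source(
    dataset="Sales Model",  # placeholder model name
    workspace=None,         # None resolves to the current workspace
)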
sempy_labs/directlake/_directlake_schema_compare.py
CHANGED

@@ -1,24 +1,22 @@
-import sempy
 import sempy.fabric as fabric
 import pandas as pd
 from sempy_labs._helper_functions import (
     format_dax_object_name,
-    resolve_lakehouse_name,
-    get_direct_lake_sql_endpoint,
 )
 from IPython.display import display
-from sempy_labs.lakehouse
+from sempy_labs.lakehouse import get_lakehouse_columns
+from sempy_labs.directlake._dl_helper import get_direct_lake_source
 from sempy_labs._list_functions import list_tables
 from typing import Optional
 import sempy_labs._icons as icons
 from sempy._utils._log import log

+
 @log
 def direct_lake_schema_compare(
     dataset: str,
     workspace: Optional[str] = None,
-    lakehouse_workspace: Optional[str] = None,
+    **kwargs,
 ):
     """
     Checks that all the tables in a Direct Lake semantic model map to tables in their corresponding lakehouse and that the columns in each table exist.
@@ -31,38 +29,41 @@ def direct_lake_schema_compare(
         The Fabric workspace name.
         Defaults to None which resolves to the workspace of the attached lakehouse
         or if no lakehouse attached, resolves to the workspace of the notebook.
-    lakehouse : str, default=None
-        The Fabric lakehouse used by the Direct Lake semantic model.
-        Defaults to None which resolves to the lakehouse attached to the notebook.
-    lakehouse_workspace : str, default=None
-        The Fabric workspace used by the lakehouse.
-        Defaults to None which resolves to the workspace of the attached lakehouse
-        or if no lakehouse attached, resolves to the workspace of the notebook.
     """

+    if "lakehouse" in kwargs:
+        print(
+            "The 'lakehouse' parameter has been deprecated as it is no longer necessary. Please remove this parameter from the function going forward."
+        )
+        del kwargs["lakehouse"]
+    if "lakehouse_workspace" in kwargs:
+        print(
+            "The 'lakehouse_workspace' parameter has been deprecated as it is no longer necessary. Please remove this parameter from the function going forward."
+        )
+        del kwargs["lakehouse_workspace"]
+
     workspace = fabric.resolve_workspace_name(workspace)

+    artifact_type, lakehouse_name, lakehouse_id, lakehouse_workspace_id = (
+        get_direct_lake_source(dataset=dataset, workspace=workspace)
+    )
+    lakehouse_workspace = fabric.resolve_workspace_name(lakehouse_workspace_id)

+    if artifact_type == "Warehouse":
+        raise ValueError(
+            f"{icons.red_dot} This function is only valid for Direct Lake semantic models which source from Fabric lakehouses (not warehouses)."
+        )

     dfP = fabric.list_partitions(dataset=dataset, workspace=workspace)
-    sqlEndpointId = get_direct_lake_sql_endpoint(dataset, workspace)
-    dfI = fabric.list_items(workspace=lakehouse_workspace, type="SQLEndpoint")
-    dfI_filt = dfI[(dfI["Id"] == sqlEndpointId)]
-    if len(dfI_filt) == 0:
-        raise ValueError(f"{icons.red_dot} The SQL Endpoint in the '{dataset}' semantic model in the '{workspace} workspace does not point to the '{lakehouse}' lakehouse in the '{lakehouse_workspace}' workspace as specified.")

     if not any(r["Mode"] == "DirectLake" for i, r in dfP.iterrows()):
-        raise ValueError(
+        raise ValueError(
+            f"{icons.red_dot} The '{dataset}' semantic model is not in Direct Lake mode."
+        )

     dfT = list_tables(dataset, workspace)
     dfC = fabric.list_columns(dataset=dataset, workspace=workspace)
-    lc = get_lakehouse_columns(
+    lc = get_lakehouse_columns(lakehouse_name, lakehouse_workspace)

     dfT.rename(columns={"Type": "Table Type"}, inplace=True)
     dfP_filt = dfP[dfP["Mode"] == "DirectLake"]
@@ -88,19 +89,21 @@ def direct_lake_schema_compare(

     if len(missingtbls) == 0:
         print(
-            f"{icons.green_dot} All tables exist in the '{
+            f"{icons.green_dot} All tables exist in the '{lakehouse_name}' lakehouse within the '{lakehouse_workspace}' workspace."
         )
     else:
         print(
-            f"{icons.yellow_dot} The following tables exist in the '{dataset}' semantic model within the '{workspace}' workspace
+            f"{icons.yellow_dot} The following tables exist in the '{dataset}' semantic model within the '{workspace}' workspace"
+            f" but do not exist in the '{lakehouse_name}' lakehouse within the '{lakehouse_workspace}' workspace."
         )
         display(missingtbls)
     if len(missingcols) == 0:
         print(
-            f"{icons.green_dot} All columns exist in the '{
+            f"{icons.green_dot} All columns exist in the '{lakehouse_name}' lakehouse within the '{lakehouse_workspace}' workspace."
        )
     else:
         print(
-            f"{icons.yellow_dot} The following columns exist in the '{dataset}' semantic model within the '{workspace}' workspace
+            f"{icons.yellow_dot} The following columns exist in the '{dataset}' semantic model within the '{workspace}' workspace "
+            f"but do not exist in the '{lakehouse_name}' lakehouse within the '{lakehouse_workspace}' workspace."
         )
         display(missingcols)