semantic-link-labs 0.9.9__py3-none-any.whl → 0.9.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/METADATA +30 -22
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/RECORD +47 -40
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/WHEEL +1 -1
- sempy_labs/__init__.py +28 -1
- sempy_labs/_clear_cache.py +12 -0
- sempy_labs/_dax.py +8 -2
- sempy_labs/_delta_analyzer.py +17 -26
- sempy_labs/_environments.py +19 -1
- sempy_labs/_generate_semantic_model.py +7 -8
- sempy_labs/_helper_functions.py +351 -151
- sempy_labs/_kql_databases.py +18 -0
- sempy_labs/_kusto.py +137 -0
- sempy_labs/_list_functions.py +18 -36
- sempy_labs/_model_bpa_rules.py +13 -3
- sempy_labs/_notebooks.py +44 -11
- sempy_labs/_semantic_models.py +93 -1
- sempy_labs/_sql.py +3 -2
- sempy_labs/_tags.py +194 -0
- sempy_labs/_variable_libraries.py +89 -0
- sempy_labs/_vertipaq.py +6 -6
- sempy_labs/_vpax.py +386 -0
- sempy_labs/_warehouses.py +3 -3
- sempy_labs/admin/__init__.py +14 -0
- sempy_labs/admin/_artifacts.py +3 -3
- sempy_labs/admin/_capacities.py +161 -1
- sempy_labs/admin/_dataflows.py +45 -0
- sempy_labs/admin/_items.py +16 -11
- sempy_labs/admin/_tags.py +126 -0
- sempy_labs/admin/_tenant.py +5 -5
- sempy_labs/directlake/_generate_shared_expression.py +29 -26
- sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py +55 -5
- sempy_labs/dotnet_lib/dotnet.runtime.config.json +10 -0
- sempy_labs/lakehouse/__init__.py +16 -0
- sempy_labs/lakehouse/_blobs.py +115 -63
- sempy_labs/lakehouse/_get_lakehouse_columns.py +41 -18
- sempy_labs/lakehouse/_get_lakehouse_tables.py +62 -47
- sempy_labs/lakehouse/_helper.py +211 -0
- sempy_labs/lakehouse/_lakehouse.py +45 -36
- sempy_labs/lakehouse/_livy_sessions.py +137 -0
- sempy_labs/migration/_migrate_calctables_to_lakehouse.py +7 -12
- sempy_labs/migration/_refresh_calc_tables.py +7 -6
- sempy_labs/report/_download_report.py +1 -1
- sempy_labs/report/_generate_report.py +5 -1
- sempy_labs/report/_reportwrapper.py +31 -18
- sempy_labs/tom/_model.py +104 -35
- sempy_labs/report/_bpareporttemplate/.pbi/localSettings.json +0 -9
- sempy_labs/report/_bpareporttemplate/.platform +0 -11
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/licenses/LICENSE +0 -0
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,7 @@ from sempy_labs._helper_functions import (
 )
 from sempy._utils._log import log
 from sempy_labs.tom import connect_semantic_model
-from typing import Optional
+from typing import Optional, List
 import sempy_labs._icons as icons
 from uuid import UUID
 import re
@@ -19,7 +19,9 @@ def _extract_expression_list(expression):
     """
 
     pattern_sql = r'Sql\.Database\s*\(\s*"([^"]+)"\s*,\s*"([^"]+)"\s*\)'
-    pattern_no_sql =
+    pattern_no_sql = (
+        r'AzureStorage\.DataLake\(".*?/([0-9a-fA-F\-]{36})/([0-9a-fA-F\-]{36})"'
+    )
 
     match_sql = re.search(pattern_sql, expression)
     match_no_sql = re.search(pattern_no_sql, expression)
@@ -102,6 +104,7 @@ def update_direct_lake_model_connection(
     source_type: str = "Lakehouse",
     source_workspace: Optional[str | UUID] = None,
     use_sql_endpoint: bool = True,
+    tables: Optional[str | List[str]] = None,
 ):
     """
     Remaps a Direct Lake semantic model's SQL Endpoint connection to a new lakehouse/warehouse.
@@ -126,12 +129,19 @@ def update_direct_lake_model_connection(
     use_sql_endpoint : bool, default=True
         If True, the SQL Endpoint will be used for the connection.
         If False, Direct Lake over OneLake will be used.
+    tables : str | List[str], default=None
+        The name(s) of the table(s) to update in the Direct Lake semantic model.
+        If None, all tables will be updated (if there is only one expression).
+        If multiple tables are specified, they must be provided as a list.
     """
     if use_sql_endpoint:
         icons.sll_tags.append("UpdateDLConnection_SQL")
     else:
         icons.sll_tags.append("UpdateDLConnection_DLOL")
 
+    if isinstance(tables, str):
+        tables = [tables]
+
     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
     (dataset_name, dataset_id) = resolve_dataset_name_and_id(dataset, workspace_id)
 
@@ -174,7 +184,12 @@ def update_direct_lake_model_connection(
         )
 
         # Update the single connection expression
-        if len(expressions)
+        if len(expressions) > 1 and not tables:
+            print(
+                f"{icons.info} Multiple expressions found in the model. Please specify the tables to update using the 'tables parameter."
+            )
+            return
+        elif len(expressions) == 1 and not tables:
             expr = expressions[0]
             tom.model.Expressions[expr].Expression = shared_expression
 
@@ -182,6 +197,41 @@ def update_direct_lake_model_connection(
                 f"{icons.green_dot} The expression in the '{dataset_name}' semantic model within the '{workspace_name}' workspace has been updated to point to the '{source}' {source_type.lower()} in the '{source_workspace}' workspace."
             )
         else:
-
-
+            import sempy
+
+            sempy.fabric._client._utils._init_analysis_services()
+            import Microsoft.AnalysisServices.Tabular as TOM
+
+            expr_list = _extract_expression_list(shared_expression)
+
+            expr_name = next(
+                (name for name, exp in expression_dict.items() if exp == expr_list),
+                None,
             )
+
+            # If the expression does not already exist, create it
+            def generate_unique_name(existing_names):
+                i = 1
+                while True:
+                    candidate = f"DatabaseQuery{i}"
+                    if candidate not in existing_names:
+                        return candidate
+                    i += 1
+
+            if not expr_name:
+                expr_name = generate_unique_name(expressions)
+                tom.add_expression(name=expr_name, expression=shared_expression)
+
+            all_tables = [t.Name for t in tom.model.Tables]
+            for t_name in tables:
+                if t_name not in all_tables:
+                    raise ValueError(
+                        f"{icons.red_dot} The table '{t_name}' does not exist in the '{dataset_name}' semantic model within the '{workspace_name}' workspace."
+                    )
+                p = next(p for p in tom.model.Tables[t_name].Partitions)
+                if p.Mode != TOM.ModeType.DirectLake:
+                    raise ValueError(
+                        f"{icons.red_dot} The table '{t_name}' in the '{dataset_name}' semantic model within the '{workspace_name}' workspace is not in Direct Lake mode. This function is only applicable to Direct Lake tables."
+                    )
+
+                p.Source.ExpressionSource = tom.model.Expressions[expr_name]
sempy_labs/lakehouse/__init__.py CHANGED
@@ -20,6 +20,16 @@ from sempy_labs.lakehouse._shortcuts import (
 from sempy_labs.lakehouse._blobs import (
     recover_lakehouse_object,
     list_blobs,
+    get_user_delegation_key,
+)
+from sempy_labs.lakehouse._livy_sessions import (
+    list_livy_sessions,
+)
+from sempy_labs.lakehouse._helper import (
+    is_v_ordered,
+    delete_lakehouse,
+    update_lakehouse,
+    load_table,
 )
 
 __all__ = [
@@ -36,4 +46,10 @@ __all__ = [
     "list_shortcuts",
     "recover_lakehouse_object",
     "list_blobs",
+    "list_livy_sessions",
+    "is_v_ordered",
+    "delete_lakehouse",
+    "update_lakehouse",
+    "load_table",
+    "get_user_delegation_key",
 ]
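
These additions surface the new helpers at the package level; a short sketch of what becomes importable after the upgrade (only the import is shown, since the helper signatures live in modules outside this diff):

from sempy_labs.lakehouse import (
    list_livy_sessions,
    is_v_ordered,
    delete_lakehouse,
    update_lakehouse,
    load_table,
    get_user_delegation_key,
)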
sempy_labs/lakehouse/_blobs.py CHANGED
@@ -11,6 +11,7 @@ from typing import Optional, List
 import sempy_labs._icons as icons
 import xml.etree.ElementTree as ET
 import pandas as pd
+from sempy.fabric.exceptions import FabricHTTPException
 
 
 def _request_blob_api(
@@ -18,6 +19,7 @@ def _request_blob_api(
     method: str = "get",
     payload: Optional[dict] = None,
     status_codes: int | List[int] = 200,
+    uses_pagination: bool = False,
 ):
 
     import requests
@@ -31,21 +33,41 @@ def _request_blob_api(
 
     headers = {
         "Authorization": f"Bearer {token}",
-        "Content-Type": "application/
+        "Content-Type": "application/xml",
         "x-ms-version": "2025-05-05",
     }
 
-
-
-
-
-
-
+    base_url = "https://onelake.blob.fabric.microsoft.com/"
+    full_url = f"{base_url}{request}"
+    results = []
+
+    while True:
+        response = requests.request(
+            method.upper(),
+            full_url,
+            headers=headers,
+            data=payload if method.lower() != "get" else None,
+        )
+
+        if response.status_code not in status_codes:
+            raise FabricHTTPException(response)
+
+        if not uses_pagination:
+            return response
+
+        # Parse XML to find blobs and NextMarker
+        root = ET.fromstring(response.content)
+        results.append(root)
+
+        next_marker = root.findtext(".//NextMarker")
+        if not next_marker:
+            break  # No more pages
 
-
-
+        # Append the marker to the original request (assuming query string format)
+        delimiter = "&" if "?" in request else "?"
+        full_url = f"{base_url}{request}{delimiter}marker={next_marker}"
 
-    return
+    return results
 
 
 @log
@@ -90,12 +112,6 @@ def list_blobs(
     )
     path_prefix = f"{workspace_id}/{lakehouse_id}/{container}"
 
-    response = _request_blob_api(
-        request=f"{path_prefix}?restype=container&comp=list&include=deleted"
-    )
-    root = ET.fromstring(response.content)
-    response_json = _xml_to_dict(root)
-
     columns = {
         "Blob Name": "str",
         "Is Deleted": "bool",
@@ -122,37 +138,55 @@ def list_blobs(
 
     df = _create_dataframe(columns=columns)
 
-
-
-
-
-
-
-
-
-
-
-
-
-    "
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    url = f"{path_prefix}?restype=container&comp=list&include=deleted"
+
+    responses = _request_blob_api(
+        request=url,
+        uses_pagination=True,
+    )
+
+    dfs = []
+    for root in responses:
+        response_json = _xml_to_dict(root)
+
+        blobs = (
+            response_json.get("EnumerationResults", {}).get("Blobs", {}).get("Blob", [])
+        )
+
+        if isinstance(blobs, dict):
+            blobs = [blobs]
+
+        for blob in blobs:
+            p = blob.get("Properties", {})
+            new_data = {
+                "Blob Name": blob.get("Name"),
+                "Is Deleted": blob.get("Deleted", False),
+                "Deletion Id": blob.get("DeletionId"),
+                "Creation Time": p.get("Creation-Time"),
+                "Expiry Time": p.get("Expiry-Time"),
+                "Etag": p.get("Etag"),
+                "Resource Type": p.get("ResourceType"),
+                "Content Length": p.get("Content-Length"),
+                "Content Type": p.get("Content-Type"),
+                "Content Encoding": p.get("Content-Encoding"),
+                "Content Language": p.get("Content-Language"),
+                "Content CRC64": p.get("Content-CRC64"),
+                "Content MD5": p.get("Content-MD5"),
+                "Cache Control": p.get("Cache-Control"),
+                "Content Disposition": p.get("Content-Disposition"),
+                "Blob Type": p.get("BlobType"),
+                "Access Tier": p.get("AccessTier"),
+                "Access Tier Inferred": p.get("AccessTierInferred"),
+                "Server Encrypted": p.get("ServerEncrypted"),
+                "Deleted Time": p.get("DeletedTime"),
+                "Remaining Retention Days": p.get("RemainingRetentionDays"),
+            }
+
+            dfs.append(pd.DataFrame(new_data, index=[0]))
+
+    if dfs:
+        df = pd.concat(dfs, ignore_index=True)
+        _update_dataframe_datatypes(dataframe=df, column_map=columns)
 
     return df
 
@@ -182,7 +216,7 @@ def recover_lakehouse_object(
     workspace_id = resolve_workspace_id(workspace)
     lakehouse_id = resolve_lakehouse_id(lakehouse, workspace_id)
 
-
+    blob_name = f"{lakehouse_id}/{file_path}"
 
     container = file_path.split("/")[0]
     if container not in ["Tables", "Files"]:
@@ -190,29 +224,45 @@ def recover_lakehouse_object(
             f"{icons.red_dot} Invalid container '{container}' within the file_path parameter. Expected 'Tables' or 'Files'."
         )
 
-
+    # Undelete the blob
+    print(f"{icons.in_progress} Attempting to recover the '{blob_name}' blob...")
 
-
-
-
-
-
-
-
-
+    try:
+        _request_blob_api(
+            request=f"{workspace_id}/{lakehouse_id}/{file_path}?comp=undelete",
+            method="put",
+        )
+        print(
+            f"{icons.green_dot} The '{blob_name}' blob recover attempt was successful."
+        )
+    except FabricHTTPException as e:
+        if e.status_code == 404:
+            print(
+                f"{icons.warning} The '{blob_name}' blob was not found. No action taken."
+            )
+        else:
+            print(
+                f"{icons.red_dot} An error occurred while recovering the '{blob_name}' blob: {e}"
            )
-    print(f"{icons.green_dot} The '{blob_name}' blob has been restored.")
 
 
-def
+def get_user_delegation_key():
+    """
+    Gets a key that can be used to sign a user delegation SAS (shared access signature). A user delegation SAS grants access to Azure Blob Storage resources by using Microsoft Entra credentials.
 
-
+    This is a wrapper function for the following API: `Get User Delegation Key <https://learn.microsoft.com/rest/api/storageservices/get-user-delegation-key>`_.
+
+    Returns
+    -------
+    str
+        The user delegation key value.
+    """
 
     from datetime import datetime, timedelta, timezone
 
     utc_now = datetime.now(timezone.utc)
     start_time = utc_now + timedelta(minutes=2)
-    expiry_time = start_time + timedelta(minutes=
+    expiry_time = start_time + timedelta(minutes=60)
     start_str = start_time.strftime("%Y-%m-%dT%H:%M:%SZ")
     expiry_str = expiry_time.strftime("%Y-%m-%dT%H:%M:%SZ")
 
@@ -223,9 +273,11 @@ def _get_user_delegation_key():
     </KeyInfo>"""
 
     response = _request_blob_api(
-        request="restype=service&comp=userdelegationkey",
+        request="?restype=service&comp=userdelegationkey",
         method="post",
         payload=payload,
     )
 
-
+    root = ET.fromstring(response.content)
+    response_json = _xml_to_dict(root)
+    return response_json.get("UserDelegationKey", {}).get("Value", None)
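
A hedged usage sketch of the reworked blob helpers: list_blobs now follows NextMarker continuation tokens inside _request_blob_api and concatenates every page into one DataFrame, and the delegation-key helper is public as get_user_delegation_key. The list_blobs parameter names below are assumptions (they are not visible in these hunks); get_user_delegation_key takes no arguments per the hunk above:

from sempy_labs.lakehouse import list_blobs, get_user_delegation_key

# All blobs across every result page, including soft-deleted ones (include=deleted).
blobs_df = list_blobs(lakehouse="MyLakehouse", workspace="Analytics")  # parameter names assumed
deleted_blobs = blobs_df[blobs_df["Is Deleted"]]

# Key value that can be used to sign a user delegation SAS.
delegation_key = get_user_delegation_key()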
sempy_labs/lakehouse/_get_lakehouse_columns.py CHANGED
@@ -1,14 +1,17 @@
 import pandas as pd
+import re
 from sempy_labs._helper_functions import (
     format_dax_object_name,
     resolve_workspace_name_and_id,
     resolve_lakehouse_name_and_id,
     _create_dataframe,
-
+    _get_delta_table,
+    _pure_python_notebook,
 )
 from typing import Optional
 from sempy._utils._log import log
 from uuid import UUID
+import sempy_labs._icons as icons
 
 
 @log
@@ -16,7 +19,9 @@ def get_lakehouse_columns(
     lakehouse: Optional[str | UUID] = None, workspace: Optional[str | UUID] = None
 ) -> pd.DataFrame:
     """
-    Shows the tables and columns of a lakehouse and their respective properties.
+    Shows the tables and columns of a lakehouse and their respective properties. This function can be executed in either a PySpark or pure Python notebook. Note that data types may show differently when using PySpark vs pure Python.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
 
     Parameters
     ----------
@@ -34,7 +39,6 @@ def get_lakehouse_columns(
         Shows the tables/columns within a lakehouse and their properties.
     """
     from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
-    from delta import DeltaTable
 
     columns = {
         "Workspace Name": "string",
@@ -51,29 +55,48 @@ def get_lakehouse_columns(
         lakehouse=lakehouse, workspace=workspace_id
     )
 
-    spark = _create_spark_session()
-
     tables = get_lakehouse_tables(
         lakehouse=lakehouse_id, workspace=workspace_id, extended=False, count_rows=False
     )
     tables_filt = tables[tables["Format"] == "delta"]
 
-
-
-
-        delta_table = DeltaTable.forPath(spark, path)
-        sparkdf = delta_table.toDF()
-
-        for col_name, data_type in sparkdf.dtypes:
-            full_column_name = format_dax_object_name(table_name, col_name)
-            new_data = {
+    def add_column_metadata(table_name, col_name, data_type):
+        new_rows.append(
+            {
                 "Workspace Name": workspace_name,
-                "Lakehouse Name":
+                "Lakehouse Name": lakehouse_name,
                 "Table Name": table_name,
                 "Column Name": col_name,
-                "Full Column Name":
+                "Full Column Name": format_dax_object_name(table_name, col_name),
                 "Data Type": data_type,
             }
-
+        )
+
+    new_rows = []
+
+    for _, r in tables_filt.iterrows():
+        table_name = r["Table Name"]
+        path = r["Location"]
+
+        if _pure_python_notebook():
+            from deltalake import DeltaTable
+
+            table_schema = DeltaTable(path).schema()
+
+            for field in table_schema.fields:
+                col_name = field.name
+                match = re.search(r'"(.*?)"', str(field.type))
+                if not match:
+                    raise ValueError(
+                        f"{icons.red_dot} Could not find data type for column {col_name}."
+                    )
+                data_type = match.group(1)
+                add_column_metadata(table_name, col_name, data_type)
+        else:
+            delta_table = _get_delta_table(path=path)
+            table_df = delta_table.toDF()
+
+            for col_name, data_type in table_df.dtypes:
+                add_column_metadata(table_name, col_name, data_type)
 
-    return df
+    return pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)