semantic-link-labs 0.9.9__py3-none-any.whl → 0.9.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.10.dist-info}/METADATA +5 -3
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.10.dist-info}/RECORD +29 -27
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.10.dist-info}/WHEEL +1 -1
- sempy_labs/__init__.py +6 -0
- sempy_labs/_clear_cache.py +12 -0
- sempy_labs/_dax.py +8 -2
- sempy_labs/_delta_analyzer.py +8 -18
- sempy_labs/_generate_semantic_model.py +6 -7
- sempy_labs/_helper_functions.py +205 -64
- sempy_labs/_kql_databases.py +18 -0
- sempy_labs/_kusto.py +135 -0
- sempy_labs/_list_functions.py +5 -1
- sempy_labs/_vertipaq.py +6 -6
- sempy_labs/_warehouses.py +3 -3
- sempy_labs/admin/__init__.py +6 -0
- sempy_labs/admin/_artifacts.py +3 -3
- sempy_labs/admin/_capacities.py +161 -1
- sempy_labs/admin/_dataflows.py +45 -0
- sempy_labs/admin/_items.py +16 -11
- sempy_labs/admin/_tenant.py +5 -5
- sempy_labs/directlake/_generate_shared_expression.py +25 -26
- sempy_labs/lakehouse/_get_lakehouse_columns.py +41 -18
- sempy_labs/lakehouse/_get_lakehouse_tables.py +66 -39
- sempy_labs/lakehouse/_lakehouse.py +44 -35
- sempy_labs/migration/_migrate_calctables_to_lakehouse.py +7 -12
- sempy_labs/migration/_refresh_calc_tables.py +7 -6
- sempy_labs/tom/_model.py +21 -14
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.10.dist-info}/licenses/LICENSE +0 -0
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.10.dist-info}/top_level.txt +0 -0
sempy_labs/admin/_capacities.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Optional, Tuple
 from sempy._utils._log import log
 from sempy_labs._helper_functions import (
     _base_api,
+    _build_url,
     _create_dataframe,
     _update_dataframe_datatypes,
     _is_valid_uuid,
@@ -57,6 +58,24 @@ def _resolve_capacity_name_and_id(
     return capacity_name, capacity_id


+def _resolve_capacity_id(
+    capacity: str | UUID,
+) -> UUID:
+
+    if _is_valid_uuid(capacity):
+        capacity_id = capacity
+    else:
+        dfC = list_capacities(capacity=capacity)
+        if dfC.empty:
+            raise ValueError(
+                f"{icons.red_dot} The '{capacity}' capacity was not found."
+            )
+
+        capacity_id = dfC["Capacity Id"].iloc[0]
+
+    return capacity_id
+
+
 def _list_capacities_meta() -> pd.DataFrame:
     """
     Shows the a list of capacities and their properties. This function is the admin version.
@@ -221,7 +240,7 @@ def list_capacities(
         "Sku": "string",
         "Region": "string",
         "State": "string",
-        "Admins": "
+        "Admins": "list",
     }
     df = _create_dataframe(columns=columns)

@@ -309,3 +328,144 @@ def list_capacity_users(capacity: str | UUID) -> pd.DataFrame:
     _update_dataframe_datatypes(dataframe=df, column_map=columns)

     return df
+
+
+@log
+def get_refreshables(
+    top: Optional[int] = None,
+    expand: Optional[str] = None,
+    filter: Optional[str] = None,
+    skip: Optional[int] = None,
+    capacity: Optional[str | UUID] = None,
+) -> pd.DataFrame | dict:
+    """
+    Returns a list of refreshables for the organization within a capacity.
+
+    Power BI retains a seven-day refresh history for each dataset, up to a maximum of sixty refreshes.
+
+    This is a wrapper function for the following API: `Admin - Get Refreshables <https://learn.microsoft.com/rest/api/power-bi/admin/get-refreshables>`_.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
+    Parameters
+    ----------
+    top : int, default=None
+        Returns only the first n results.
+    expand : str, default=None
+        Accepts a comma-separated list of data types, which will be expanded inline in the response. Supports capacities and groups.
+    filter : str, default=None
+        Returns a subset of a results based on Odata filter query parameter condition.
+    skip : int, default=None
+        Skips the first n results. Use with top to fetch results beyond the first 1000.
+    capacity : str | uuid.UUID, default=None
+        The capacity name or ID to filter. If None, all capacities are returned.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Returns a list of refreshables for the organization within a capacity.
+    """
+
+    columns = {
+        "Workspace Id": "string",
+        "Workspace Name": "string",
+        "Item Id": "string",
+        "Item Name": "string",
+        "Item Kind": "string",
+        "Capacity Id": "string",
+        "Capacity Name": "string",
+        "Capacity SKU": "string",
+        "Refresh Count": "int",
+        "Refresh Failures": "int",
+        "Average Duration": "float",
+        "Median Duration": "float",
+        "Refreshes Per Day": "int",
+        "Refresh Type": "string",
+        "Start Time": "string",
+        "End Time": "string",
+        "Status": "string",
+        "Request Id": "string",
+        "Service Exception Json": "string",
+        "Extended Status": "dict",
+        "Refresh Attempts": "list",
+        "Refresh Schedule Days": "list",
+        "Refresh Schedule Times": "list",
+        "Refresh Schedule Enabled": "bool",
+        "Refresh Schedule Local Timezone Id": "string",
+        "Refresh Schedule Notify Option": "string",
+        "Configured By": "list",
+    }
+
+    df = _create_dataframe(columns=columns)
+
+    params = {}
+    url = (
+        "/v1.0/myorg/admin/capacities/refreshables"
+        if capacity is None
+        else f"/v1.0/myorg/admin/capacities/{_resolve_capacity_id(capacity=capacity)}/refreshables"
+    )
+
+    if top is not None:
+        params["$top"] = top
+
+    if expand is not None:
+        params["$expand"] = expand
+
+    if filter is not None:
+        params["$filter"] = filter
+
+    if skip is not None:
+        params["$skip"] = skip
+
+    url = _build_url(url, params)
+
+    responses = _base_api(request=url, client="fabric_sp")
+
+    refreshables = []
+
+    for i in responses.json().get("value", []):
+        last_refresh = i.get("lastRefresh", {})
+        refresh_schedule = i.get("refreshSchedule", {})
+        new_data = {
+            "Workspace Id": i.get("group", {}).get("id"),
+            "Workspace Name": i.get("group", {}).get("name"),
+            "Item Id": i.get("id"),
+            "Item Name": i.get("name"),
+            "Item Kind": i.get("kind"),
+            "Capacity Id": (
+                i.get("capacity", {}).get("id").lower()
+                if i.get("capacity", {}).get("id")
+                else None
+            ),
+            "Capacity Name": i.get("capacity", {}).get("displayName"),
+            "Capacity SKU": i.get("capacity", {}).get("sku"),
+            "Refresh Count": i.get("refreshCount", 0),
+            "Refresh Failures": i.get("refreshFailures", 0),
+            "Average Duration": i.get("averageDuration", 0),
+            "Median Duration": i.get("medianDuration", 0),
+            "Refreshes Per Day": i.get("refreshesPerDay", 0),
+            "Refresh Type": last_refresh.get("refreshType"),
+            "Start Time": last_refresh.get("startTime"),
+            "End Time": last_refresh.get("endTime"),
+            "Status": last_refresh.get("status"),
+            "Request Id": last_refresh.get("requestId"),
+            "Service Exception Json": last_refresh.get("serviceExceptionJson"),
+            "Extended Status": last_refresh.get("extendedStatus"),
+            "Refresh Attempts": last_refresh.get("refreshAttempts"),
+            "Refresh Schedule Days": refresh_schedule.get("days"),
+            "Refresh Schedule Times": refresh_schedule.get("times"),
+            "Refresh Schedule Enabled": refresh_schedule.get("enabled"),
+            "Refresh Schedule Local Timezone Id": refresh_schedule.get(
+                "localTimeZoneId"
+            ),
+            "Refresh Schedule Notify Option": refresh_schedule.get("notifyOption"),
+            "Configured By": i.get("configuredBy"),
+        }
+
+        refreshables.append(new_data)
+
+    if len(refreshables) > 0:
+        df = pd.DataFrame(refreshables)
+        _update_dataframe_datatypes(dataframe=df, column_map=columns)
+
+    return df
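For orientation, a minimal usage sketch of the new get_refreshables function. It assumes the function is re-exported from sempy_labs.admin (consistent with the sempy_labs/admin/__init__.py change listed above); the capacity name and the $expand/$top values are illustrative placeholders, not taken from the package.

    import sempy_labs.admin as admin

    # Organization-wide view of refreshables, expanded with capacity and group details.
    df_all = admin.get_refreshables(expand="capacities,groups", top=100)

    # Scoped to a single capacity; "MyCapacity" is a placeholder (a capacity UUID also works).
    df_one = admin.get_refreshables(capacity="MyCapacity")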
sempy_labs/admin/_dataflows.py
ADDED
@@ -0,0 +1,45 @@
+from typing import Optional
+from sempy_labs._helper_functions import (
+    _base_api,
+)
+from sempy_labs.admin._items import (
+    _resolve_item_id,
+)
+from uuid import UUID
+from sempy._utils._log import log
+
+
+@log
+def export_dataflow(
+    dataflow: str | UUID,
+    workspace: Optional[str | UUID] = None,
+) -> dict:
+    """
+    Shows a list of datasets for the organization.
+
+    This is a wrapper function for the following API: `Admin - Dataflows ExportDataflowAsAdmin <https://learn.microsoft.com/rest/api/power-bi/admin/dataflows-export-dataflow-as-admin>`_.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
+    Parameters
+    ----------
+    dataflow : str | UUID, default=None
+        The dataflow Name or Id.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or id.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+        Only used if given a dataflow name and not an id.
+
+    Returns
+    -------
+    dict
+        Exported Json file.
+    """
+    dataflow_id = _resolve_item_id(item=dataflow, type="dataflow", workspace=workspace)
+
+    url = f"/v1.0/myorg/admin/dataflows/{dataflow_id}/export"
+
+    response = _base_api(request=url, client="fabric_sp")
+
+    return response.json()
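A short, hypothetical sketch of calling the new admin export. The dataflow and workspace names are placeholders, and the import path assumes the function is re-exported from sempy_labs.admin; the function returns the exported dataflow JSON as a dict.

    import json
    import sempy_labs.admin as admin

    # Export a dataflow definition as a dict; "Sales Dataflow" and "Finance" are placeholder names.
    definition = admin.export_dataflow(dataflow="Sales Dataflow", workspace="Finance")
    print(json.dumps(definition, indent=2)[:500])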
sempy_labs/admin/_items.py
CHANGED
@@ -17,20 +17,26 @@ from sempy_labs._helper_functions import (


 def _resolve_item_id(
-
+    item: str,
     type: Optional[str] = None,
     workspace: Optional[str | UUID] = None,
 ) -> UUID:
+    if _is_valid_uuid(item):
+        item_id = item

-
-
+    else:
+        workspace_id = _resolve_workspace_name_and_id(workspace)[1]
+        dfI = list_items(workspace=workspace_id, type=type)
+        dfI_filt = dfI[dfI["Item Name"] == item]

-
-
-
-
+        if len(dfI_filt) == 0:
+            raise ValueError(
+                f"The '{item}' {type} does not exist within the '{workspace}' workspace or is not of type '{type}'."
+            )
+
+        item_id = dfI_filt["Item Id"].iloc[0]

-    return
+    return item_id


 def _resolve_item_name_and_id(
@@ -84,9 +90,8 @@ def list_items(
     capacity : str | uuid.UUID, default=None
         The capacity name or id.
     workspace : str | uuid.UUID, default=None
-        The Fabric workspace name.
-        Defaults to None which
-        or if no lakehouse attached, resolves to the workspace of the notebook.
+        The Fabric workspace name or id.
+        Defaults to None which looks into all the workspaces.
     state : str, default=None
         The item state.
     type : str, default=None
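The reworked _resolve_item_id lets admin item lookups accept either an item name or a UUID. A hedged sketch of the public list_items call it backs, assuming list_items is exported from sempy_labs.admin as shown in the hunk above; the type value and workspace name are placeholders.

    import sempy_labs.admin as admin

    # With workspace=None the search now spans all workspaces (per the updated docstring).
    all_lakehouses = admin.list_items(type="Lakehouse")

    # Scoped to one workspace, by name or UUID; "Finance" is a placeholder.
    scoped = admin.list_items(type="Lakehouse", workspace="Finance")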
sempy_labs/admin/_tenant.py
CHANGED
@@ -32,7 +32,7 @@ def list_tenant_settings() -> pd.DataFrame:
         "Enabled": "bool",
         "Can Specify Security Groups": "bool",
         "Tenant Setting Group": "string",
-        "Enabled Security Groups": "
+        "Enabled Security Groups": "list",
     }
     df = _create_dataframe(columns=columns)

@@ -86,9 +86,9 @@ def list_capacity_tenant_settings_overrides(
         "Setting Title": "string",
         "Setting Enabled": "bool",
         "Can Specify Security Groups": "bool",
-        "Enabled Security Groups": "
+        "Enabled Security Groups": "list",
         "Tenant Setting Group": "string",
-        "Tenant Setting Properties": "
+        "Tenant Setting Properties": "list",
         "Delegate to Workspace": "bool",
         "Delegated From": "string",
     }
@@ -395,7 +395,7 @@ def list_workspaces_tenant_settings_overrides() -> pd.DataFrame:
         "Title": "string",
         "Enabled": "bool",
         "Can Specify Security Groups": "bool",
-        "Enabled Security Groups": "
+        "Enabled Security Groups": "list",
         "Tenant Setting Group": "string",
         "Delegated From": "string",
     }
@@ -454,7 +454,7 @@ def list_domain_tenant_settings_overrides() -> pd.DataFrame:
         "Title": "string",
         "Enabled": "bool",
         "Can Specify Security Groups": "bool",
-        "Enabled Security Groups": "
+        "Enabled Security Groups": "list",
         "Tenant Setting Group": "string",
         "Delegated To Workspace": "bool",
         "Delegated From": "string",
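Since "Enabled Security Groups" (and "Tenant Setting Properties") are now typed as list columns, downstream code can treat the cells as Python lists. A small, hypothetical sketch, assuming list_tenant_settings is exported from sempy_labs.admin:

    import sempy_labs.admin as admin

    df = admin.list_tenant_settings()
    # Each cell in "Enabled Security Groups" is a list; explode to one row per security group.
    per_group = df.explode("Enabled Security Groups")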
sempy_labs/directlake/_generate_shared_expression.py
CHANGED
@@ -56,34 +56,33 @@ def generate_shared_expression(
         item=item_name, type=item_type, workspace=workspace_id
     )

-
-
-
-
+    if use_sql_endpoint:
+        item_type_rest = f"{item_type.lower()}s"
+        response = _base_api(
+            request=f"/v1/workspaces/{workspace_id}/{item_type_rest}/{item_id}"
+        )

-
+        prop = response.json().get("properties")

-
-
-
-
-
-
-
-
-
+        if item_type == "Lakehouse":
+            sqlprop = prop.get("sqlEndpointProperties")
+            sqlEPCS = sqlprop.get("connectionString")
+            sqlepid = sqlprop.get("id")
+            provStatus = sqlprop.get("provisioningStatus")
+        elif item_type == "Warehouse":
+            sqlEPCS = prop.get("connectionString")
+            sqlepid = item_id
+            provStatus = None

-
-
-
-
+        if provStatus == "InProgress":
+            raise ValueError(
+                f"{icons.red_dot} The SQL Endpoint for the '{item_name}' {item_type.lower()} within the '{workspace_name}' workspace has not yet been provisioned. Please wait until it has been provisioned."
+            )

-
-
-
-
-    # Build DL/OL expression
-    if not use_sql_endpoint and item_type == "Lakehouse":
-        return f'AzureDataLakeStorage{{"server":"onelake.dfs.fabric.microsoft.com","path":"/{workspace_id}/{item_id}/"}}'
-    else:
+        start_expr = "let\n\tdatabase = "
+        end_expr = "\nin\n\tdatabase"
+        mid_expr = f'Sql.Database("{sqlEPCS}", "{sqlepid}")'
         return f"{start_expr}{mid_expr}{end_expr}"
+    else:
+        # Build DL/OL expression
+        return f"""let\n\tSource = AzureStorage.DataLake("onelake.dfs.fabric.microsoft.com/{workspace_id}/{item_id}")\nin\n\tSource"""
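A sketch of the two expression flavors the rewritten function now returns: a Sql.Database(...) M expression over the item's SQL endpoint by default, and an AzureStorage.DataLake(...) OneLake expression when use_sql_endpoint is False. Keyword names are inferred from the variables in the hunk above, and the item/workspace names are placeholders; check the actual signature before relying on them.

    from sempy_labs.directlake import generate_shared_expression

    # Default path: expression built from the lakehouse/warehouse SQL endpoint.
    expr_sql = generate_shared_expression(
        item_name="MyLakehouse", item_type="Lakehouse", workspace="Finance"
    )

    # OneLake path: expression built directly from the workspace and item ids.
    expr_onelake = generate_shared_expression(
        item_name="MyLakehouse", item_type="Lakehouse", workspace="Finance", use_sql_endpoint=False
    )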
sempy_labs/lakehouse/_get_lakehouse_columns.py
CHANGED
@@ -1,14 +1,17 @@
 import pandas as pd
+import re
 from sempy_labs._helper_functions import (
     format_dax_object_name,
     resolve_workspace_name_and_id,
     resolve_lakehouse_name_and_id,
     _create_dataframe,
-
+    _get_delta_table,
+    _pure_python_notebook,
 )
 from typing import Optional
 from sempy._utils._log import log
 from uuid import UUID
+import sempy_labs._icons as icons


 @log
@@ -16,7 +19,9 @@ def get_lakehouse_columns(
     lakehouse: Optional[str | UUID] = None, workspace: Optional[str | UUID] = None
 ) -> pd.DataFrame:
     """
-    Shows the tables and columns of a lakehouse and their respective properties.
+    Shows the tables and columns of a lakehouse and their respective properties. This function can be executed in either a PySpark or pure Python notebook. Note that data types may show differently when using PySpark vs pure Python.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).

     Parameters
     ----------
@@ -34,7 +39,6 @@ def get_lakehouse_columns(
         Shows the tables/columns within a lakehouse and their properties.
     """
     from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
-    from delta import DeltaTable

     columns = {
         "Workspace Name": "string",
@@ -51,29 +55,48 @@ def get_lakehouse_columns(
         lakehouse=lakehouse, workspace=workspace_id
     )

-    spark = _create_spark_session()
-
     tables = get_lakehouse_tables(
         lakehouse=lakehouse_id, workspace=workspace_id, extended=False, count_rows=False
     )
     tables_filt = tables[tables["Format"] == "delta"]

-
-
-
-        delta_table = DeltaTable.forPath(spark, path)
-        sparkdf = delta_table.toDF()
-
-        for col_name, data_type in sparkdf.dtypes:
-            full_column_name = format_dax_object_name(table_name, col_name)
-            new_data = {
+    def add_column_metadata(table_name, col_name, data_type):
+        new_rows.append(
+            {
                 "Workspace Name": workspace_name,
-                "Lakehouse Name":
+                "Lakehouse Name": lakehouse_name,
                 "Table Name": table_name,
                 "Column Name": col_name,
-                "Full Column Name":
+                "Full Column Name": format_dax_object_name(table_name, col_name),
                 "Data Type": data_type,
             }
-
+        )
+
+    new_rows = []
+
+    for _, r in tables_filt.iterrows():
+        table_name = r["Table Name"]
+        path = r["Location"]
+
+        if _pure_python_notebook():
+            from deltalake import DeltaTable
+
+            table_schema = DeltaTable(path).schema()
+
+            for field in table_schema.fields:
+                col_name = field.name
+                match = re.search(r'"(.*?)"', str(field.type))
+                if not match:
+                    raise ValueError(
+                        f"{icons.red_dot} Could not find data type for column {col_name}."
+                    )
+                data_type = match.group(1)
+                add_column_metadata(table_name, col_name, data_type)
+        else:
+            delta_table = _get_delta_table(path=path)
+            table_df = delta_table.toDF()
+
+            for col_name, data_type in table_df.dtypes:
+                add_column_metadata(table_name, col_name, data_type)

-    return df
+    return pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
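A minimal usage sketch of the reworked function, assuming it is exported from sempy_labs.lakehouse; the lakehouse and workspace names are placeholders. The column names used below are taken from the hunk above.

    from sempy_labs.lakehouse import get_lakehouse_columns

    # Works in both PySpark and pure Python notebooks; data type strings may differ between the two engines.
    df_cols = get_lakehouse_columns(lakehouse="MyLakehouse", workspace="Finance")
    print(df_cols[["Table Name", "Column Name", "Data Type"]].head())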
sempy_labs/lakehouse/_get_lakehouse_tables.py
CHANGED
@@ -1,7 +1,7 @@
-import
+import os
 import pandas as pd
 import pyarrow.parquet as pq
-import datetime
+from datetime import datetime
 from sempy_labs._helper_functions import (
     _get_column_aggregate,
     resolve_workspace_name_and_id,
@@ -9,7 +9,13 @@ from sempy_labs._helper_functions import (
     save_as_delta_table,
     _base_api,
     _create_dataframe,
-
+    resolve_workspace_id,
+    resolve_lakehouse_id,
+    _read_delta_table,
+    _get_delta_table,
+    _mount,
+    create_abfss_path,
+    _pure_python_notebook,
 )
 from sempy_labs.directlake._guardrails import (
     get_sku_size,
@@ -33,8 +39,12 @@ def get_lakehouse_tables(
     """
     Shows the tables of a lakehouse and their respective properties. Option to include additional properties relevant to Direct Lake guardrails.

+    This function can be executed in either a PySpark or pure Python notebook.
+
     This is a wrapper function for the following API: `Tables - List Tables <https://learn.microsoft.com/rest/api/fabric/lakehouse/tables/list-tables>`_ plus extended capabilities.

+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
     Parameters
     ----------
     lakehouse : str | uuid.UUID, default=None
@@ -76,8 +86,8 @@ def get_lakehouse_tables(
         extended = True

     if (
-        workspace_id !=
-        and lakehouse_id !=
+        workspace_id != resolve_workspace_id()
+        and lakehouse_id != resolve_lakehouse_id()
         and count_rows
     ):
         raise ValueError(
@@ -88,6 +98,7 @@ def get_lakehouse_tables(
     responses = _base_api(
         request=f"v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/tables",
         uses_pagination=True,
+        client="fabric_sp",
     )

     if not responses[0].get("data"):
@@ -112,40 +123,59 @@ def get_lakehouse_tables(
     if extended:
         sku_value = get_sku_size(workspace_id)
         guardrail = get_directlake_guardrails_for_sku(sku_value)
-
-
-        df["Row Groups"] = None
-        df["Table Size"] = None
+        local_path = _mount()
+
+        df["Files"], df["Row Groups"], df["Table Size"] = None, None, None
     if count_rows:
         df["Row Count"] = None
+
     for i, r in df.iterrows():
-
+        table_name = r["Table Name"]
         if r["Type"] == "Managed" and r["Format"] == "delta":
-
-
-            size_in_bytes = detail_df.sizeInBytes
-
-            delta_table_path = f"Tables/{tName}"
-            latest_files = (
-                spark.read.format("delta").load(delta_table_path).inputFiles()
+            delta_table_path = create_abfss_path(
+                lakehouse_id, workspace_id, table_name
             )
-            file_paths = [f.split("/")[-1] for f in latest_files]

-
+            if _pure_python_notebook():
+                from deltalake import DeltaTable
+
+                delta_table = DeltaTable(delta_table_path)
+                latest_files = [
+                    file["path"]
+                    for file in delta_table.get_add_actions().to_pylist()
+                ]
+                size_in_bytes = 0
+                for f in latest_files:
+                    local_file_path = os.path.join(
+                        local_path, "Tables", table_name, os.path.basename(f)
+                    )
+                    if os.path.exists(local_file_path):
+                        size_in_bytes += os.path.getsize(local_file_path)
+                num_latest_files = len(latest_files)
+            else:
+                delta_table = _get_delta_table(delta_table_path)
+                latest_files = _read_delta_table(delta_table_path).inputFiles()
+                table_df = delta_table.toDF()
+                table_details = delta_table.detail().collect()[0].asDict()
+                num_latest_files = table_details.get("numFiles", 0)
+                size_in_bytes = table_details.get("sizeInBytes", 0)
+
+            table_path = os.path.join(local_path, "Tables", table_name)
+            file_paths = [os.path.basename(f) for f in latest_files]
+
             num_rowgroups = 0
             for filename in file_paths:
-
-
-
-                ).num_row_groups
-                except FileNotFoundError:
-                    continue
-            df.at[i, "Files"] = num_files
+                parquet_file = pq.ParquetFile(f"{table_path}/{filename}")
+                num_rowgroups += parquet_file.num_row_groups
+            df.at[i, "Files"] = num_latest_files
             df.at[i, "Row Groups"] = num_rowgroups
             df.at[i, "Table Size"] = size_in_bytes
             if count_rows:
-
-
+                if _pure_python_notebook():
+                    row_count = delta_table.to_pyarrow_table().num_rows
+                else:
+                    row_count = table_df.count()
+                df.at[i, "Row Count"] = row_count

     if extended:
         intColumns = ["Files", "Row Groups", "Table Size"]
@@ -168,19 +198,16 @@ def get_lakehouse_tables(
     if export:
         if not lakehouse_attached():
             raise ValueError(
-                f"{icons.red_dot} In order to save the
+                f"{icons.red_dot} In order to save the dataframe, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook."
             )

-
-
-        lakeT_filt = df[df["Table Name"] == lakeTName]
+        lake_table_name = "lakehouse_table_details"
+        df_filt = df[df["Table Name"] == lake_table_name]

-        if
+        if df_filt.empty:
             run_id = 1
         else:
-            max_run_id = _get_column_aggregate(
-                lakehouse=current_lakehouse_name, table_name=lakeTName
-            )
+            max_run_id = _get_column_aggregate(table_name=lake_table_name)
             run_id = max_run_id + 1

         export_df = df.copy()
@@ -224,13 +251,13 @@ def get_lakehouse_tables(
             export_df[c] = export_df[c].astype(bool)

         print(
-            f"{icons.in_progress} Saving Lakehouse table properties to the '{
+            f"{icons.in_progress} Saving Lakehouse table properties to the '{lake_table_name}' table in the lakehouse...\n"
         )
-        export_df["Timestamp"] = datetime.
+        export_df["Timestamp"] = datetime.now()
         export_df["RunId"] = run_id

         save_as_delta_table(
-            dataframe=export_df, delta_table_name=
+            dataframe=export_df, delta_table_name=lake_table_name, write_mode="append"
         )

     return df
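A hedged usage sketch of the updated function, assuming it is exported from sempy_labs.lakehouse; the lakehouse and workspace names are placeholders. The extended, count_rows, and export parameters and the 'lakehouse_table_details' table name come from the hunks above.

    from sempy_labs.lakehouse import get_lakehouse_tables

    # Extended properties (Files, Row Groups, Table Size) and row counts now work in pure Python
    # notebooks as well as PySpark; export=True appends a snapshot to the
    # 'lakehouse_table_details' delta table in the attached lakehouse.
    df_tables = get_lakehouse_tables(
        lakehouse="MyLakehouse",   # placeholder name
        workspace="Finance",       # placeholder name
        extended=True,
        count_rows=True,
        export=False,
    )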