semantic-link-labs 0.9.9__py3-none-any.whl → 0.9.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of semantic-link-labs might be problematic.
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/METADATA +30 -22
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/RECORD +47 -40
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/WHEEL +1 -1
- sempy_labs/__init__.py +28 -1
- sempy_labs/_clear_cache.py +12 -0
- sempy_labs/_dax.py +8 -2
- sempy_labs/_delta_analyzer.py +17 -26
- sempy_labs/_environments.py +19 -1
- sempy_labs/_generate_semantic_model.py +7 -8
- sempy_labs/_helper_functions.py +351 -151
- sempy_labs/_kql_databases.py +18 -0
- sempy_labs/_kusto.py +137 -0
- sempy_labs/_list_functions.py +18 -36
- sempy_labs/_model_bpa_rules.py +13 -3
- sempy_labs/_notebooks.py +44 -11
- sempy_labs/_semantic_models.py +93 -1
- sempy_labs/_sql.py +3 -2
- sempy_labs/_tags.py +194 -0
- sempy_labs/_variable_libraries.py +89 -0
- sempy_labs/_vertipaq.py +6 -6
- sempy_labs/_vpax.py +386 -0
- sempy_labs/_warehouses.py +3 -3
- sempy_labs/admin/__init__.py +14 -0
- sempy_labs/admin/_artifacts.py +3 -3
- sempy_labs/admin/_capacities.py +161 -1
- sempy_labs/admin/_dataflows.py +45 -0
- sempy_labs/admin/_items.py +16 -11
- sempy_labs/admin/_tags.py +126 -0
- sempy_labs/admin/_tenant.py +5 -5
- sempy_labs/directlake/_generate_shared_expression.py +29 -26
- sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py +55 -5
- sempy_labs/dotnet_lib/dotnet.runtime.config.json +10 -0
- sempy_labs/lakehouse/__init__.py +16 -0
- sempy_labs/lakehouse/_blobs.py +115 -63
- sempy_labs/lakehouse/_get_lakehouse_columns.py +41 -18
- sempy_labs/lakehouse/_get_lakehouse_tables.py +62 -47
- sempy_labs/lakehouse/_helper.py +211 -0
- sempy_labs/lakehouse/_lakehouse.py +45 -36
- sempy_labs/lakehouse/_livy_sessions.py +137 -0
- sempy_labs/migration/_migrate_calctables_to_lakehouse.py +7 -12
- sempy_labs/migration/_refresh_calc_tables.py +7 -6
- sempy_labs/report/_download_report.py +1 -1
- sempy_labs/report/_generate_report.py +5 -1
- sempy_labs/report/_reportwrapper.py +31 -18
- sempy_labs/tom/_model.py +104 -35
- sempy_labs/report/_bpareporttemplate/.pbi/localSettings.json +0 -9
- sempy_labs/report/_bpareporttemplate/.platform +0 -11
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/licenses/LICENSE +0 -0
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/top_level.txt +0 -0
sempy_labs/lakehouse/_get_lakehouse_tables.py

@@ -1,7 +1,7 @@
-import
+import os
 import pandas as pd
 import pyarrow.parquet as pq
-import datetime
+from datetime import datetime
 from sempy_labs._helper_functions import (
     _get_column_aggregate,
     resolve_workspace_name_and_id,

@@ -9,7 +9,11 @@ from sempy_labs._helper_functions import (
     save_as_delta_table,
     _base_api,
     _create_dataframe,
-
+    _read_delta_table,
+    _get_delta_table,
+    _mount,
+    create_abfss_path,
+    _pure_python_notebook,
 )
 from sempy_labs.directlake._guardrails import (
     get_sku_size,

@@ -33,8 +37,12 @@ def get_lakehouse_tables(
     """
     Shows the tables of a lakehouse and their respective properties. Option to include additional properties relevant to Direct Lake guardrails.

+    This function can be executed in either a PySpark or pure Python notebook.
+
     This is a wrapper function for the following API: `Tables - List Tables <https://learn.microsoft.com/rest/api/fabric/lakehouse/tables/list-tables>`_ plus extended capabilities.

+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
     Parameters
     ----------
     lakehouse : str | uuid.UUID, default=None

@@ -75,19 +83,10 @@
     if count_rows: # Setting countrows defaults to extended=True
         extended = True

-    if (
-        workspace_id != fabric.get_workspace_id()
-        and lakehouse_id != fabric.get_lakehouse_id()
-        and count_rows
-    ):
-        raise ValueError(
-            f"{icons.red_dot} If 'count_rows' is set to True, you must run this function against the default lakehouse attached to the notebook. "
-            "Count rows runs a spark query and cross-workspace spark queries are currently not supported."
-        )
-
     responses = _base_api(
         request=f"v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/tables",
         uses_pagination=True,
+        client="fabric_sp",
     )

     if not responses[0].get("data"):

@@ -112,40 +111,59 @@
     if extended:
         sku_value = get_sku_size(workspace_id)
         guardrail = get_directlake_guardrails_for_sku(sku_value)
-
-
-        df["Row Groups"] = None
-        df["Table Size"] = None
+        local_path = _mount(lakehouse=lakehouse_id, workspace=workspace_id)
+
+        df["Files"], df["Row Groups"], df["Table Size"] = None, None, None
         if count_rows:
             df["Row Count"] = None
+
         for i, r in df.iterrows():
-
+            table_name = r["Table Name"]
             if r["Type"] == "Managed" and r["Format"] == "delta":
-
-
-                size_in_bytes = detail_df.sizeInBytes
-
-                delta_table_path = f"Tables/{tName}"
-                latest_files = (
-                    spark.read.format("delta").load(delta_table_path).inputFiles()
+                delta_table_path = create_abfss_path(
+                    lakehouse_id, workspace_id, table_name
                 )
-                file_paths = [f.split("/")[-1] for f in latest_files]

-
+                if _pure_python_notebook():
+                    from deltalake import DeltaTable
+
+                    delta_table = DeltaTable(delta_table_path)
+                    latest_files = [
+                        file["path"]
+                        for file in delta_table.get_add_actions().to_pylist()
+                    ]
+                    size_in_bytes = 0
+                    for f in latest_files:
+                        local_file_path = os.path.join(
+                            local_path, "Tables", table_name, os.path.basename(f)
+                        )
+                        if os.path.exists(local_file_path):
+                            size_in_bytes += os.path.getsize(local_file_path)
+                    num_latest_files = len(latest_files)
+                else:
+                    delta_table = _get_delta_table(delta_table_path)
+                    latest_files = _read_delta_table(delta_table_path).inputFiles()
+                    table_df = delta_table.toDF()
+                    table_details = delta_table.detail().collect()[0].asDict()
+                    num_latest_files = table_details.get("numFiles", 0)
+                    size_in_bytes = table_details.get("sizeInBytes", 0)
+
+                table_path = os.path.join(local_path, "Tables", table_name)
+                file_paths = [os.path.basename(f) for f in latest_files]
+
                 num_rowgroups = 0
                 for filename in file_paths:
-
-
-
-                    ).num_row_groups
-                    except FileNotFoundError:
-                        continue
-                df.at[i, "Files"] = num_files
+                    parquet_file = pq.ParquetFile(f"{table_path}/{filename}")
+                    num_rowgroups += parquet_file.num_row_groups
+                df.at[i, "Files"] = num_latest_files
                 df.at[i, "Row Groups"] = num_rowgroups
                 df.at[i, "Table Size"] = size_in_bytes
             if count_rows:
-
-
+                if _pure_python_notebook():
+                    row_count = delta_table.to_pyarrow_table().num_rows
+                else:
+                    row_count = table_df.count()
+                df.at[i, "Row Count"] = row_count

     if extended:
         intColumns = ["Files", "Row Groups", "Table Size"]
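For reference, the pure-Python branch above can be reproduced outside the library with just the deltalake and pyarrow packages. A minimal sketch under stated assumptions: the table path below is a hypothetical local mount of a lakehouse table, and plain os calls stand in for the diff's _mount and _pure_python_notebook helpers.

    # Minimal sketch of the pure-Python branch above: enumerate a Delta table's
    # active files from its transaction log, then sum parquet row groups and
    # on-disk size. The path is a hypothetical example, not a real mount.
    import os

    import pyarrow.parquet as pq
    from deltalake import DeltaTable

    table_path = "/lakehouse/default/Tables/my_table"  # hypothetical mount location

    delta_table = DeltaTable(table_path)
    # Active (non-removed) data files registered in the Delta log.
    latest_files = [a["path"] for a in delta_table.get_add_actions().to_pylist()]

    size_in_bytes = 0
    num_rowgroups = 0
    for rel_path in latest_files:
        local_file = os.path.join(table_path, os.path.basename(rel_path))
        if os.path.exists(local_file):
            size_in_bytes += os.path.getsize(local_file)
            num_rowgroups += pq.ParquetFile(local_file).num_row_groups

    print(len(latest_files), num_rowgroups, size_in_bytes)

In the same branch, the diff obtains the row count from delta_table.to_pyarrow_table().num_rows when count_rows is enabled.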
@@ -168,19 +186,16 @@ def get_lakehouse_tables(
     if export:
         if not lakehouse_attached():
             raise ValueError(
-                f"{icons.red_dot} In order to save the
+                f"{icons.red_dot} In order to save the dataframe, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook."
             )

-
-
-        lakeT_filt = df[df["Table Name"] == lakeTName]
+        lake_table_name = "lakehouse_table_details"
+        df_filt = df[df["Table Name"] == lake_table_name]

-        if
+        if df_filt.empty:
             run_id = 1
         else:
-            max_run_id = _get_column_aggregate(
-                lakehouse=current_lakehouse_name, table_name=lakeTName
-            )
+            max_run_id = _get_column_aggregate(table_name=lake_table_name)
             run_id = max_run_id + 1

         export_df = df.copy()

@@ -224,13 +239,13 @@ def get_lakehouse_tables(
             export_df[c] = export_df[c].astype(bool)

         print(
-            f"{icons.in_progress} Saving Lakehouse table properties to the '{
+            f"{icons.in_progress} Saving Lakehouse table properties to the '{lake_table_name}' table in the lakehouse...\n"
         )
-        export_df["Timestamp"] = datetime.
+        export_df["Timestamp"] = datetime.now()
         export_df["RunId"] = run_id

         save_as_delta_table(
-            dataframe=export_df, delta_table_name=
+            dataframe=export_df, delta_table_name=lake_table_name, write_mode="append"
         )

     return df
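Taken together, these hunks mean get_lakehouse_tables no longer requires the default attached lakehouse when count_rows is True, and its REST calls now go through the fabric_sp client. A usage sketch, assuming the function is re-exported from sempy_labs.lakehouse as in earlier releases; item names are placeholders:

    # Usage sketch: list tables with the Direct Lake guardrail columns and row counts.
    from sempy_labs.lakehouse import get_lakehouse_tables

    df = get_lakehouse_tables(
        lakehouse="MyLakehouse",   # placeholder name
        workspace="MyWorkspace",   # placeholder name
        extended=True,             # adds Files / Row Groups / Table Size columns
        count_rows=True,           # implies extended=True
        export=False,              # True appends to the 'lakehouse_table_details' table
    )
    print(df[["Table Name", "Files", "Row Groups", "Table Size", "Row Count"]])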
sempy_labs/lakehouse/_helper.py (new file)

@@ -0,0 +1,211 @@
+from uuid import UUID
+from typing import Optional, Literal
+import pyarrow.dataset as ds
+from sempy_labs._helper_functions import (
+    _mount,
+    delete_item,
+    _base_api,
+    resolve_workspace_name_and_id,
+    resolve_lakehouse_name_and_id,
+)
+from sempy._utils._log import log
+import sempy_labs._icons as icons
+import os
+
+
+@log
+def is_v_ordered(
+    table_name: str,
+    lakehouse: Optional[str | UUID] = None,
+    workspace: Optional[str | UUID] = None,
+    schema: Optional[str] = None,
+) -> bool:
+    """
+    Checks if a delta table is v-ordered.
+
+    Parameters
+    ----------
+    table_name : str
+        The name of the table to check.
+    lakehouse : str | uuid.UUID, default=None
+        The Fabric lakehouse name or ID.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    schema : str, optional
+        The schema of the table to check. If not provided, the default schema is used.
+
+    Returns
+    -------
+    bool
+        True if the table is v-ordered, False otherwise.
+    """
+
+    local_path = _mount(lakehouse=lakehouse, workspace=workspace)
+    table_path = (
+        f"{local_path}/Tables/{schema}/{table_name}"
+        if schema
+        else f"{local_path}/Tables/{table_name}"
+    )
+    ds_schema = ds.dataset(table_path).schema.metadata
+
+    return any(b"vorder" in key for key in ds_schema.keys())
+
+
+def delete_lakehouse(
+    lakehouse: str | UUID, workspace: Optional[str | UUID] = None
+) -> None:
+    """
+    Deletes a lakehouse.
+
+    This is a wrapper function for the following API: `Items - Delete Lakehouse <https://learn.microsoft.com/rest/api/fabric/lakehouse/items/delete-lakehouse>`_.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
+    Parameters
+    ----------
+    lakehouse : str | uuid.UUID
+        The name or ID of the lakehouse to delete.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    """
+
+    delete_item(item=lakehouse, item_type="lakehouse", workspace=workspace)
+
+
+def update_lakehouse(
+    name: Optional[str] = None,
+    description: Optional[str] = None,
+    lakehouse: Optional[str | UUID] = None,
+    workspace: Optional[str | UUID] = None,
+):
+    """
+    Updates a lakehouse.
+
+    This is a wrapper function for the following API: `Items - Update Lakehouse <https://learn.microsoft.com/rest/api/fabric/lakehouse/items/update-lakehouse>`_.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
+    Parameters
+    ----------
+    name: str, default=None
+        The new name of the lakehouse.
+        Defaults to None which does not update the name.
+    description: str, default=None
+        The new description of the lakehouse.
+        Defaults to None which does not update the description.
+    lakehouse : str | uuid.UUID, default=None
+        The name or ID of the lakehouse to update.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    """
+
+    if not name and not description:
+        raise ValueError(
+            f"{icons.red_dot} Either name or description must be provided."
+        )
+
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
+    (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
+        lakehouse, workspace_id
+    )
+
+    payload = {}
+    if name:
+        payload["displayName"] = name
+    if description:
+        payload["description"] = description
+
+    _base_api(
+        request=f"/v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}",
+        method="patch",
+        client="fabric_sp",
+        payload=payload,
+    )
+
+    print(
+        f"{icons.green_dot} The '{lakehouse_name}' lakehouse within the '{workspace_name}' workspace has been updated accordingly."
+    )
+
+
+@log
+def load_table(
+    table_name: str,
+    file_path: str,
+    mode: Literal["Overwrite", "Append"],
+    lakehouse: Optional[str | UUID] = None,
+    workspace: Optional[str | UUID] = None,
+):
+    """
+    Loads a table into a lakehouse. Currently only files are supported, not folders.
+
+    This is a wrapper function for the following API: `Tables - Load Table <https://learn.microsoft.com/rest/api/fabric/lakehouse/tables/load-table>`_.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
+    Parameters
+    ----------
+    table_name : str
+        The name of the table to load.
+    file_path : str
+        The path to the data to load.
+    mode : Literal["Overwrite", "Append"]
+        The mode to use when loading the data.
+        "Overwrite" will overwrite the existing data.
+        "Append" will append the data to the existing data.
+    lakehouse : str | uuid.UUID, default=None
+        The name or ID of the lakehouse to load the table into.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    """
+
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
+    (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
+        lakehouse, workspace_id
+    )
+
+    file_extension = os.path.splitext(file_path)[1]
+
+    payload = {
+        "relativePath": file_path,
+        "pathType": "File",
+        "mode": mode,
+        "formatOptions": {},
+    }
+
+    if file_extension == ".csv":
+        payload["formatOptions"] = {"format": "Csv", "header": True, "delimiter": ","}
+    elif file_extension == ".parquet":
+        payload["formatOptions"] = {
+            "format": "Parquet",
+            "header": True,
+        }
+    # Solve for loading folders
+    # elif file_extension == '':
+    # payload['pathType'] = "Folder"
+    # payload["recursive"] = recursive
+    # payload['formatOptions']
+    else:
+        raise NotImplementedError()
+
+    _base_api(
+        request=f"/v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/tables/{table_name}/load",
+        client="fabric_sp",
+        method="post",
+        status_codes=202,
+        lro_return_status_code=True,
+    )
+
+    print(
+        f"{icons.green_dot} The '{table_name}' table has been loaded into the '{lakehouse_name}' lakehouse within the '{workspace_name}' workspace."
+    )
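The new _helper.py module adds is_v_ordered, delete_lakehouse, update_lakehouse, and load_table. A usage sketch against the private module path shown in the diff (the lakehouse/__init__.py change above suggests public re-exports as well); item names and file paths are placeholders:

    # Usage sketch for the new lakehouse helpers; names and paths are placeholders.
    from sempy_labs.lakehouse._helper import is_v_ordered, load_table, update_lakehouse

    # Check whether a delta table's parquet files carry V-Order metadata.
    if is_v_ordered(table_name="sales", lakehouse="MyLakehouse", workspace="MyWorkspace"):
        print("'sales' is v-ordered")

    # Load a CSV file from the lakehouse Files area into a delta table.
    load_table(
        table_name="sales_staging",
        file_path="Files/raw/sales.csv",  # placeholder relative path
        mode="Overwrite",
        lakehouse="MyLakehouse",
        workspace="MyWorkspace",
    )

    # Rename the lakehouse and update its description.
    update_lakehouse(
        name="SalesLakehouse",
        description="Curated sales data",
        lakehouse="MyLakehouse",
        workspace="MyWorkspace",
    )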
sempy_labs/lakehouse/_lakehouse.py

@@ -7,6 +7,7 @@ from sempy_labs._helper_functions import (
     resolve_lakehouse_name_and_id,
     resolve_workspace_name_and_id,
     _create_spark_session,
+    _pure_python_notebook,
 )
 import sempy_labs._icons as icons
 import re

@@ -32,6 +33,33 @@ def lakehouse_attached() -> bool:
     return False


+def _optimize_table(path):
+
+    if _pure_python_notebook():
+        from deltalake import DeltaTable
+
+        DeltaTable(path).optimize.compact()
+    else:
+        from delta import DeltaTable
+
+        spark = _create_spark_session()
+        DeltaTable.forPath(spark, path).optimize().executeCompaction()
+
+
+def _vacuum_table(path, retain_n_hours):
+
+    if _pure_python_notebook():
+        from deltalake import DeltaTable
+
+        DeltaTable(path).vacuum(retention_hours=retain_n_hours)
+    else:
+        from delta import DeltaTable
+
+        spark = _create_spark_session()
+        spark.conf.set("spark.databricks.delta.vacuum.parallelDelete.enabled", "true")
+        DeltaTable.forPath(spark, path).vacuum(retain_n_hours)
+
+
 @log
 def optimize_lakehouse_tables(
     tables: Optional[Union[str, List[str]]] = None,

@@ -56,27 +84,20 @@ def optimize_lakehouse_tables(
     """

     from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
-    from delta import DeltaTable

-
-
+    df = get_lakehouse_tables(lakehouse=lakehouse, workspace=workspace)
+    df_delta = df[df["Format"] == "delta"]

     if isinstance(tables, str):
         tables = [tables]

-    if tables
-        tables_filt = lakeTablesDelta[lakeTablesDelta["Table Name"].isin(tables)]
-    else:
-        tables_filt = lakeTablesDelta.copy()
-
-    spark = _create_spark_session()
+    df_tables = df_delta[df_delta["Table Name"].isin(tables)] if tables else df_delta

-    for _, r in (bar := tqdm(
-
-
-        bar.set_description(f"Optimizing the '{
-
-        deltaTable.optimize().executeCompaction()
+    for _, r in (bar := tqdm(df_tables.iterrows())):
+        table_name = r["Table Name"]
+        path = r["Location"]
+        bar.set_description(f"Optimizing the '{table_name}' table...")
+        _optimize_table(path=path)


 @log

@@ -92,7 +113,7 @@ def vacuum_lakehouse_tables(
     Parameters
     ----------
     tables : str | List[str] | None
-        The table(s) to vacuum. If no tables are specified, all tables in the lakehouse will be
+        The table(s) to vacuum. If no tables are specified, all tables in the lakehouse will be vacuumed.
     lakehouse : str | uuid.UUID, default=None
         The Fabric lakehouse name or ID.
         Defaults to None which resolves to the lakehouse attached to the notebook.

@@ -108,32 +129,20 @@ def vacuum_lakehouse_tables(
     """

     from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
-    from delta import DeltaTable

-
-
+    df = get_lakehouse_tables(lakehouse=lakehouse, workspace=workspace)
+    df_delta = df[df["Format"] == "delta"]

     if isinstance(tables, str):
         tables = [tables]

-    if tables
-        tables_filt = lakeTablesDelta[lakeTablesDelta["Table Name"].isin(tables)]
-    else:
-        tables_filt = lakeTablesDelta.copy()
-
-    spark = _create_spark_session()
-    spark.conf.set("spark.databricks.delta.vacuum.parallelDelete.enabled", "true")
-
-    for _, r in (bar := tqdm(tables_filt.iterrows())):
-        tableName = r["Table Name"]
-        tablePath = r["Location"]
-        bar.set_description(f"Vacuuming the '{tableName}' table...")
-        deltaTable = DeltaTable.forPath(spark, tablePath)
+    df_tables = df_delta[df_delta["Table Name"].isin(tables)] if tables else df_delta

-
-
-
-
+    for _, r in (bar := tqdm(df_tables.iterrows())):
+        table_name = r["Table Name"]
+        path = r["Location"]
+        bar.set_description(f"Vacuuming the '{table_name}' table...")
+        _vacuum_table(path=path, retain_n_hours=retain_n_hours)


 def run_table_maintenance(
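With the delta/deltalake branching moved into _optimize_table and _vacuum_table, the maintenance functions now run in both PySpark and pure Python notebooks. A usage sketch, assuming the existing re-exports from sempy_labs.lakehouse; table names are placeholders:

    # Usage sketch for table maintenance after the refactor.
    from sempy_labs.lakehouse import optimize_lakehouse_tables, vacuum_lakehouse_tables

    # Compact every delta table in the attached lakehouse.
    optimize_lakehouse_tables()

    # Vacuum two specific tables, retaining 168 hours (7 days) of history.
    vacuum_lakehouse_tables(tables=["sales", "customers"], retain_n_hours=168)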
sempy_labs/lakehouse/_livy_sessions.py (new file)

@@ -0,0 +1,137 @@
+from sempy_labs._helper_functions import (
+    resolve_workspace_id,
+    resolve_lakehouse_id,
+    _base_api,
+    _create_dataframe,
+    _update_dataframe_datatypes,
+)
+import pandas as pd
+from typing import Optional
+from uuid import UUID
+
+
+def list_livy_sessions(
+    lakehouse: Optional[str | UUID] = None, workspace: Optional[str | UUID] = None
+) -> pd.DataFrame:
+    """
+    Shows a list of livy sessions from the specified item identifier.
+
+    This is a wrapper function for the following API: `Livy Sessions - List Livy Sessions <https://learn.microsoft.com/rest/api/fabric/lakehouse/livy-sessions/list-livy-sessions>`_.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
+    Parameters
+    ----------
+    lakehouse : str | uuid.UUID, default=None
+        The Fabric lakehouse name or ID.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A pandas dataframe showing a list of livy sessions from the specified item identifier.
+    """
+
+    columns = {
+        "Spark Application Id": "string",
+        "State:": "string",
+        "Livy Id": "string",
+        "Origin": "string",
+        "Attempt Number": "int",
+        "Max Number Of Attempts": "int",
+        "Livy Name": "string",
+        "Submitter Id": "string",
+        "Submitter Type": "string",
+        "Item Workspace Id": "string",
+        "Item Id": "string",
+        "Item Reference Type": "string",
+        "Item Name": "string",
+        "Item Type": "string",
+        "Job Type": "string",
+        "Submitted Date Time": "str",
+        "Start Date Time": "str",
+        "End Date Time": "string",
+        "Queued Duration Value": "int",
+        "Queued Duration Time Unit": "string",
+        "Running Duration Value": "int",
+        "Running Duration Time Unit": "string",
+        "Total Duration Value": "int",
+        "Total Duration Time Unit": "string",
+        "Job Instance Id": "string",
+        "Creator Item Workspace Id": "string",
+        "Creator Item Id": "string",
+        "Creator Item Reference Type": "string",
+        "Creator Item Name": "string",
+        "Creator Item Type": "string",
+        "Cancellation Reason": "string",
+        "Capacity Id": "string",
+        "Operation Name": "string",
+        "Runtime Version": "string",
+        "Livy Session Item Resource Uri": "string",
+    }
+    df = _create_dataframe(columns=columns)
+
+    workspace_id = resolve_workspace_id(workspace)
+    lakehouse_id = resolve_lakehouse_id(lakehouse, workspace_id)
+
+    responses = _base_api(
+        request=f"/v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/livySessions",
+        uses_pagination=True,
+        client="fabric_sp",
+    )
+
+    dfs = []
+
+    for r in responses:
+        for v in r.get("value", []):
+            queued_duration = v.get("queuedDuration", {})
+            running_duration = v.get("runningDuration", {})
+            total_duration = v.get("totalDuration", {})
+            new_data = {
+                "Spark Application Id": v.get("sparkApplicationId"),
+                "State:": v.get("state"),
+                "Livy Id": v.get("livyId"),
+                "Origin": v.get("origin"),
+                "Attempt Number": v.get("attemptNumber"),
+                "Max Number Of Attempts": v.get("maxNumberOfAttempts"),
+                "Livy Name": v.get("livyName"),
+                "Submitter Id": v["submitter"].get("id"),
+                "Submitter Type": v["submitter"].get("type"),
+                "Item Workspace Id": v["item"].get("workspaceId"),
+                "Item Id": v["item"].get("itemId"),
+                "Item Reference Type": v["item"].get("referenceType"),
+                "Item Name": v.get("itemName"),
+                "Item Type": v.get("itemType"),
+                "Job Type": v.get("jobType"),
+                "Submitted Date Time": v.get("submittedDateTime"),
+                "Start Date Time": v.get("startDateTime"),
+                "End Date Time": v.get("endDateTime"),
+                "Queued Duration Value": queued_duration.get("value"),
+                "Queued Duration Time Unit": queued_duration.get("timeUnit"),
+                "Running Duration Value": running_duration.get("value"),
+                "Running Duration Time Unit": running_duration.get("timeUnit"),
+                "Total Duration Value": total_duration.get("value"),
+                "Total Duration Time Unit": total_duration.get("timeUnit"),
+                "Job Instance Id": v.get("jobInstanceId"),
+                "Creator Item Workspace Id": v["creatorItem"].get("workspaceId"),
+                "Creator Item Id": v["creatorItem"].get("itemId"),
+                "Creator Item Reference Type": v["creatorItem"].get("referenceType"),
+                "Creator Item Name": v.get("creatorItemName"),
+                "Creator Item Type": v.get("creatorItemType"),
+                "Cancellation Reason": v.get("cancellationReason"),
+                "Capacity Id": v.get("capacityId"),
+                "Operation Name": v.get("operationName"),
+                "Runtime Version": v.get("runtimeVersion"),
+                "Livy Session Item Resource Uri": v.get("livySessionItemResourceUri"),
+            }
+            dfs.append(pd.DataFrame(new_data, index=[0]))
+
+    if dfs:
+        df = pd.concat(dfs, ignore_index=True)
+        _update_dataframe_datatypes(dataframe=df, column_map=columns)
+
+    return df
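A usage sketch for the new list_livy_sessions function, importing from the private module path defined above; item names are placeholders:

    # Usage sketch: list Livy sessions for a lakehouse as a pandas DataFrame.
    from sempy_labs.lakehouse._livy_sessions import list_livy_sessions

    df_sessions = list_livy_sessions(lakehouse="MyLakehouse", workspace="MyWorkspace")
    # Note: the state column is literally named "State:" in this release.
    print(df_sessions[["Livy Id", "State:", "Job Type", "Runtime Version"]])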