semantic-link-labs 0.9.9__py3-none-any.whl → 0.9.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49)
  1. {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/METADATA +30 -22
  2. {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/RECORD +47 -40
  3. {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/WHEEL +1 -1
  4. sempy_labs/__init__.py +28 -1
  5. sempy_labs/_clear_cache.py +12 -0
  6. sempy_labs/_dax.py +8 -2
  7. sempy_labs/_delta_analyzer.py +17 -26
  8. sempy_labs/_environments.py +19 -1
  9. sempy_labs/_generate_semantic_model.py +7 -8
  10. sempy_labs/_helper_functions.py +351 -151
  11. sempy_labs/_kql_databases.py +18 -0
  12. sempy_labs/_kusto.py +137 -0
  13. sempy_labs/_list_functions.py +18 -36
  14. sempy_labs/_model_bpa_rules.py +13 -3
  15. sempy_labs/_notebooks.py +44 -11
  16. sempy_labs/_semantic_models.py +93 -1
  17. sempy_labs/_sql.py +3 -2
  18. sempy_labs/_tags.py +194 -0
  19. sempy_labs/_variable_libraries.py +89 -0
  20. sempy_labs/_vertipaq.py +6 -6
  21. sempy_labs/_vpax.py +386 -0
  22. sempy_labs/_warehouses.py +3 -3
  23. sempy_labs/admin/__init__.py +14 -0
  24. sempy_labs/admin/_artifacts.py +3 -3
  25. sempy_labs/admin/_capacities.py +161 -1
  26. sempy_labs/admin/_dataflows.py +45 -0
  27. sempy_labs/admin/_items.py +16 -11
  28. sempy_labs/admin/_tags.py +126 -0
  29. sempy_labs/admin/_tenant.py +5 -5
  30. sempy_labs/directlake/_generate_shared_expression.py +29 -26
  31. sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py +55 -5
  32. sempy_labs/dotnet_lib/dotnet.runtime.config.json +10 -0
  33. sempy_labs/lakehouse/__init__.py +16 -0
  34. sempy_labs/lakehouse/_blobs.py +115 -63
  35. sempy_labs/lakehouse/_get_lakehouse_columns.py +41 -18
  36. sempy_labs/lakehouse/_get_lakehouse_tables.py +62 -47
  37. sempy_labs/lakehouse/_helper.py +211 -0
  38. sempy_labs/lakehouse/_lakehouse.py +45 -36
  39. sempy_labs/lakehouse/_livy_sessions.py +137 -0
  40. sempy_labs/migration/_migrate_calctables_to_lakehouse.py +7 -12
  41. sempy_labs/migration/_refresh_calc_tables.py +7 -6
  42. sempy_labs/report/_download_report.py +1 -1
  43. sempy_labs/report/_generate_report.py +5 -1
  44. sempy_labs/report/_reportwrapper.py +31 -18
  45. sempy_labs/tom/_model.py +104 -35
  46. sempy_labs/report/_bpareporttemplate/.pbi/localSettings.json +0 -9
  47. sempy_labs/report/_bpareporttemplate/.platform +0 -11
  48. {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/licenses/LICENSE +0 -0
  49. {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/top_level.txt +0 -0
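The per-file hunks included below are from four lakehouse modules: sempy_labs/lakehouse/_get_lakehouse_tables.py, _helper.py (new), _lakehouse.py, and _livy_sessions.py (new). Before reading them, a minimal sketch for confirming which of the two wheel versions is installed in a notebook; the distribution name is taken from the header above, and the version check uses only the standard library.

# Minimal sketch: confirm the installed version before relying on the new lakehouse helpers
# shown in the hunks below. Upgrade with `%pip install semantic-link-labs==0.9.11` if needed.
import importlib.metadata

print(importlib.metadata.version("semantic-link-labs"))  # expect "0.9.11" after upgrading from 0.9.9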
sempy_labs/lakehouse/_get_lakehouse_tables.py

@@ -1,7 +1,7 @@
-import sempy.fabric as fabric
+import os
 import pandas as pd
 import pyarrow.parquet as pq
-import datetime
+from datetime import datetime
 from sempy_labs._helper_functions import (
     _get_column_aggregate,
     resolve_workspace_name_and_id,
@@ -9,7 +9,11 @@ from sempy_labs._helper_functions import (
     save_as_delta_table,
     _base_api,
     _create_dataframe,
-    _create_spark_session,
+    _read_delta_table,
+    _get_delta_table,
+    _mount,
+    create_abfss_path,
+    _pure_python_notebook,
 )
 from sempy_labs.directlake._guardrails import (
     get_sku_size,
@@ -33,8 +37,12 @@ def get_lakehouse_tables(
     """
     Shows the tables of a lakehouse and their respective properties. Option to include additional properties relevant to Direct Lake guardrails.

+    This function can be executed in either a PySpark or pure Python notebook.
+
     This is a wrapper function for the following API: `Tables - List Tables <https://learn.microsoft.com/rest/api/fabric/lakehouse/tables/list-tables>`_ plus extended capabilities.

+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
     Parameters
     ----------
     lakehouse : str | uuid.UUID, default=None
@@ -75,19 +83,10 @@ def get_lakehouse_tables(
     if count_rows: # Setting countrows defaults to extended=True
         extended = True

-    if (
-        workspace_id != fabric.get_workspace_id()
-        and lakehouse_id != fabric.get_lakehouse_id()
-        and count_rows
-    ):
-        raise ValueError(
-            f"{icons.red_dot} If 'count_rows' is set to True, you must run this function against the default lakehouse attached to the notebook. "
-            "Count rows runs a spark query and cross-workspace spark queries are currently not supported."
-        )
-
     responses = _base_api(
         request=f"v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/tables",
         uses_pagination=True,
+        client="fabric_sp",
     )

     if not responses[0].get("data"):
@@ -112,40 +111,59 @@ def get_lakehouse_tables(
     if extended:
         sku_value = get_sku_size(workspace_id)
         guardrail = get_directlake_guardrails_for_sku(sku_value)
-        spark = _create_spark_session()
-        df["Files"] = None
-        df["Row Groups"] = None
-        df["Table Size"] = None
+        local_path = _mount(lakehouse=lakehouse_id, workspace=workspace_id)
+
+        df["Files"], df["Row Groups"], df["Table Size"] = None, None, None
         if count_rows:
             df["Row Count"] = None
+
         for i, r in df.iterrows():
-            tName = r["Table Name"]
+            table_name = r["Table Name"]
             if r["Type"] == "Managed" and r["Format"] == "delta":
-                detail_df = spark.sql(f"DESCRIBE DETAIL `{tName}`").collect()[0]
-                num_files = detail_df.numFiles
-                size_in_bytes = detail_df.sizeInBytes
-
-                delta_table_path = f"Tables/{tName}"
-                latest_files = (
-                    spark.read.format("delta").load(delta_table_path).inputFiles()
+                delta_table_path = create_abfss_path(
+                    lakehouse_id, workspace_id, table_name
                 )
-                file_paths = [f.split("/")[-1] for f in latest_files]

-                # Handle FileNotFoundError
+                if _pure_python_notebook():
+                    from deltalake import DeltaTable
+
+                    delta_table = DeltaTable(delta_table_path)
+                    latest_files = [
+                        file["path"]
+                        for file in delta_table.get_add_actions().to_pylist()
+                    ]
+                    size_in_bytes = 0
+                    for f in latest_files:
+                        local_file_path = os.path.join(
+                            local_path, "Tables", table_name, os.path.basename(f)
+                        )
+                        if os.path.exists(local_file_path):
+                            size_in_bytes += os.path.getsize(local_file_path)
+                    num_latest_files = len(latest_files)
+                else:
+                    delta_table = _get_delta_table(delta_table_path)
+                    latest_files = _read_delta_table(delta_table_path).inputFiles()
+                    table_df = delta_table.toDF()
+                    table_details = delta_table.detail().collect()[0].asDict()
+                    num_latest_files = table_details.get("numFiles", 0)
+                    size_in_bytes = table_details.get("sizeInBytes", 0)
+
+                table_path = os.path.join(local_path, "Tables", table_name)
+                file_paths = [os.path.basename(f) for f in latest_files]
+
                 num_rowgroups = 0
                 for filename in file_paths:
-                    try:
-                        num_rowgroups += pq.ParquetFile(
-                            f"/lakehouse/default/{delta_table_path}/{filename}"
-                        ).num_row_groups
-                    except FileNotFoundError:
-                        continue
-                df.at[i, "Files"] = num_files
+                    parquet_file = pq.ParquetFile(f"{table_path}/{filename}")
+                    num_rowgroups += parquet_file.num_row_groups
+                df.at[i, "Files"] = num_latest_files
                 df.at[i, "Row Groups"] = num_rowgroups
                 df.at[i, "Table Size"] = size_in_bytes
             if count_rows:
-                num_rows = spark.table(tName).count()
-                df.at[i, "Row Count"] = num_rows
+                if _pure_python_notebook():
+                    row_count = delta_table.to_pyarrow_table().num_rows
+                else:
+                    row_count = table_df.count()
+                df.at[i, "Row Count"] = row_count

     if extended:
         intColumns = ["Files", "Row Groups", "Table Size"]
@@ -168,19 +186,16 @@ def get_lakehouse_tables(
     if export:
         if not lakehouse_attached():
             raise ValueError(
-                f"{icons.red_dot} In order to save the report.json file, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook."
+                f"{icons.red_dot} In order to save the dataframe, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook."
             )

-        (current_lakehouse_name, current_lakehouse_id) = resolve_lakehouse_name_and_id()
-        lakeTName = "lakehouse_table_details"
-        lakeT_filt = df[df["Table Name"] == lakeTName]
+        lake_table_name = "lakehouse_table_details"
+        df_filt = df[df["Table Name"] == lake_table_name]

-        if len(lakeT_filt) == 0:
+        if df_filt.empty:
             run_id = 1
         else:
-            max_run_id = _get_column_aggregate(
-                lakehouse=current_lakehouse_name, table_name=lakeTName
-            )
+            max_run_id = _get_column_aggregate(table_name=lake_table_name)
             run_id = max_run_id + 1

         export_df = df.copy()
@@ -224,13 +239,13 @@ def get_lakehouse_tables(
             export_df[c] = export_df[c].astype(bool)

         print(
-            f"{icons.in_progress} Saving Lakehouse table properties to the '{lakeTName}' table in the lakehouse...\n"
+            f"{icons.in_progress} Saving Lakehouse table properties to the '{lake_table_name}' table in the lakehouse...\n"
         )
-        export_df["Timestamp"] = datetime.datetime.now()
+        export_df["Timestamp"] = datetime.now()
        export_df["RunId"] = run_id

         save_as_delta_table(
-            dataframe=export_df, delta_table_name=lakeTName, write_mode="append"
+            dataframe=export_df, delta_table_name=lake_table_name, write_mode="append"
         )

     return df
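The _get_lakehouse_tables.py hunks above replace the Spark-only extended/count_rows path: the lakehouse is now mounted via _mount, and in a pure Python notebook the latest Delta snapshot is read with the deltalake package while parquet row groups are summed with pyarrow. A minimal usage sketch follows; the public import path from sempy_labs.lakehouse and the attached-lakehouse defaults are assumptions based on the docstring, not shown verbatim in the diff.

# Usage sketch for the reworked get_lakehouse_tables (parameter names taken from the diff above).
# Assumes a lakehouse is attached to the notebook; works in PySpark or pure Python notebooks.
from sempy_labs.lakehouse import get_lakehouse_tables  # assumed public import path

df = get_lakehouse_tables(
    lakehouse=None,    # None -> the attached lakehouse
    workspace=None,    # None -> the lakehouse's workspace
    extended=True,     # adds Files / Row Groups / Table Size (Direct Lake guardrail columns)
    count_rows=True,   # implies extended=True and adds Row Count
    export=False,      # True appends the result to a 'lakehouse_table_details' delta table
)
print(df[["Table Name", "Files", "Row Groups", "Table Size", "Row Count"]])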
sempy_labs/lakehouse/_helper.py (new file)

@@ -0,0 +1,211 @@
+from uuid import UUID
+from typing import Optional, Literal
+import pyarrow.dataset as ds
+from sempy_labs._helper_functions import (
+    _mount,
+    delete_item,
+    _base_api,
+    resolve_workspace_name_and_id,
+    resolve_lakehouse_name_and_id,
+)
+from sempy._utils._log import log
+import sempy_labs._icons as icons
+import os
+
+
+@log
+def is_v_ordered(
+    table_name: str,
+    lakehouse: Optional[str | UUID] = None,
+    workspace: Optional[str | UUID] = None,
+    schema: Optional[str] = None,
+) -> bool:
+    """
+    Checks if a delta table is v-ordered.
+
+    Parameters
+    ----------
+    table_name : str
+        The name of the table to check.
+    lakehouse : str | uuid.UUID, default=None
+        The Fabric lakehouse name or ID.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    schema : str, optional
+        The schema of the table to check. If not provided, the default schema is used.
+
+    Returns
+    -------
+    bool
+        True if the table is v-ordered, False otherwise.
+    """
+
+    local_path = _mount(lakehouse=lakehouse, workspace=workspace)
+    table_path = (
+        f"{local_path}/Tables/{schema}/{table_name}"
+        if schema
+        else f"{local_path}/Tables/{table_name}"
+    )
+    ds_schema = ds.dataset(table_path).schema.metadata
+
+    return any(b"vorder" in key for key in ds_schema.keys())
+
+
+def delete_lakehouse(
+    lakehouse: str | UUID, workspace: Optional[str | UUID] = None
+) -> None:
+    """
+    Deletes a lakehouse.
+
+    This is a wrapper function for the following API: `Items - Delete Lakehouse <https://learn.microsoft.com/rest/api/fabric/lakehouse/items/delete-lakehouse>`_.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
+    Parameters
+    ----------
+    lakehouse : str | uuid.UUID
+        The name or ID of the lakehouse to delete.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    """
+
+    delete_item(item=lakehouse, item_type="lakehouse", workspace=workspace)
+
+
+def update_lakehouse(
+    name: Optional[str] = None,
+    description: Optional[str] = None,
+    lakehouse: Optional[str | UUID] = None,
+    workspace: Optional[str | UUID] = None,
+):
+    """
+    Updates a lakehouse.
+
+    This is a wrapper function for the following API: `Items - Update Lakehouse <https://learn.microsoft.com/rest/api/fabric/lakehouse/items/update-lakehouse>`_.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
+    Parameters
+    ----------
+    name: str, default=None
+        The new name of the lakehouse.
+        Defaults to None which does not update the name.
+    description: str, default=None
+        The new description of the lakehouse.
+        Defaults to None which does not update the description.
+    lakehouse : str | uuid.UUID, default=None
+        The name or ID of the lakehouse to update.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    """
+
+    if not name and not description:
+        raise ValueError(
+            f"{icons.red_dot} Either name or description must be provided."
+        )
+
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
+    (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
+        lakehouse, workspace_id
+    )
+
+    payload = {}
+    if name:
+        payload["displayName"] = name
+    if description:
+        payload["description"] = description
+
+    _base_api(
+        request=f"/v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}",
+        method="patch",
+        client="fabric_sp",
+        payload=payload,
+    )
+
+    print(
+        f"{icons.green_dot} The '{lakehouse_name}' lakehouse within the '{workspace_name}' workspace has been updated accordingly."
+    )
+
+
+@log
+def load_table(
+    table_name: str,
+    file_path: str,
+    mode: Literal["Overwrite", "Append"],
+    lakehouse: Optional[str | UUID] = None,
+    workspace: Optional[str | UUID] = None,
+):
+    """
+    Loads a table into a lakehouse. Currently only files are supported, not folders.
+
+    This is a wrapper function for the following API: `Tables - Load Table <https://learn.microsoft.com/rest/api/fabric/lakehouse/tables/load-table>`_.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
+    Parameters
+    ----------
+    table_name : str
+        The name of the table to load.
+    file_path : str
+        The path to the data to load.
+    mode : Literal["Overwrite", "Append"]
+        The mode to use when loading the data.
+        "Overwrite" will overwrite the existing data.
+        "Append" will append the data to the existing data.
+    lakehouse : str | uuid.UUID, default=None
+        The name or ID of the lakehouse to load the table into.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    """

+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
+    (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
+        lakehouse, workspace_id
+    )
+
+    file_extension = os.path.splitext(file_path)[1]
+
+    payload = {
+        "relativePath": file_path,
+        "pathType": "File",
+        "mode": mode,
+        "formatOptions": {},
+    }
+
+    if file_extension == ".csv":
+        payload["formatOptions"] = {"format": "Csv", "header": True, "delimiter": ","}
+    elif file_extension == ".parquet":
+        payload["formatOptions"] = {
+            "format": "Parquet",
+            "header": True,
+        }
+    # Solve for loading folders
+    # elif file_extension == '':
+    #     payload['pathType'] = "Folder"
+    #     payload["recursive"] = recursive
+    #     payload['formatOptions']
+    else:
+        raise NotImplementedError()
+
+    _base_api(
+        request=f"/v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/tables/{table_name}/load",
+        client="fabric_sp",
+        method="post",
+        status_codes=202,
+        lro_return_status_code=True,
+    )
+
+    print(
+        f"{icons.green_dot} The '{table_name}' table has been loaded into the '{lakehouse_name}' lakehouse within the '{workspace_name}' workspace."
+    )
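The new sempy_labs/lakehouse/_helper.py above adds is_v_ordered, delete_lakehouse, update_lakehouse and load_table. A usage sketch under the signatures shown in the diff; the table name and file path are hypothetical, and the direct _helper import is used in case the functions are not re-exported from sempy_labs.lakehouse.

# Sketch of the new lakehouse helpers (signatures as defined in the diff above).
from sempy_labs.lakehouse._helper import is_v_ordered, load_table, update_lakehouse

# V-Order check: reads the parquet schema metadata of the delta table's files.
if is_v_ordered(table_name="sales"):  # 'sales' is a hypothetical table name
    print("The 'sales' table is V-Ordered.")

# Load a CSV from the lakehouse Files area into a delta table via the Load Table API.
load_table(
    table_name="sales_staging",
    file_path="Files/raw/sales.csv",  # hypothetical path relative to the lakehouse root
    mode="Overwrite",
)

# Rename the attached lakehouse and update its description.
update_lakehouse(name="SalesLakehouse", description="Curated sales data")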
sempy_labs/lakehouse/_lakehouse.py

@@ -7,6 +7,7 @@ from sempy_labs._helper_functions import (
     resolve_lakehouse_name_and_id,
     resolve_workspace_name_and_id,
     _create_spark_session,
+    _pure_python_notebook,
 )
 import sempy_labs._icons as icons
 import re
@@ -32,6 +33,33 @@ def lakehouse_attached() -> bool:
     return False


+def _optimize_table(path):
+
+    if _pure_python_notebook():
+        from deltalake import DeltaTable
+
+        DeltaTable(path).optimize.compact()
+    else:
+        from delta import DeltaTable
+
+        spark = _create_spark_session()
+        DeltaTable.forPath(spark, path).optimize().executeCompaction()
+
+
+def _vacuum_table(path, retain_n_hours):
+
+    if _pure_python_notebook():
+        from deltalake import DeltaTable
+
+        DeltaTable(path).vacuum(retention_hours=retain_n_hours)
+    else:
+        from delta import DeltaTable
+
+        spark = _create_spark_session()
+        spark.conf.set("spark.databricks.delta.vacuum.parallelDelete.enabled", "true")
+        DeltaTable.forPath(spark, path).vacuum(retain_n_hours)
+
+
 @log
 def optimize_lakehouse_tables(
     tables: Optional[Union[str, List[str]]] = None,
@@ -56,27 +84,20 @@ def optimize_lakehouse_tables(
     """

     from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
-    from delta import DeltaTable

-    lakeTables = get_lakehouse_tables(lakehouse=lakehouse, workspace=workspace)
-    lakeTablesDelta = lakeTables[lakeTables["Format"] == "delta"]
+    df = get_lakehouse_tables(lakehouse=lakehouse, workspace=workspace)
+    df_delta = df[df["Format"] == "delta"]

     if isinstance(tables, str):
         tables = [tables]

-    if tables is not None:
-        tables_filt = lakeTablesDelta[lakeTablesDelta["Table Name"].isin(tables)]
-    else:
-        tables_filt = lakeTablesDelta.copy()
-
-    spark = _create_spark_session()
+    df_tables = df_delta[df_delta["Table Name"].isin(tables)] if tables else df_delta

-    for _, r in (bar := tqdm(tables_filt.iterrows())):
-        tableName = r["Table Name"]
-        tablePath = r["Location"]
-        bar.set_description(f"Optimizing the '{tableName}' table...")
-        deltaTable = DeltaTable.forPath(spark, tablePath)
-        deltaTable.optimize().executeCompaction()
+    for _, r in (bar := tqdm(df_tables.iterrows())):
+        table_name = r["Table Name"]
+        path = r["Location"]
+        bar.set_description(f"Optimizing the '{table_name}' table...")
+        _optimize_table(path=path)


 @log
@@ -92,7 +113,7 @@ def vacuum_lakehouse_tables(
     Parameters
     ----------
     tables : str | List[str] | None
-        The table(s) to vacuum. If no tables are specified, all tables in the lakehouse will be optimized.
+        The table(s) to vacuum. If no tables are specified, all tables in the lakehouse will be vacuumed.
     lakehouse : str | uuid.UUID, default=None
         The Fabric lakehouse name or ID.
         Defaults to None which resolves to the lakehouse attached to the notebook.
@@ -108,32 +129,20 @@ def vacuum_lakehouse_tables(
     """

     from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
-    from delta import DeltaTable

-    lakeTables = get_lakehouse_tables(lakehouse=lakehouse, workspace=workspace)
-    lakeTablesDelta = lakeTables[lakeTables["Format"] == "delta"]
+    df = get_lakehouse_tables(lakehouse=lakehouse, workspace=workspace)
+    df_delta = df[df["Format"] == "delta"]

     if isinstance(tables, str):
         tables = [tables]

-    if tables is not None:
-        tables_filt = lakeTablesDelta[lakeTablesDelta["Table Name"].isin(tables)]
-    else:
-        tables_filt = lakeTablesDelta.copy()
-
-    spark = _create_spark_session()
-    spark.conf.set("spark.databricks.delta.vacuum.parallelDelete.enabled", "true")
-
-    for _, r in (bar := tqdm(tables_filt.iterrows())):
-        tableName = r["Table Name"]
-        tablePath = r["Location"]
-        bar.set_description(f"Vacuuming the '{tableName}' table...")
-        deltaTable = DeltaTable.forPath(spark, tablePath)
+    df_tables = df_delta[df_delta["Table Name"].isin(tables)] if tables else df_delta

-        if retain_n_hours is None:
-            deltaTable.vacuum()
-        else:
-            deltaTable.vacuum(retain_n_hours)
+    for _, r in (bar := tqdm(df_tables.iterrows())):
+        table_name = r["Table Name"]
+        path = r["Location"]
+        bar.set_description(f"Vacuuming the '{table_name}' table...")
+        _vacuum_table(path=path, retain_n_hours=retain_n_hours)


 def run_table_maintenance(
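In the _lakehouse.py hunks above, optimize_lakehouse_tables and vacuum_lakehouse_tables now delegate to the new _optimize_table/_vacuum_table helpers, which use the deltalake package in a pure Python notebook and delta-spark otherwise. A short usage sketch; the table names are hypothetical and the public import path is assumed.

# Usage sketch for the refactored maintenance functions (names from the diff above).
from sempy_labs.lakehouse import optimize_lakehouse_tables, vacuum_lakehouse_tables  # assumed import path

# Compact small files for two delta tables in the attached lakehouse.
optimize_lakehouse_tables(tables=["sales", "customers"])  # hypothetical table names

# Vacuum every delta table, keeping 168 hours (7 days) of history.
vacuum_lakehouse_tables(tables=None, retain_n_hours=168)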
sempy_labs/lakehouse/_livy_sessions.py (new file)

@@ -0,0 +1,137 @@
+from sempy_labs._helper_functions import (
+    resolve_workspace_id,
+    resolve_lakehouse_id,
+    _base_api,
+    _create_dataframe,
+    _update_dataframe_datatypes,
+)
+import pandas as pd
+from typing import Optional
+from uuid import UUID
+
+
+def list_livy_sessions(
+    lakehouse: Optional[str | UUID] = None, workspace: Optional[str | UUID] = None
+) -> pd.DataFrame:
+    """
+    Shows a list of livy sessions from the specified item identifier.
+
+    This is a wrapper function for the following API: `Livy Sessions - List Livy Sessions <https://learn.microsoft.com/rest/api/fabric/lakehouse/livy-sessions/list-livy-sessions>`_.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
+    Parameters
+    ----------
+    lakehouse : str | uuid.UUID, default=None
+        The Fabric lakehouse name or ID.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A pandas dataframe showing a list of livy sessions from the specified item identifier.
+    """
+
+    columns = {
+        "Spark Application Id": "string",
+        "State:": "string",
+        "Livy Id": "string",
+        "Origin": "string",
+        "Attempt Number": "int",
+        "Max Number Of Attempts": "int",
+        "Livy Name": "string",
+        "Submitter Id": "string",
+        "Submitter Type": "string",
+        "Item Workspace Id": "string",
+        "Item Id": "string",
+        "Item Reference Type": "string",
+        "Item Name": "string",
+        "Item Type": "string",
+        "Job Type": "string",
+        "Submitted Date Time": "str",
+        "Start Date Time": "str",
+        "End Date Time": "string",
+        "Queued Duration Value": "int",
+        "Queued Duration Time Unit": "string",
+        "Running Duration Value": "int",
+        "Running Duration Time Unit": "string",
+        "Total Duration Value": "int",
+        "Total Duration Time Unit": "string",
+        "Job Instance Id": "string",
+        "Creator Item Workspace Id": "string",
+        "Creator Item Id": "string",
+        "Creator Item Reference Type": "string",
+        "Creator Item Name": "string",
+        "Creator Item Type": "string",
+        "Cancellation Reason": "string",
+        "Capacity Id": "string",
+        "Operation Name": "string",
+        "Runtime Version": "string",
+        "Livy Session Item Resource Uri": "string",
+    }
+    df = _create_dataframe(columns=columns)
+
+    workspace_id = resolve_workspace_id(workspace)
+    lakehouse_id = resolve_lakehouse_id(lakehouse, workspace_id)
+
+    responses = _base_api(
+        request=f"/v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/livySessions",
+        uses_pagination=True,
+        client="fabric_sp",
+    )
+
+    dfs = []
+
+    for r in responses:
+        for v in r.get("value", []):
+            queued_duration = v.get("queuedDuration", {})
+            running_duration = v.get("runningDuration", {})
+            total_duration = v.get("totalDuration", {})
+            new_data = {
+                "Spark Application Id": v.get("sparkApplicationId"),
+                "State:": v.get("state"),
+                "Livy Id": v.get("livyId"),
+                "Origin": v.get("origin"),
+                "Attempt Number": v.get("attemptNumber"),
+                "Max Number Of Attempts": v.get("maxNumberOfAttempts"),
+                "Livy Name": v.get("livyName"),
+                "Submitter Id": v["submitter"].get("id"),
+                "Submitter Type": v["submitter"].get("type"),
+                "Item Workspace Id": v["item"].get("workspaceId"),
+                "Item Id": v["item"].get("itemId"),
+                "Item Reference Type": v["item"].get("referenceType"),
+                "Item Name": v.get("itemName"),
+                "Item Type": v.get("itemType"),
+                "Job Type": v.get("jobType"),
+                "Submitted Date Time": v.get("submittedDateTime"),
+                "Start Date Time": v.get("startDateTime"),
+                "End Date Time": v.get("endDateTime"),
+                "Queued Duration Value": queued_duration.get("value"),
+                "Queued Duration Time Unit": queued_duration.get("timeUnit"),
+                "Running Duration Value": running_duration.get("value"),
+                "Running Duration Time Unit": running_duration.get("timeUnit"),
+                "Total Duration Value": total_duration.get("value"),
+                "Total Duration Time Unit": total_duration.get("timeUnit"),
+                "Job Instance Id": v.get("jobInstanceId"),
+                "Creator Item Workspace Id": v["creatorItem"].get("workspaceId"),
+                "Creator Item Id": v["creatorItem"].get("itemId"),
+                "Creator Item Reference Type": v["creatorItem"].get("referenceType"),
+                "Creator Item Name": v.get("creatorItemName"),
+                "Creator Item Type": v.get("creatorItemType"),
+                "Cancellation Reason": v.get("cancellationReason"),
+                "Capacity Id": v.get("capacityId"),
+                "Operation Name": v.get("operationName"),
+                "Runtime Version": v.get("runtimeVersion"),
+                "Livy Session Item Resource Uri": v.get("livySessionItemResourceUri"),
+            }
+            dfs.append(pd.DataFrame(new_data, index=[0]))
+
+    if dfs:
+        df = pd.concat(dfs, ignore_index=True)
+        _update_dataframe_datatypes(dataframe=df, column_map=columns)
+
+    return df
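The new sempy_labs/lakehouse/_livy_sessions.py above wraps the Livy Sessions - List Livy Sessions API. A short sketch of how the returned dataframe might be used; the lakehouse and workspace names are hypothetical, and note that the state column is literally named "State:" (with a trailing colon) in this release.

# Sketch: list Livy sessions for a lakehouse and inspect the ones still active.
from sempy_labs.lakehouse._livy_sessions import list_livy_sessions

sessions = list_livy_sessions(lakehouse="MyLakehouse", workspace="MyWorkspace")  # hypothetical names
active = sessions[sessions["End Date Time"].isna()]  # sessions without an end time yet
print(active[["Livy Id", "Spark Application Id", "State:", "Submitted Date Time"]])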