semantic-link-labs 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (41)
  1. {semantic_link_labs-0.9.3.dist-info → semantic_link_labs-0.9.4.dist-info}/METADATA +9 -6
  2. {semantic_link_labs-0.9.3.dist-info → semantic_link_labs-0.9.4.dist-info}/RECORD +41 -31
  3. {semantic_link_labs-0.9.3.dist-info → semantic_link_labs-0.9.4.dist-info}/WHEEL +1 -1
  4. sempy_labs/__init__.py +27 -1
  5. sempy_labs/_capacity_migration.py +3 -2
  6. sempy_labs/_dax.py +17 -3
  7. sempy_labs/_delta_analyzer.py +279 -127
  8. sempy_labs/_eventhouses.py +70 -1
  9. sempy_labs/_generate_semantic_model.py +30 -9
  10. sempy_labs/_helper_functions.py +30 -1
  11. sempy_labs/_job_scheduler.py +226 -2
  12. sempy_labs/_list_functions.py +40 -16
  13. sempy_labs/_model_bpa.py +15 -0
  14. sempy_labs/_model_bpa_rules.py +12 -2
  15. sempy_labs/_semantic_models.py +117 -0
  16. sempy_labs/_sql.py +73 -6
  17. sempy_labs/_sqldatabase.py +227 -0
  18. sempy_labs/admin/__init__.py +49 -8
  19. sempy_labs/admin/_activities.py +166 -0
  20. sempy_labs/admin/_apps.py +143 -0
  21. sempy_labs/admin/_basic_functions.py +32 -652
  22. sempy_labs/admin/_capacities.py +250 -0
  23. sempy_labs/admin/_datasets.py +184 -0
  24. sempy_labs/admin/_domains.py +1 -1
  25. sempy_labs/admin/_items.py +3 -1
  26. sempy_labs/admin/_reports.py +165 -0
  27. sempy_labs/admin/_scanner.py +0 -1
  28. sempy_labs/admin/_shared.py +74 -0
  29. sempy_labs/admin/_tenant.py +489 -0
  30. sempy_labs/directlake/_dl_helper.py +0 -1
  31. sempy_labs/directlake/_update_directlake_partition_entity.py +6 -0
  32. sempy_labs/graph/_teams.py +1 -1
  33. sempy_labs/graph/_users.py +9 -1
  34. sempy_labs/lakehouse/_shortcuts.py +28 -15
  35. sempy_labs/report/__init__.py +3 -1
  36. sempy_labs/report/_download_report.py +4 -1
  37. sempy_labs/report/_export_report.py +272 -0
  38. sempy_labs/report/_report_functions.py +9 -261
  39. sempy_labs/tom/_model.py +278 -29
  40. {semantic_link_labs-0.9.3.dist-info → semantic_link_labs-0.9.4.dist-info}/LICENSE +0 -0
  41. {semantic_link_labs-0.9.3.dist-info → semantic_link_labs-0.9.4.dist-info}/top_level.txt +0 -0
sempy_labs/_delta_analyzer.py
@@ -1,5 +1,8 @@
 import pandas as pd
-import datetime
+import re
+from datetime import datetime
+import os
+from uuid import UUID
 from typing import Dict, Optional
 import pyarrow.dataset as ds
 import pyarrow.parquet as pq
@@ -12,20 +15,43 @@ from sempy_labs._helper_functions import (
     resolve_workspace_name_and_id,
     resolve_lakehouse_name_and_id,
     _read_delta_table,
-    _delta_table_row_count,
+    _mount,
+    _create_spark_session,
 )
+from sempy._utils._log import log
 from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
 from sempy_labs.lakehouse._lakehouse import lakehouse_attached
 import sempy_labs._icons as icons
-from uuid import UUID
+from tqdm.auto import tqdm
+
 
+def get_parquet_file_infos(path):
+
+    import notebookutils
 
+    files = []
+    items = notebookutils.fs.ls(path)
+    for item in items:
+        if item.isDir:
+            # Ignore the _delta_log directory
+            if "_delta_log" not in item.path:
+                files.extend(get_parquet_file_infos(item.path))
+        else:
+            # Filter out non-Parquet files and files with size 0
+            if item.path.endswith(".parquet") and item.size > 0:
+                files.append((item.path, item.size))
+    return files
+
+
+@log
 def delta_analyzer(
     table_name: str,
     approx_distinct_count: bool = True,
     export: bool = False,
     lakehouse: Optional[str | UUID] = None,
     workspace: Optional[str | UUID] = None,
+    column_stats: bool = True,
+    skip_cardinality: bool = True,
 ) -> Dict[str, pd.DataFrame]:
     """
     Analyzes a delta table and shows the results in dictionary containing a set of 5 dataframes. If 'export' is set to True, the results will be saved to delta tables in the lakehouse attached to the notebook.
@@ -55,13 +81,20 @@ def delta_analyzer(
         The Fabric workspace name or ID used by the lakehouse.
         Defaults to None which resolves to the workspace of the attached lakehouse
         or if no lakehouse attached, resolves to the workspace of the notebook.
+    column_stats : bool, default=True
+        If True, collects data about column chunks and columns. If False, skips that step and only returns the other 3 dataframes.
+    skip_cardinality : bool, default=True
+        If True, skips the cardinality calculation for each column. If False, calculates the cardinality for each column.
 
     Returns
     -------
     Dict[str, pandas.DataFrame]
         A dictionary of pandas dataframes showing semantic model objects which violated the best practice analyzer rules.
     """
-    import notebookutils
+
+    # Must calculate column stats if calculating cardinality
+    if not skip_cardinality:
+        column_stats = True
 
     # display_toggle = notebookutils.common.configs.pandas_display
 
@@ -70,70 +103,60 @@
     # notebookutils.common.configs.pandas_display = False
 
     prefix = "SLL_DeltaAnalyzer_"
-    now = datetime.datetime.now()
+    now = datetime.now()
     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=workspace)
     (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
         lakehouse=lakehouse, workspace=workspace
     )
     path = create_abfss_path(lakehouse_id, workspace_id, table_name)
-    lake_path = create_abfss_path(lakehouse_id, workspace_id)
-    mounts = notebookutils.fs.mounts()
-    mount_point = f"/{workspace_name.replace(' ', '')}{lakehouse_name.replace(' ', '')}"
-    if not any(i.get("source") == lake_path for i in mounts):
-        # Mount lakehouse if not mounted
-        notebookutils.fs.mount(lake_path, mount_point)
-        print(
-            f"{icons.green_dot} Mounted the '{lakehouse_name}' lakehouse within the '{workspace_name}' to the notebook."
-        )
-
-    mounts = notebookutils.fs.mounts()
-    local_path = next(
-        i.get("localPath") for i in mounts if i.get("source") == lake_path
-    )
+    local_path = _mount(lakehouse=lakehouse, workspace=workspace)
     table_path = f"{local_path}/Tables/{table_name}"
+    delta_table_path = create_abfss_path(lakehouse_id, workspace_id, table_name)
 
     # Set back to original value
     # notebookutils.common.configs.pandas_display = display_toggle
 
     parquet_file_df_columns = {
-        "ParquetFile": "string",
-        "RowCount": "int",
-        "RowGroups": "int",
+        # "Dataset": "string",
+        "Parquet File": "string",
+        "Row Count": "int",
+        "Row Groups": "int",
+        "Created By": "string",
+        "Total Table Rows": "int",
+        "Total Table Row Groups": "int",
     }
     row_group_df_columns = {
-        "ParquetFile": "string",
-        "RowGroupID": "int",
-        "RowCount": "int",
-        "CompressedSize": "int",
-        "UncompressedSize": "int",
-        "CompressionRatio": "float",
+        # "Dataset": "string",
+        "Parquet File": "string",
+        "Row Group ID": "int",
+        "Row Count": "int",
+        "Compressed Size": "int",
+        "Uncompressed Size": "int",
+        "Compression Ratio": "float",
+        "Total Table Rows": "int",
+        "Ratio Of Total Table Rows": "float",
+        "Total Table Row Groups": "int",
     }
     column_chunk_df_columns = {
-        "ParquetFile": "string",
-        "ColumnID": "int",
-        "ColumnName": "string",
-        "ColumnType": "string",
-        "CompressedSize": "int",
-        "UncompressedSize": "int",
-        "HasDict": "bool",
-        "DictOffset": "int_fillna",
-        "ValueCount": "int",
+        # "Dataset": "string",
+        "Parquet File": "string",
+        "Column ID": "int",
+        "Column Name": "string",
+        "Column Type": "string",
+        "Compressed Size": "int",
+        "Uncompressed Size": "int",
+        "Has Dict": "bool",
+        "Dict Offset": "int_fillna",
+        "Value Count": "int",
         "Encodings": "string",
+        "Statistics": "string",
+        "Primative Type": "string",
     }
 
     parquet_file_df = _create_dataframe(columns=parquet_file_df_columns)
     row_group_df = _create_dataframe(columns=row_group_df_columns)
     column_chunk_df = _create_dataframe(columns=column_chunk_df_columns)
 
-    # delta_table = DeltaTable.forPath(spark, path)
-    # detail_df = spark.sql(f"DESCRIBE DETAIL `{table_name}`").collect()[0]
-
-    # num_files = detail_df.numFiles
-    # size_in_bytes = detail_df.sizeInBytes
-
-    latest_files = _read_delta_table(path).inputFiles()
-    file_paths = [f.split("/")[-1] for f in latest_files]
-    row_count = _delta_table_row_count(table_name)
     row_groups = 0
     max_rows_per_row_group = 0
     min_rows_per_row_group = float("inf")
@@ -141,21 +164,67 @@
     schema = ds.dataset(table_path).schema.metadata
     is_vorder = any(b"vorder" in key for key in schema.keys())
 
-    for file_name in file_paths:
-        parquet_file = pq.ParquetFile(f"{table_path}/{file_name}")
+    # Get the common details of the Delta table
+    spark = _create_spark_session()
+
+    from delta import DeltaTable
+
+    delta_table = DeltaTable.forPath(spark, delta_table_path)
+    table_df = delta_table.toDF()
+    # total_partition_count = table_df.rdd.getNumPartitions()
+    row_count = table_df.count()
+    table_details = delta_table.detail().collect()[0].asDict()
+    # created_at = table_details.get("createdAt")
+    # last_modified = table_details.get("lastModified")
+    # partition_columns = table_details.get("partitionColumns")
+    # clustering_columns = table_details.get("clusteringColumns")
+    num_latest_files = table_details.get("numFiles", 0)
+    # size_in_bytes = table_details.get("sizeInBytes")
+    # min_reader_version = table_details.get("minReaderVersion")
+    # min_writer_version = table_details.get("minWriterVersion")
+
+    latest_files = _read_delta_table(path).inputFiles()
+    # file_paths = [f.split("/")[-1] for f in latest_files]
+    all_parquet_files = get_parquet_file_infos(delta_table_path)
+    common_file_paths = set(
+        [file_info[0] for file_info in all_parquet_files]
+    ).intersection(set(latest_files))
+    latest_version_files = [
+        file_info
+        for file_info in all_parquet_files
+        if file_info[0] in common_file_paths
+    ]
+
+    for idx, (file_path, file_size) in enumerate(
+        bar := tqdm(latest_version_files), start=1
+    ):
+        file_name = os.path.basename(file_path)
+        bar.set_description(
+            f"Analyzing the '{file_name}' parquet file ({idx}/{num_latest_files})..."
+        )
+
+        relative_path = file_path.split("Tables/")[1]
+        file_system_path = f"{local_path}/Tables/{relative_path}"
+        parquet_file = pq.ParquetFile(file_system_path)
+
         row_groups += parquet_file.num_row_groups
 
         # Generate rowgroup dataframe
         new_data = {
-            "ParquetFile": file_name,
-            "RowCount": parquet_file.metadata.num_rows,
-            "RowGroups": parquet_file.num_row_groups,
+            # "Dataset": "Parquet Files",
+            "Parquet File": file_name,
+            "Row Count": parquet_file.metadata.num_rows,
+            "Row Groups": parquet_file.num_row_groups,
+            "Created By": parquet_file.metadata.created_by,
+            "Total Table Rows": -1,
+            "Total Table Row Groups": -1,
        }
 
         parquet_file_df = pd.concat(
             [parquet_file_df, pd.DataFrame(new_data, index=[0])], ignore_index=True
         )
 
+        # Loop through the row groups
         for i in range(parquet_file.num_row_groups):
             row_group = parquet_file.metadata.row_group(i)
             num_rows = row_group.num_rows
@@ -166,38 +235,50 @@
             total_compressed_size = 0
             total_uncompressed_size = 0
 
-            for j in range(row_group.num_columns):
-                column_chunk = row_group.column(j)
-                total_compressed_size += column_chunk.total_compressed_size
-                total_uncompressed_size += column_chunk.total_uncompressed_size
-
-                # Generate Column Chunk Dataframe
-                new_data = {
-                    "ParquetFile": file_name,
-                    "ColumnID": j,
-                    "ColumnName": column_chunk.path_in_schema,
-                    "ColumnType": column_chunk.physical_type,
-                    "CompressedSize": column_chunk.total_compressed_size,
-                    "UncompressedSize": column_chunk.total_uncompressed_size,
-                    "HasDict": column_chunk.has_dictionary_page,
-                    "DictOffset": column_chunk.dictionary_page_offset,
-                    "ValueCount": column_chunk.num_values,
-                    "Encodings": str(column_chunk.encodings),
-                }
-
-                column_chunk_df = pd.concat(
-                    [column_chunk_df, pd.DataFrame(new_data, index=[0])],
-                    ignore_index=True,
-                )
+            # Loop through the columns
+            if column_stats:
+                for j in range(row_group.num_columns):
+                    column_chunk = row_group.column(j)
+                    total_compressed_size += column_chunk.total_compressed_size
+                    total_uncompressed_size += column_chunk.total_uncompressed_size
+
+                    # Generate Column Chunk Dataframe
+                    new_data = {
+                        # "Dataset": "Column Chunks",
+                        "Parquet File": file_name,
+                        "Column ID": j,
+                        "Column Name": column_chunk.path_in_schema,
+                        "Column Type": column_chunk.physical_type,
+                        "Compressed Size": column_chunk.total_compressed_size,
+                        "Uncompressed Size": column_chunk.total_uncompressed_size,
+                        "Has Dict": column_chunk.has_dictionary_page,
+                        "Dict Offset": column_chunk.dictionary_page_offset,
+                        "Value Count": column_chunk.num_values,
+                        "Encodings": str(column_chunk.encodings),
+                        "Statistics": column_chunk.statistics,
+                        "PrimativeType": column_chunk.physical_type,
+                    }
+
+                    column_chunk_df = pd.concat(
+                        [column_chunk_df, pd.DataFrame(new_data, index=[0])],
+                        ignore_index=True,
+                    )
 
             # Generate rowgroup dataframe
             new_data = {
-                "ParquetFile": file_name,
-                "RowGroupID": i + 1,
-                "RowCount": num_rows,
-                "CompressedSize": total_compressed_size,
-                "UncompressedSize": total_uncompressed_size,
-                "CompressionRatio": total_compressed_size / total_uncompressed_size,
+                # "Dataset": "Row Groups",
+                "Parquet File": file_name,
+                "Row Group ID": i + 1,
+                "Row Count": num_rows,
+                "Compressed Size": total_compressed_size,
+                "Uncompressed Size": total_uncompressed_size,
+                "Compression Ratio": (
+                    total_compressed_size / total_uncompressed_size
+                    if column_stats
+                    else 0
+                ),
+                "Total Table Rows": -1,
+                "Total Table Row Groups": -1,
             }
 
             if not row_group_df.empty:
@@ -213,68 +294,89 @@
     summary_df = pd.DataFrame(
         [
             {
-                "RowCount": row_count,
-                "RowGroups": row_groups,
-                "ParquetFiles": len(file_paths),
-                "MaxRowsPerRowGroup": max_rows_per_row_group,
-                "MinRowsPerRowGroup": min_rows_per_row_group,
-                "AvgRowsPerRowGroup": avg_rows_per_row_group,
-                "VOrderEnabled": is_vorder,
+                # "Dataset": "Summary",
+                "Row Count": row_count,
+                "Row Groups": row_groups,
+                "Parquet Files": num_latest_files,
+                "Max Rows Per Row Group": max_rows_per_row_group,
+                "Min Rows Per Row Group": min_rows_per_row_group,
+                "Avg Rows Per Row Group": avg_rows_per_row_group,
+                "VOrder Enabled": is_vorder,
                 # "VOrderLevel": v_order_level,
             }
         ]
     )
 
     # Clean up data types
-    _update_dataframe_datatypes(
-        dataframe=column_chunk_df, column_map=column_chunk_df_columns
-    )
     _update_dataframe_datatypes(dataframe=row_group_df, column_map=row_group_df_columns)
     _update_dataframe_datatypes(
         dataframe=parquet_file_df, column_map=parquet_file_df_columns
     )
 
     # Generate column dataframe
-    column_df = column_chunk_df.groupby(
-        ["ColumnName", "ColumnType"], as_index=False
-    ).agg({"CompressedSize": "sum", "UncompressedSize": "sum"})
-
-    # Add distinct count to column_df
-    for ind, r in column_df.iterrows():
-        col_name = r["ColumnName"]
-        if approx_distinct_count:
-            dc = _get_column_aggregate(
-                table_name=table_name,
-                column_name=col_name,
-                function="approx",
-                lakehouse=lakehouse,
-                workspace=workspace,
-            )
-        else:
-            dc = _get_column_aggregate(
-                table_name=table_name,
-                column_name=col_name,
-                function="distinctcount",
-                lakehouse=lakehouse,
-                workspace=workspace,
-            )
+    if column_stats:
+        _update_dataframe_datatypes(
+            dataframe=column_chunk_df, column_map=column_chunk_df_columns
+        )
+        column_df = column_chunk_df.groupby(
+            ["Column Name", "Column Type"], as_index=False
+        ).agg({"Compressed Size": "sum", "Uncompressed Size": "sum"})
+
+        # Add distinct count to column_df
+        if not skip_cardinality:
+            for ind, r in column_df.iterrows():
+                col_name = r["Column Name"]
+                if approx_distinct_count:
+                    function = "approx"
+                else:
+                    function = "distinctcount"
+                dc = _get_column_aggregate(
+                    table_name=table_name,
+                    column_name=col_name,
+                    function=function,
+                    lakehouse=lakehouse,
+                    workspace=workspace,
+                )
+
+                if "Cardinality" not in column_df.columns:
+                    column_df["Cardinality"] = None
 
-        if "Cardinality" not in column_df.columns:
-            column_df["Cardinality"] = None
+                column_df.at[ind, "Cardinality"] = dc
 
-        column_df.at[ind, "Cardinality"] = dc
+    summary_df["Total Size"] = column_df["Compressed Size"].sum()
 
-    column_df["Cardinality"] = column_df["Cardinality"].astype(int)
-    summary_df["TotalSize"] = column_df["CompressedSize"].sum()
+    parquet_file_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+    parquet_file_df["Total Table Row Groups"] = parquet_file_df["Row Groups"].sum()
+
+    row_group_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+    row_group_df["Total Table Row Groups"] = parquet_file_df["Row Groups"].sum()
+    total_rows = row_group_df["Row Count"].sum()
+    row_group_df["Ratio Of Total Table Rows"] = (
+        row_group_df["Row Count"] / total_rows * 100.0
+    )
+
+    if column_stats:
+        column_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+        column_df["Table Size"] = column_df["Compressed Size"].sum()
+        column_df["Size Percent Of Table"] = (
+            column_df["Compressed Size"] / column_df["Table Size"] * 100.0
+        )
+    if not skip_cardinality and column_stats:
+        column_df["Cardinality"] = column_df["Cardinality"].fillna(0).astype(int)
+        column_df["Cardinality Of Total Rows"] = (
+            column_df["Cardinality"] / column_df["Total Table Rows"] * 100.0
+        )
 
     dataframes = {
         "Summary": summary_df,
         "Parquet Files": parquet_file_df,
         "Row Groups": row_group_df,
-        "Column Chunks": column_chunk_df,
-        "Columns": column_df,
     }
 
+    if column_stats:
+        dataframes["Column Chunks"] = column_chunk_df
+        dataframes["Columns"] = column_df
+
     save_table = f"{prefix}Summary"
 
     if export:
@@ -295,11 +397,11 @@ def delta_analyzer(
     for name, df in dataframes.items():
         name = name.replace(" ", "")
         cols = {
-            "WorkspaceName": workspace_name,
-            "WorkspaceId": workspace_id,
-            "LakehouseName": lakehouse_name,
-            "LakehouseId": lakehouse_id,
-            "TableName": table_name,
+            "Workspace Name": workspace_name,
+            "Workspace Id": workspace_id,
+            "Lakehouse Name": lakehouse_name,
+            "Lakehouse Id": lakehouse_id,
+            "Table Name": table_name,
         }
         for i, (col, param) in enumerate(cols.items()):
            df[col] = param
@@ -309,8 +411,10 @@ def delta_analyzer(
         df["Timestamp"] = pd.to_datetime(df["Timestamp"])
 
         if export:
-            df["RunId"] = runId
-            df["RunId"] = df["RunId"].astype(int)
+            df["Run Id"] = runId
+            df["Run Id"] = df["Run Id"].astype(int)
+
+            df.columns = df.columns.str.replace(" ", "")
             save_as_delta_table(
                 dataframe=df,
                 delta_table_name=f"{prefix}{name}",
@@ -319,3 +423,51 @@ def delta_analyzer(
             )
 
     return dataframes
+
+
+@log
+def get_delta_table_history(
+    table_name: str,
+    lakehouse: Optional[str | UUID] = None,
+    workspace: Optional[str | UUID] = None,
+) -> pd.DataFrame:
+    """
+    Returns the history of a delta table as a pandas dataframe.
+
+    Parameters
+    ----------
+    table_name : str
+        The delta table name.
+    lakehouse : str | uuid.UUID, default=None
+        The Fabric lakehouse name or ID.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A dataframe showing the history of the delta table.
+    """
+
+    def camel_to_title(text):
+        return re.sub(r"([a-z])([A-Z])", r"\1 \2", text).title()
+
+    spark = _create_spark_session()
+
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=workspace)
+    (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
+        lakehouse=lakehouse, workspace=workspace
+    )
+    path = create_abfss_path(lakehouse_id, workspace_id, table_name)
+
+    from delta import DeltaTable
+
+    delta_table = DeltaTable.forPath(spark, path)
+    df = delta_table.history().toPandas()
+
+    df.rename(columns=lambda col: camel_to_title(col), inplace=True)
+
+    return df
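Taken together, the _delta_analyzer.py changes above add two switches to `delta_analyzer` (`column_stats` and `skip_cardinality`) and introduce a new `get_delta_table_history` function. The following is a minimal usage sketch based only on the signatures and docstrings shown in this diff; the table and lakehouse names are placeholders, and a Fabric notebook with version 0.9.4 installed is assumed:

from sempy_labs._delta_analyzer import delta_analyzer, get_delta_table_history

# "FactSales" and "MyLakehouse" are placeholder names for this sketch.
results = delta_analyzer(
    table_name="FactSales",
    lakehouse="MyLakehouse",
    column_stats=True,       # also build the "Column Chunks" and "Columns" dataframes
    skip_cardinality=True,   # skip the per-column distinct-count pass
)
print(results["Summary"])

# New in 0.9.4: the table's Delta history as a pandas dataframe with title-cased columns.
history = get_delta_table_history(table_name="FactSales", lakehouse="MyLakehouse")
print(history.head())

Leaving `skip_cardinality=True` avoids the per-column distinct-count queries, which per the new docstring is the cheaper default these parameters are meant to enable.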
sempy_labs/_eventhouses.py
@@ -7,12 +7,18 @@ from sempy_labs._helper_functions import (
     _print_success,
     resolve_item_id,
     _create_dataframe,
+    _conv_b64,
+    _decode_b64,
 )
 from uuid import UUID
+import sempy_labs._icons as icons
 
 
 def create_eventhouse(
-    name: str, description: Optional[str] = None, workspace: Optional[str | UUID] = None
+    name: str,
+    definition: Optional[dict],
+    description: Optional[str] = None,
+    workspace: Optional[str | UUID] = None,
 ):
     """
     Creates a Fabric eventhouse.
@@ -23,6 +29,8 @@ def create_eventhouse(
     ----------
     name: str
         Name of the eventhouse.
+    definition : dict
+        The definition (EventhouseProperties.json) of the eventhouse.
     description : str, default=None
         A description of the environment.
     workspace : str | uuid.UUID, default=None
@@ -38,6 +46,20 @@ def create_eventhouse(
     if description:
         payload["description"] = description
 
+    if definition is not None:
+        if not isinstance(definition, dict):
+            raise ValueError(f"{icons.red_dot} The definition must be a dictionary.")
+
+        payload["definition"] = {
+            "parts": [
+                {
+                    "path": "EventhouseProperties.json",
+                    "payload": _conv_b64(definition),
+                    "payloadType": "InlineBase64",
+                }
+            ]
+        }
+
     _base_api(
         request=f"/v1/workspaces/{workspace_id}/eventhouses",
         method="post",
@@ -123,3 +145,50 @@ def delete_eventhouse(name: str, workspace: Optional[str | UUID] = None):
         workspace_name=workspace_name,
         action="deleted",
     )
+
+
+def get_eventhouse_definition(
+    eventhouse: str | UUID,
+    workspace: Optional[str | UUID] = None,
+    return_dataframe: bool = False,
+) -> dict | pd.DataFrame:
+    """
+    Gets the eventhouse definition.
+
+    This is a wrapper function for the following API: `Items - Get Eventhouse Definition <https://learn.microsoft.com/rest/api/fabric/eventhouse/items/get-eventhouse-definition>`_.
+
+    Parameters
+    ----------
+    eventhouse : str
+        Name of the eventhouse.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID in which the eventhouse resides.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    return_dataframe : bool, default=False
+        If True, returns a dataframe. If False, returns a json dictionary.
+
+    Returns
+    -------
+    dict | pandas.DataFrame
+        The eventhouse definition in .json format or as a pandas dataframe.
+    """
+
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
+    item_id = resolve_item_id(item=eventhouse, type="Eventhouse", workspace=workspace)
+
+    result = _base_api(
+        request=f"/v1/workspaces/{workspace_id}/eventhouses/{item_id}/getDefinition",
+        method="post",
+        status_codes=None,
+        lro_return_json=True,
+    )
+
+    df = pd.json_normalize(result["definition"]["parts"])
+
+    if return_dataframe:
+        return df
+    else:
+        df_filt = df[df["path"] == "EventhouseProperties.json"]
+        payload = df_filt["payload"].iloc[0]
+        return _decode_b64(payload)
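The _eventhouses.py changes let `create_eventhouse` carry an inline `definition` (shipped as an `EventhouseProperties.json` part) and add `get_eventhouse_definition` to read it back. Below is a hedged sketch against the signatures above; the eventhouse and workspace names are placeholders, and the empty dict merely stands in for real EventhouseProperties.json content, whose schema this diff does not define:

from sempy_labs._eventhouses import create_eventhouse, get_eventhouse_definition

# Placeholder names; an empty dict stands in for real EventhouseProperties.json content.
create_eventhouse(
    name="SalesEventhouse",
    definition={},                 # new in 0.9.4: encoded as an InlineBase64 definition part
    description="Demo eventhouse",
    workspace="MyWorkspace",
)

# Read the definition back, either decoded or as a dataframe of definition parts.
decoded = get_eventhouse_definition(eventhouse="SalesEventhouse", workspace="MyWorkspace")
parts_df = get_eventhouse_definition(
    eventhouse="SalesEventhouse", workspace="MyWorkspace", return_dataframe=True
)
print(parts_df["path"].tolist())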