semantic-link-labs 0.9.2__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (54)
  1. {semantic_link_labs-0.9.2.dist-info → semantic_link_labs-0.9.4.dist-info}/METADATA +10 -6
  2. {semantic_link_labs-0.9.2.dist-info → semantic_link_labs-0.9.4.dist-info}/RECORD +54 -44
  3. {semantic_link_labs-0.9.2.dist-info → semantic_link_labs-0.9.4.dist-info}/WHEEL +1 -1
  4. sempy_labs/__init__.py +27 -1
  5. sempy_labs/_ai.py +8 -5
  6. sempy_labs/_capacity_migration.py +3 -2
  7. sempy_labs/_connections.py +45 -9
  8. sempy_labs/_dax.py +17 -3
  9. sempy_labs/_delta_analyzer.py +308 -138
  10. sempy_labs/_eventhouses.py +70 -1
  11. sempy_labs/_gateways.py +56 -8
  12. sempy_labs/_generate_semantic_model.py +30 -9
  13. sempy_labs/_helper_functions.py +84 -9
  14. sempy_labs/_job_scheduler.py +226 -2
  15. sempy_labs/_list_functions.py +42 -19
  16. sempy_labs/_ml_experiments.py +1 -1
  17. sempy_labs/_model_bpa.py +17 -2
  18. sempy_labs/_model_bpa_rules.py +20 -8
  19. sempy_labs/_semantic_models.py +117 -0
  20. sempy_labs/_sql.py +73 -6
  21. sempy_labs/_sqldatabase.py +227 -0
  22. sempy_labs/_translations.py +2 -2
  23. sempy_labs/_vertipaq.py +3 -3
  24. sempy_labs/_warehouses.py +1 -1
  25. sempy_labs/admin/__init__.py +49 -8
  26. sempy_labs/admin/_activities.py +166 -0
  27. sempy_labs/admin/_apps.py +143 -0
  28. sempy_labs/admin/_basic_functions.py +32 -652
  29. sempy_labs/admin/_capacities.py +250 -0
  30. sempy_labs/admin/_datasets.py +184 -0
  31. sempy_labs/admin/_domains.py +1 -3
  32. sempy_labs/admin/_items.py +3 -1
  33. sempy_labs/admin/_reports.py +165 -0
  34. sempy_labs/admin/_scanner.py +53 -49
  35. sempy_labs/admin/_shared.py +74 -0
  36. sempy_labs/admin/_tenant.py +489 -0
  37. sempy_labs/directlake/_dl_helper.py +0 -1
  38. sempy_labs/directlake/_update_directlake_partition_entity.py +6 -0
  39. sempy_labs/graph/_teams.py +1 -1
  40. sempy_labs/graph/_users.py +9 -1
  41. sempy_labs/lakehouse/_get_lakehouse_columns.py +2 -2
  42. sempy_labs/lakehouse/_get_lakehouse_tables.py +2 -2
  43. sempy_labs/lakehouse/_lakehouse.py +3 -3
  44. sempy_labs/lakehouse/_shortcuts.py +29 -16
  45. sempy_labs/migration/_migrate_calctables_to_lakehouse.py +2 -2
  46. sempy_labs/migration/_refresh_calc_tables.py +2 -2
  47. sempy_labs/report/__init__.py +3 -1
  48. sempy_labs/report/_download_report.py +4 -1
  49. sempy_labs/report/_export_report.py +272 -0
  50. sempy_labs/report/_report_functions.py +11 -263
  51. sempy_labs/report/_report_rebind.py +1 -1
  52. sempy_labs/tom/_model.py +281 -29
  53. {semantic_link_labs-0.9.2.dist-info → semantic_link_labs-0.9.4.dist-info}/LICENSE +0 -0
  54. {semantic_link_labs-0.9.2.dist-info → semantic_link_labs-0.9.4.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,11 @@
 import pandas as pd
-import datetime
-from typing import Dict
+import re
+from datetime import datetime
+import os
+from uuid import UUID
+from typing import Dict, Optional
 import pyarrow.dataset as ds
 import pyarrow.parquet as pq
-from pyspark.sql import SparkSession
 from sempy_labs._helper_functions import (
     create_abfss_path,
     save_as_delta_table,
@@ -12,19 +14,47 @@ from sempy_labs._helper_functions import (
     _update_dataframe_datatypes,
     resolve_workspace_name_and_id,
     resolve_lakehouse_name_and_id,
+    _read_delta_table,
+    _mount,
+    _create_spark_session,
 )
+from sempy._utils._log import log
 from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
 from sempy_labs.lakehouse._lakehouse import lakehouse_attached
 import sempy_labs._icons as icons
+from tqdm.auto import tqdm
 
 
+def get_parquet_file_infos(path):
+
+    import notebookutils
+
+    files = []
+    items = notebookutils.fs.ls(path)
+    for item in items:
+        if item.isDir:
+            # Ignore the _delta_log directory
+            if "_delta_log" not in item.path:
+                files.extend(get_parquet_file_infos(item.path))
+        else:
+            # Filter out non-Parquet files and files with size 0
+            if item.path.endswith(".parquet") and item.size > 0:
+                files.append((item.path, item.size))
+    return files
+
+
+@log
 def delta_analyzer(
     table_name: str,
     approx_distinct_count: bool = True,
     export: bool = False,
+    lakehouse: Optional[str | UUID] = None,
+    workspace: Optional[str | UUID] = None,
+    column_stats: bool = True,
+    skip_cardinality: bool = True,
 ) -> Dict[str, pd.DataFrame]:
     """
-    Analyzes a delta table and shows the results in dictionary containing a set of 5 dataframes. The table being analyzed must be in the lakehouse attached to the notebook.
+    Analyzes a delta table and shows the results in dictionary containing a set of 5 dataframes. If 'export' is set to True, the results will be saved to delta tables in the lakehouse attached to the notebook.
 
     The 5 dataframes returned by this function are:
 
@@ -44,6 +74,17 @@ def delta_analyzer(
         If True, uses approx_count_distinct to calculate the cardinality of each column. If False, uses COUNT(DISTINCT) instead.
     export : bool, default=False
         If True, exports the resulting dataframes to delta tables in the lakehouse attached to the notebook.
+    lakehouse : str | uuid.UUID, default=None
+        The Fabric lakehouse name or ID.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    column_stats : bool, default=True
+        If True, collects data about column chunks and columns. If False, skips that step and only returns the other 3 dataframes.
+    skip_cardinality : bool, default=True
+        If True, skips the cardinality calculation for each column. If False, calculates the cardinality for each column.
 
     Returns
     -------
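
For reference, a minimal usage sketch of the updated delta_analyzer signature shown above. Parameter names come from this diff; the top-level import path and the example table, lakehouse, and workspace names are assumptions, not part of the release:

    import sempy_labs as labs  # assumed import path for the public API

    # Analyze a table in an explicit lakehouse/workspace rather than the attached lakehouse,
    # skipping the per-column cardinality pass to keep the run cheap.
    results = labs.delta_analyzer(
        table_name="FactSales",    # hypothetical table name
        lakehouse="MyLakehouse",   # hypothetical lakehouse name
        workspace="MyWorkspace",   # hypothetical workspace name
        column_stats=True,
        skip_cardinality=True,
    )
    print(results["Summary"])
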
@@ -51,99 +92,139 @@ def delta_analyzer(
         A dictionary of pandas dataframes showing semantic model objects which violated the best practice analyzer rules.
     """
 
-    if not lakehouse_attached():
-        raise ValueError(
-            f"{icons.red_dot} No lakehouse is attached to this notebook. Please attach a lakehouse to the notebook before running the Delta Analyzer."
-        )
+    # Must calculate column stats if calculating cardinality
+    if not skip_cardinality:
+        column_stats = True
+
+    # display_toggle = notebookutils.common.configs.pandas_display
+
+    # Turn off notebookutils display
+    # if display_toggle is True:
+    #     notebookutils.common.configs.pandas_display = False
 
     prefix = "SLL_DeltaAnalyzer_"
-    now = datetime.datetime.now()
-    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=None)
+    now = datetime.now()
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=workspace)
     (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
-        lakehouse=None, workspace=None
+        lakehouse=lakehouse, workspace=workspace
     )
     path = create_abfss_path(lakehouse_id, workspace_id, table_name)
-    table_path = f"/lakehouse/default/Tables/{table_name}"
+    local_path = _mount(lakehouse=lakehouse, workspace=workspace)
+    table_path = f"{local_path}/Tables/{table_name}"
+    delta_table_path = create_abfss_path(lakehouse_id, workspace_id, table_name)
+
+    # Set back to original value
+    # notebookutils.common.configs.pandas_display = display_toggle
 
     parquet_file_df_columns = {
-        "ParquetFile": "string",
-        "RowCount": "int",
-        "RowGroups": "int",
+        # "Dataset": "string",
+        "Parquet File": "string",
+        "Row Count": "int",
+        "Row Groups": "int",
+        "Created By": "string",
+        "Total Table Rows": "int",
+        "Total Table Row Groups": "int",
     }
     row_group_df_columns = {
-        "ParquetFile": "string",
-        "RowGroupID": "int",
-        "RowCount": "int",
-        "CompressedSize": "int",
-        "UncompressedSize": "int",
-        "CompressionRatio": "float",
+        # "Dataset": "string",
+        "Parquet File": "string",
+        "Row Group ID": "int",
+        "Row Count": "int",
+        "Compressed Size": "int",
+        "Uncompressed Size": "int",
+        "Compression Ratio": "float",
+        "Total Table Rows": "int",
+        "Ratio Of Total Table Rows": "float",
+        "Total Table Row Groups": "int",
     }
     column_chunk_df_columns = {
-        "ParquetFile": "string",
-        "ColumnID": "int",
-        "ColumnName": "string",
-        "ColumnType": "string",
-        "CompressedSize": "int",
-        "UncompressedSize": "int",
-        "HasDict": "bool",
-        "DictOffset": "int_fillna",
-        "ValueCount": "int",
+        # "Dataset": "string",
+        "Parquet File": "string",
+        "Column ID": "int",
+        "Column Name": "string",
+        "Column Type": "string",
+        "Compressed Size": "int",
+        "Uncompressed Size": "int",
+        "Has Dict": "bool",
+        "Dict Offset": "int_fillna",
+        "Value Count": "int",
         "Encodings": "string",
+        "Statistics": "string",
+        "Primative Type": "string",
     }
 
     parquet_file_df = _create_dataframe(columns=parquet_file_df_columns)
     row_group_df = _create_dataframe(columns=row_group_df_columns)
     column_chunk_df = _create_dataframe(columns=column_chunk_df_columns)
 
-    spark = SparkSession.builder.getOrCreate()
-    # delta_table = DeltaTable.forPath(spark, path)
-    # detail_df = spark.sql(f"DESCRIBE DETAIL `{table_name}`").collect()[0]
-
-    # num_files = detail_df.numFiles
-    # size_in_bytes = detail_df.sizeInBytes
-
-    latest_files = spark.read.format("delta").load(path).inputFiles()
-    file_paths = [f.split("/")[-1] for f in latest_files]
-    row_count = spark.table(table_name).count()
     row_groups = 0
     max_rows_per_row_group = 0
     min_rows_per_row_group = float("inf")
-    # dt = DeltaTable.forPath(spark, path)
-    # schema = dt.toDF().schema
-    # is_vorder = False
-    # if (
-    #     dt.detail()
-    #     .collect()[0]
-    #     .asDict()
-    #     .get("properties")
-    #     .get("delta.parquet.vorder.enabled")
-    #     == "true"
-    # ):
-    #     is_vorder = True
 
     schema = ds.dataset(table_path).schema.metadata
     is_vorder = any(b"vorder" in key for key in schema.keys())
-    # v_order_level = (
-    #     int(schema.get(b"com.microsoft.parquet.vorder.level").decode("utf-8"))
-    #     if is_vorder
-    #     else None
-    # )
-
-    for file_name in file_paths:
-        parquet_file = pq.ParquetFile(f"{table_path}/{file_name}")
+
+    # Get the common details of the Delta table
+    spark = _create_spark_session()
+
+    from delta import DeltaTable
+
+    delta_table = DeltaTable.forPath(spark, delta_table_path)
+    table_df = delta_table.toDF()
+    # total_partition_count = table_df.rdd.getNumPartitions()
+    row_count = table_df.count()
+    table_details = delta_table.detail().collect()[0].asDict()
+    # created_at = table_details.get("createdAt")
+    # last_modified = table_details.get("lastModified")
+    # partition_columns = table_details.get("partitionColumns")
+    # clustering_columns = table_details.get("clusteringColumns")
+    num_latest_files = table_details.get("numFiles", 0)
+    # size_in_bytes = table_details.get("sizeInBytes")
+    # min_reader_version = table_details.get("minReaderVersion")
+    # min_writer_version = table_details.get("minWriterVersion")
+
+    latest_files = _read_delta_table(path).inputFiles()
+    # file_paths = [f.split("/")[-1] for f in latest_files]
+    all_parquet_files = get_parquet_file_infos(delta_table_path)
+    common_file_paths = set(
+        [file_info[0] for file_info in all_parquet_files]
+    ).intersection(set(latest_files))
+    latest_version_files = [
+        file_info
+        for file_info in all_parquet_files
+        if file_info[0] in common_file_paths
+    ]
+
+    for idx, (file_path, file_size) in enumerate(
+        bar := tqdm(latest_version_files), start=1
+    ):
+        file_name = os.path.basename(file_path)
+        bar.set_description(
+            f"Analyzing the '{file_name}' parquet file ({idx}/{num_latest_files})..."
+        )
+
+        relative_path = file_path.split("Tables/")[1]
+        file_system_path = f"{local_path}/Tables/{relative_path}"
+        parquet_file = pq.ParquetFile(file_system_path)
+
         row_groups += parquet_file.num_row_groups
 
         # Generate rowgroup dataframe
         new_data = {
-            "ParquetFile": file_name,
-            "RowCount": parquet_file.metadata.num_rows,
-            "RowGroups": parquet_file.num_row_groups,
+            # "Dataset": "Parquet Files",
+            "Parquet File": file_name,
+            "Row Count": parquet_file.metadata.num_rows,
+            "Row Groups": parquet_file.num_row_groups,
+            "Created By": parquet_file.metadata.created_by,
+            "Total Table Rows": -1,
+            "Total Table Row Groups": -1,
         }
 
         parquet_file_df = pd.concat(
             [parquet_file_df, pd.DataFrame(new_data, index=[0])], ignore_index=True
         )
 
+        # Loop through the row groups
         for i in range(parquet_file.num_row_groups):
             row_group = parquet_file.metadata.row_group(i)
             num_rows = row_group.num_rows
@@ -154,38 +235,50 @@ def delta_analyzer(
             total_compressed_size = 0
             total_uncompressed_size = 0
 
-            for j in range(row_group.num_columns):
-                column_chunk = row_group.column(j)
-                total_compressed_size += column_chunk.total_compressed_size
-                total_uncompressed_size += column_chunk.total_uncompressed_size
-
-                # Generate Column Chunk Dataframe
-                new_data = {
-                    "ParquetFile": file_name,
-                    "ColumnID": j,
-                    "ColumnName": column_chunk.path_in_schema,
-                    "ColumnType": column_chunk.physical_type,
-                    "CompressedSize": column_chunk.total_compressed_size,
-                    "UncompressedSize": column_chunk.total_uncompressed_size,
-                    "HasDict": column_chunk.has_dictionary_page,
-                    "DictOffset": column_chunk.dictionary_page_offset,
-                    "ValueCount": column_chunk.num_values,
-                    "Encodings": str(column_chunk.encodings),
-                }
-
-                column_chunk_df = pd.concat(
-                    [column_chunk_df, pd.DataFrame(new_data, index=[0])],
-                    ignore_index=True,
-                )
+            # Loop through the columns
+            if column_stats:
+                for j in range(row_group.num_columns):
+                    column_chunk = row_group.column(j)
+                    total_compressed_size += column_chunk.total_compressed_size
+                    total_uncompressed_size += column_chunk.total_uncompressed_size
+
+                    # Generate Column Chunk Dataframe
+                    new_data = {
+                        # "Dataset": "Column Chunks",
+                        "Parquet File": file_name,
+                        "Column ID": j,
+                        "Column Name": column_chunk.path_in_schema,
+                        "Column Type": column_chunk.physical_type,
+                        "Compressed Size": column_chunk.total_compressed_size,
+                        "Uncompressed Size": column_chunk.total_uncompressed_size,
+                        "Has Dict": column_chunk.has_dictionary_page,
+                        "Dict Offset": column_chunk.dictionary_page_offset,
+                        "Value Count": column_chunk.num_values,
+                        "Encodings": str(column_chunk.encodings),
+                        "Statistics": column_chunk.statistics,
+                        "PrimativeType": column_chunk.physical_type,
+                    }
+
+                    column_chunk_df = pd.concat(
+                        [column_chunk_df, pd.DataFrame(new_data, index=[0])],
+                        ignore_index=True,
+                    )
 
             # Generate rowgroup dataframe
             new_data = {
-                "ParquetFile": file_name,
-                "RowGroupID": i + 1,
-                "RowCount": num_rows,
-                "CompressedSize": total_compressed_size,
-                "UncompressedSize": total_uncompressed_size,
-                "CompressionRatio": total_compressed_size / total_uncompressed_size,
+                # "Dataset": "Row Groups",
+                "Parquet File": file_name,
+                "Row Group ID": i + 1,
+                "Row Count": num_rows,
+                "Compressed Size": total_compressed_size,
+                "Uncompressed Size": total_uncompressed_size,
+                "Compression Ratio": (
+                    total_compressed_size / total_uncompressed_size
+                    if column_stats
+                    else 0
+                ),
+                "Total Table Rows": -1,
+                "Total Table Row Groups": -1,
             }
 
             if not row_group_df.empty:
@@ -201,87 +294,114 @@ def delta_analyzer(
     summary_df = pd.DataFrame(
         [
             {
-                "RowCount": row_count,
-                "RowGroups": row_groups,
-                "ParquetFiles": len(file_paths),
-                "MaxRowsPerRowGroup": max_rows_per_row_group,
-                "MinRowsPerRowGroup": min_rows_per_row_group,
-                "AvgRowsPerRowGroup": avg_rows_per_row_group,
-                "VOrderEnabled": is_vorder,
+                # "Dataset": "Summary",
+                "Row Count": row_count,
+                "Row Groups": row_groups,
+                "Parquet Files": num_latest_files,
+                "Max Rows Per Row Group": max_rows_per_row_group,
+                "Min Rows Per Row Group": min_rows_per_row_group,
+                "Avg Rows Per Row Group": avg_rows_per_row_group,
+                "VOrder Enabled": is_vorder,
                 # "VOrderLevel": v_order_level,
             }
         ]
     )
 
     # Clean up data types
-    _update_dataframe_datatypes(
-        dataframe=column_chunk_df, column_map=column_chunk_df_columns
-    )
     _update_dataframe_datatypes(dataframe=row_group_df, column_map=row_group_df_columns)
     _update_dataframe_datatypes(
         dataframe=parquet_file_df, column_map=parquet_file_df_columns
     )
 
     # Generate column dataframe
-    column_df = column_chunk_df.groupby(
-        ["ColumnName", "ColumnType"], as_index=False
-    ).agg({"CompressedSize": "sum", "UncompressedSize": "sum"})
-
-    # Add distinct count to column_df
-    for ind, r in column_df.iterrows():
-        col_name = r["ColumnName"]
-        if approx_distinct_count:
-            dc = _get_column_aggregate(
-                table_name=table_name,
-                column_name=col_name,
-                function="approx",
-                lakehouse=lakehouse_name,
-            )
-        else:
-            dc = _get_column_aggregate(
-                table_name=table_name,
-                column_name=col_name,
-                function="distinctcount",
-                lakehouse=lakehouse_name,
-            )
+    if column_stats:
+        _update_dataframe_datatypes(
+            dataframe=column_chunk_df, column_map=column_chunk_df_columns
+        )
+        column_df = column_chunk_df.groupby(
+            ["Column Name", "Column Type"], as_index=False
+        ).agg({"Compressed Size": "sum", "Uncompressed Size": "sum"})
+
+        # Add distinct count to column_df
+        if not skip_cardinality:
+            for ind, r in column_df.iterrows():
+                col_name = r["Column Name"]
+                if approx_distinct_count:
+                    function = "approx"
+                else:
+                    function = "distinctcount"
+                dc = _get_column_aggregate(
+                    table_name=table_name,
+                    column_name=col_name,
+                    function=function,
+                    lakehouse=lakehouse,
+                    workspace=workspace,
+                )
 
-        if "Cardinality" not in column_df.columns:
-            column_df["Cardinality"] = None
+                if "Cardinality" not in column_df.columns:
+                    column_df["Cardinality"] = None
 
-        column_df.at[ind, "Cardinality"] = dc
+                column_df.at[ind, "Cardinality"] = dc
 
-    column_df["Cardinality"] = column_df["Cardinality"].astype(int)
-    summary_df["TotalSize"] = column_df["CompressedSize"].sum()
+        summary_df["Total Size"] = column_df["Compressed Size"].sum()
+
+    parquet_file_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+    parquet_file_df["Total Table Row Groups"] = parquet_file_df["Row Groups"].sum()
+
+    row_group_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+    row_group_df["Total Table Row Groups"] = parquet_file_df["Row Groups"].sum()
+    total_rows = row_group_df["Row Count"].sum()
+    row_group_df["Ratio Of Total Table Rows"] = (
+        row_group_df["Row Count"] / total_rows * 100.0
+    )
+
+    if column_stats:
+        column_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+        column_df["Table Size"] = column_df["Compressed Size"].sum()
+        column_df["Size Percent Of Table"] = (
+            column_df["Compressed Size"] / column_df["Table Size"] * 100.0
+        )
+    if not skip_cardinality and column_stats:
+        column_df["Cardinality"] = column_df["Cardinality"].fillna(0).astype(int)
+        column_df["Cardinality Of Total Rows"] = (
+            column_df["Cardinality"] / column_df["Total Table Rows"] * 100.0
+        )
 
     dataframes = {
         "Summary": summary_df,
         "Parquet Files": parquet_file_df,
         "Row Groups": row_group_df,
-        "Column Chunks": column_chunk_df,
-        "Columns": column_df,
     }
 
+    if column_stats:
+        dataframes["Column Chunks"] = column_chunk_df
+        dataframes["Columns"] = column_df
+
     save_table = f"{prefix}Summary"
 
     if export:
+        if not lakehouse_attached():
+            raise ValueError(
+                f"{icons.red_dot} No lakehouse is attached to this notebook. Please attach a lakehouse to the notebook before running the Delta Analyzer."
+            )
         dfL = get_lakehouse_tables()
         dfL_filt = dfL[dfL["Table Name"] == save_table]
         if dfL_filt.empty:
             runId = 1
         else:
             max_run_id = _get_column_aggregate(
-                lakehouse=lakehouse_name, table_name=save_table
+                table_name=save_table,
             )
             runId = max_run_id + 1
 
     for name, df in dataframes.items():
         name = name.replace(" ", "")
         cols = {
-            "WorkspaceName": workspace_name,
-            "WorkspaceId": workspace_id,
-            "LakehouseName": lakehouse_name,
-            "LakehouseId": lakehouse_id,
-            "TableName": table_name,
+            "Workspace Name": workspace_name,
+            "Workspace Id": workspace_id,
+            "Lakehouse Name": lakehouse_name,
+            "Lakehouse Id": lakehouse_id,
+            "Table Name": table_name,
        }
         for i, (col, param) in enumerate(cols.items()):
             df[col] = param
@@ -291,8 +411,10 @@ def delta_analyzer(
         df["Timestamp"] = pd.to_datetime(df["Timestamp"])
 
         if export:
-            df["RunId"] = runId
-            df["RunId"] = df["RunId"].astype(int)
+            df["Run Id"] = runId
+            df["Run Id"] = df["Run Id"].astype(int)
+
+            df.columns = df.columns.str.replace(" ", "")
             save_as_delta_table(
                 dataframe=df,
                 delta_table_name=f"{prefix}{name}",
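
The export branch above determines the destination table names: the "SLL_DeltaAnalyzer_" prefix plus the dataframe key with spaces removed. A minimal stand-alone sketch of that naming logic, using the dataframe keys visible in this diff:

    prefix = "SLL_DeltaAnalyzer_"
    dataframe_names = ["Summary", "Parquet Files", "Row Groups", "Column Chunks", "Columns"]
    # Prints: ['SLL_DeltaAnalyzer_Summary', 'SLL_DeltaAnalyzer_ParquetFiles', 'SLL_DeltaAnalyzer_RowGroups',
    #          'SLL_DeltaAnalyzer_ColumnChunks', 'SLL_DeltaAnalyzer_Columns']
    print([f"{prefix}{name.replace(' ', '')}" for name in dataframe_names])
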
@@ -301,3 +423,51 @@ def delta_analyzer(
             )
 
     return dataframes
+
+
+@log
+def get_delta_table_history(
+    table_name: str,
+    lakehouse: Optional[str | UUID] = None,
+    workspace: Optional[str | UUID] = None,
+) -> pd.DataFrame:
+    """
+    Returns the history of a delta table as a pandas dataframe.
+
+    Parameters
+    ----------
+    table_name : str
+        The delta table name.
+    lakehouse : str | uuid.UUID, default=None
+        The Fabric lakehouse name or ID.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A dataframe showing the history of the delta table.
+    """
+
+    def camel_to_title(text):
+        return re.sub(r"([a-z])([A-Z])", r"\1 \2", text).title()
+
+    spark = _create_spark_session()
+
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=workspace)
+    (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
+        lakehouse=lakehouse, workspace=workspace
+    )
+    path = create_abfss_path(lakehouse_id, workspace_id, table_name)
+
+    from delta import DeltaTable
+
+    delta_table = DeltaTable.forPath(spark, path)
+    df = delta_table.history().toPandas()
+
+    df.rename(columns=lambda col: camel_to_title(col), inplace=True)
+
+    return df
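
For reference, a minimal usage sketch of the new get_delta_table_history function. The top-level import path and the example table name are assumptions; the selected columns follow the standard Delta history schema after the camelCase-to-title-case rename performed above:

    from sempy_labs import get_delta_table_history  # assumed import path

    # History of a table in the attached lakehouse; Delta log columns such as
    # 'operationMetrics' come back renamed to 'Operation Metrics'.
    history_df = get_delta_table_history(table_name="FactSales")  # hypothetical table name
    print(history_df[["Version", "Timestamp", "Operation"]].head())
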