semantic-link-labs 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {semantic_link_labs-0.9.3.dist-info → semantic_link_labs-0.9.4.dist-info}/METADATA +9 -6
- {semantic_link_labs-0.9.3.dist-info → semantic_link_labs-0.9.4.dist-info}/RECORD +41 -31
- {semantic_link_labs-0.9.3.dist-info → semantic_link_labs-0.9.4.dist-info}/WHEEL +1 -1
- sempy_labs/__init__.py +27 -1
- sempy_labs/_capacity_migration.py +3 -2
- sempy_labs/_dax.py +17 -3
- sempy_labs/_delta_analyzer.py +279 -127
- sempy_labs/_eventhouses.py +70 -1
- sempy_labs/_generate_semantic_model.py +30 -9
- sempy_labs/_helper_functions.py +30 -1
- sempy_labs/_job_scheduler.py +226 -2
- sempy_labs/_list_functions.py +40 -16
- sempy_labs/_model_bpa.py +15 -0
- sempy_labs/_model_bpa_rules.py +12 -2
- sempy_labs/_semantic_models.py +117 -0
- sempy_labs/_sql.py +73 -6
- sempy_labs/_sqldatabase.py +227 -0
- sempy_labs/admin/__init__.py +49 -8
- sempy_labs/admin/_activities.py +166 -0
- sempy_labs/admin/_apps.py +143 -0
- sempy_labs/admin/_basic_functions.py +32 -652
- sempy_labs/admin/_capacities.py +250 -0
- sempy_labs/admin/_datasets.py +184 -0
- sempy_labs/admin/_domains.py +1 -1
- sempy_labs/admin/_items.py +3 -1
- sempy_labs/admin/_reports.py +165 -0
- sempy_labs/admin/_scanner.py +0 -1
- sempy_labs/admin/_shared.py +74 -0
- sempy_labs/admin/_tenant.py +489 -0
- sempy_labs/directlake/_dl_helper.py +0 -1
- sempy_labs/directlake/_update_directlake_partition_entity.py +6 -0
- sempy_labs/graph/_teams.py +1 -1
- sempy_labs/graph/_users.py +9 -1
- sempy_labs/lakehouse/_shortcuts.py +28 -15
- sempy_labs/report/__init__.py +3 -1
- sempy_labs/report/_download_report.py +4 -1
- sempy_labs/report/_export_report.py +272 -0
- sempy_labs/report/_report_functions.py +9 -261
- sempy_labs/tom/_model.py +278 -29
- {semantic_link_labs-0.9.3.dist-info → semantic_link_labs-0.9.4.dist-info}/LICENSE +0 -0
- {semantic_link_labs-0.9.3.dist-info → semantic_link_labs-0.9.4.dist-info}/top_level.txt +0 -0
sempy_labs/_delta_analyzer.py
CHANGED
@@ -1,5 +1,8 @@
 import pandas as pd
-import
+import re
+from datetime import datetime
+import os
+from uuid import UUID
 from typing import Dict, Optional
 import pyarrow.dataset as ds
 import pyarrow.parquet as pq
@@ -12,20 +15,43 @@ from sempy_labs._helper_functions import (
     resolve_workspace_name_and_id,
     resolve_lakehouse_name_and_id,
     _read_delta_table,
-
+    _mount,
+    _create_spark_session,
 )
+from sempy._utils._log import log
 from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
 from sempy_labs.lakehouse._lakehouse import lakehouse_attached
 import sempy_labs._icons as icons
-from
+from tqdm.auto import tqdm
+

+def get_parquet_file_infos(path):
+
+    import notebookutils

+    files = []
+    items = notebookutils.fs.ls(path)
+    for item in items:
+        if item.isDir:
+            # Ignore the _delta_log directory
+            if "_delta_log" not in item.path:
+                files.extend(get_parquet_file_infos(item.path))
+        else:
+            # Filter out non-Parquet files and files with size 0
+            if item.path.endswith(".parquet") and item.size > 0:
+                files.append((item.path, item.size))
+    return files
+
+
+@log
 def delta_analyzer(
     table_name: str,
     approx_distinct_count: bool = True,
     export: bool = False,
     lakehouse: Optional[str | UUID] = None,
     workspace: Optional[str | UUID] = None,
+    column_stats: bool = True,
+    skip_cardinality: bool = True,
 ) -> Dict[str, pd.DataFrame]:
     """
     Analyzes a delta table and shows the results in dictionary containing a set of 5 dataframes. If 'export' is set to True, the results will be saved to delta tables in the lakehouse attached to the notebook.
@@ -55,13 +81,20 @@ def delta_analyzer(
         The Fabric workspace name or ID used by the lakehouse.
         Defaults to None which resolves to the workspace of the attached lakehouse
         or if no lakehouse attached, resolves to the workspace of the notebook.
+    column_stats : bool, default=True
+        If True, collects data about column chunks and columns. If False, skips that step and only returns the other 3 dataframes.
+    skip_cardinality : bool, default=True
+        If True, skips the cardinality calculation for each column. If False, calculates the cardinality for each column.

     Returns
     -------
     Dict[str, pandas.DataFrame]
         A dictionary of pandas dataframes showing semantic model objects which violated the best practice analyzer rules.
     """
-
+
+    # Must calculate column stats if calculating cardinality
+    if not skip_cardinality:
+        column_stats = True

     # display_toggle = notebookutils.common.configs.pandas_display

@@ -70,70 +103,60 @@ def delta_analyzer(
     # notebookutils.common.configs.pandas_display = False

     prefix = "SLL_DeltaAnalyzer_"
-    now = datetime.
+    now = datetime.now()
     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=workspace)
     (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
         lakehouse=lakehouse, workspace=workspace
     )
     path = create_abfss_path(lakehouse_id, workspace_id, table_name)
-
-    mounts = notebookutils.fs.mounts()
-    mount_point = f"/{workspace_name.replace(' ', '')}{lakehouse_name.replace(' ', '')}"
-    if not any(i.get("source") == lake_path for i in mounts):
-        # Mount lakehouse if not mounted
-        notebookutils.fs.mount(lake_path, mount_point)
-        print(
-            f"{icons.green_dot} Mounted the '{lakehouse_name}' lakehouse within the '{workspace_name}' to the notebook."
-        )
-
-    mounts = notebookutils.fs.mounts()
-    local_path = next(
-        i.get("localPath") for i in mounts if i.get("source") == lake_path
-    )
+    local_path = _mount(lakehouse=lakehouse, workspace=workspace)
     table_path = f"{local_path}/Tables/{table_name}"
+    delta_table_path = create_abfss_path(lakehouse_id, workspace_id, table_name)

     # Set back to original value
     # notebookutils.common.configs.pandas_display = display_toggle

     parquet_file_df_columns = {
-        "
-        "
-        "
+        # "Dataset": "string",
+        "Parquet File": "string",
+        "Row Count": "int",
+        "Row Groups": "int",
+        "Created By": "string",
+        "Total Table Rows": "int",
+        "Total Table Row Groups": "int",
     }
     row_group_df_columns = {
-        "
-        "
-        "
-        "
-        "
-        "
+        # "Dataset": "string",
+        "Parquet File": "string",
+        "Row Group ID": "int",
+        "Row Count": "int",
+        "Compressed Size": "int",
+        "Uncompressed Size": "int",
+        "Compression Ratio": "float",
+        "Total Table Rows": "int",
+        "Ratio Of Total Table Rows": "float",
+        "Total Table Row Groups": "int",
     }
     column_chunk_df_columns = {
-        "
-        "
-        "
-        "
-        "
-        "
-        "
-        "
-        "
+        # "Dataset": "string",
+        "Parquet File": "string",
+        "Column ID": "int",
+        "Column Name": "string",
+        "Column Type": "string",
+        "Compressed Size": "int",
+        "Uncompressed Size": "int",
+        "Has Dict": "bool",
+        "Dict Offset": "int_fillna",
+        "Value Count": "int",
         "Encodings": "string",
+        "Statistics": "string",
+        "Primative Type": "string",
     }

     parquet_file_df = _create_dataframe(columns=parquet_file_df_columns)
     row_group_df = _create_dataframe(columns=row_group_df_columns)
     column_chunk_df = _create_dataframe(columns=column_chunk_df_columns)

-    # delta_table = DeltaTable.forPath(spark, path)
-    # detail_df = spark.sql(f"DESCRIBE DETAIL `{table_name}`").collect()[0]
-
-    # num_files = detail_df.numFiles
-    # size_in_bytes = detail_df.sizeInBytes
-
-    latest_files = _read_delta_table(path).inputFiles()
-    file_paths = [f.split("/")[-1] for f in latest_files]
-    row_count = _delta_table_row_count(table_name)
     row_groups = 0
     max_rows_per_row_group = 0
     min_rows_per_row_group = float("inf")
@@ -141,21 +164,67 @@ def delta_analyzer(
     schema = ds.dataset(table_path).schema.metadata
     is_vorder = any(b"vorder" in key for key in schema.keys())

-
-
+    # Get the common details of the Delta table
+    spark = _create_spark_session()
+
+    from delta import DeltaTable
+
+    delta_table = DeltaTable.forPath(spark, delta_table_path)
+    table_df = delta_table.toDF()
+    # total_partition_count = table_df.rdd.getNumPartitions()
+    row_count = table_df.count()
+    table_details = delta_table.detail().collect()[0].asDict()
+    # created_at = table_details.get("createdAt")
+    # last_modified = table_details.get("lastModified")
+    # partition_columns = table_details.get("partitionColumns")
+    # clustering_columns = table_details.get("clusteringColumns")
+    num_latest_files = table_details.get("numFiles", 0)
+    # size_in_bytes = table_details.get("sizeInBytes")
+    # min_reader_version = table_details.get("minReaderVersion")
+    # min_writer_version = table_details.get("minWriterVersion")
+
+    latest_files = _read_delta_table(path).inputFiles()
+    # file_paths = [f.split("/")[-1] for f in latest_files]
+    all_parquet_files = get_parquet_file_infos(delta_table_path)
+    common_file_paths = set(
+        [file_info[0] for file_info in all_parquet_files]
+    ).intersection(set(latest_files))
+    latest_version_files = [
+        file_info
+        for file_info in all_parquet_files
+        if file_info[0] in common_file_paths
+    ]
+
+    for idx, (file_path, file_size) in enumerate(
+        bar := tqdm(latest_version_files), start=1
+    ):
+        file_name = os.path.basename(file_path)
+        bar.set_description(
+            f"Analyzing the '{file_name}' parquet file ({idx}/{num_latest_files})..."
+        )
+
+        relative_path = file_path.split("Tables/")[1]
+        file_system_path = f"{local_path}/Tables/{relative_path}"
+        parquet_file = pq.ParquetFile(file_system_path)
+
         row_groups += parquet_file.num_row_groups

         # Generate rowgroup dataframe
         new_data = {
-            "
-            "
-            "
+            # "Dataset": "Parquet Files",
+            "Parquet File": file_name,
+            "Row Count": parquet_file.metadata.num_rows,
+            "Row Groups": parquet_file.num_row_groups,
+            "Created By": parquet_file.metadata.created_by,
+            "Total Table Rows": -1,
+            "Total Table Row Groups": -1,
         }

         parquet_file_df = pd.concat(
             [parquet_file_df, pd.DataFrame(new_data, index=[0])], ignore_index=True
         )

+        # Loop through the row groups
         for i in range(parquet_file.num_row_groups):
             row_group = parquet_file.metadata.row_group(i)
             num_rows = row_group.num_rows
@@ -166,38 +235,50 @@ def delta_analyzer(
             total_compressed_size = 0
             total_uncompressed_size = 0

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Loop through the columns
+            if column_stats:
+                for j in range(row_group.num_columns):
+                    column_chunk = row_group.column(j)
+                    total_compressed_size += column_chunk.total_compressed_size
+                    total_uncompressed_size += column_chunk.total_uncompressed_size
+
+                    # Generate Column Chunk Dataframe
+                    new_data = {
+                        # "Dataset": "Column Chunks",
+                        "Parquet File": file_name,
+                        "Column ID": j,
+                        "Column Name": column_chunk.path_in_schema,
+                        "Column Type": column_chunk.physical_type,
+                        "Compressed Size": column_chunk.total_compressed_size,
+                        "Uncompressed Size": column_chunk.total_uncompressed_size,
+                        "Has Dict": column_chunk.has_dictionary_page,
+                        "Dict Offset": column_chunk.dictionary_page_offset,
+                        "Value Count": column_chunk.num_values,
+                        "Encodings": str(column_chunk.encodings),
+                        "Statistics": column_chunk.statistics,
+                        "PrimativeType": column_chunk.physical_type,
+                    }
+
+                    column_chunk_df = pd.concat(
+                        [column_chunk_df, pd.DataFrame(new_data, index=[0])],
+                        ignore_index=True,
+                    )

             # Generate rowgroup dataframe
             new_data = {
-                "
-                "
-                "
-                "
-                "
-                "
+                # "Dataset": "Row Groups",
+                "Parquet File": file_name,
+                "Row Group ID": i + 1,
+                "Row Count": num_rows,
+                "Compressed Size": total_compressed_size,
+                "Uncompressed Size": total_uncompressed_size,
+                "Compression Ratio": (
+                    total_compressed_size / total_uncompressed_size
+                    if column_stats
+                    else 0
+                ),
+                "Total Table Rows": -1,
+                "Total Table Row Groups": -1,
             }

             if not row_group_df.empty:
@@ -213,68 +294,89 @@ def delta_analyzer(
     summary_df = pd.DataFrame(
         [
             {
-                "
-                "
-                "
-                "
-                "
-                "
-                "
+                # "Dataset": "Summary",
+                "Row Count": row_count,
+                "Row Groups": row_groups,
+                "Parquet Files": num_latest_files,
+                "Max Rows Per Row Group": max_rows_per_row_group,
+                "Min Rows Per Row Group": min_rows_per_row_group,
+                "Avg Rows Per Row Group": avg_rows_per_row_group,
+                "VOrder Enabled": is_vorder,
                 # "VOrderLevel": v_order_level,
             }
         ]
     )

     # Clean up data types
-    _update_dataframe_datatypes(
-        dataframe=column_chunk_df, column_map=column_chunk_df_columns
-    )
     _update_dataframe_datatypes(dataframe=row_group_df, column_map=row_group_df_columns)
     _update_dataframe_datatypes(
         dataframe=parquet_file_df, column_map=parquet_file_df_columns
     )

     # Generate column dataframe
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if column_stats:
+        _update_dataframe_datatypes(
+            dataframe=column_chunk_df, column_map=column_chunk_df_columns
+        )
+        column_df = column_chunk_df.groupby(
+            ["Column Name", "Column Type"], as_index=False
+        ).agg({"Compressed Size": "sum", "Uncompressed Size": "sum"})
+
+        # Add distinct count to column_df
+        if not skip_cardinality:
+            for ind, r in column_df.iterrows():
+                col_name = r["Column Name"]
+                if approx_distinct_count:
+                    function = "approx"
+                else:
+                    function = "distinctcount"
+                dc = _get_column_aggregate(
+                    table_name=table_name,
+                    column_name=col_name,
+                    function=function,
+                    lakehouse=lakehouse,
+                    workspace=workspace,
+                )
+
+                if "Cardinality" not in column_df.columns:
+                    column_df["Cardinality"] = None

-
-    column_df["Cardinality"] = None
+                column_df.at[ind, "Cardinality"] = dc

-
+        summary_df["Total Size"] = column_df["Compressed Size"].sum()

-
-
+    parquet_file_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+    parquet_file_df["Total Table Row Groups"] = parquet_file_df["Row Groups"].sum()
+
+    row_group_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+    row_group_df["Total Table Row Groups"] = parquet_file_df["Row Groups"].sum()
+    total_rows = row_group_df["Row Count"].sum()
+    row_group_df["Ratio Of Total Table Rows"] = (
+        row_group_df["Row Count"] / total_rows * 100.0
+    )
+
+    if column_stats:
+        column_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+        column_df["Table Size"] = column_df["Compressed Size"].sum()
+        column_df["Size Percent Of Table"] = (
+            column_df["Compressed Size"] / column_df["Table Size"] * 100.0
+        )
+    if not skip_cardinality and column_stats:
+        column_df["Cardinality"] = column_df["Cardinality"].fillna(0).astype(int)
+        column_df["Cardinality Of Total Rows"] = (
+            column_df["Cardinality"] / column_df["Total Table Rows"] * 100.0
+        )

     dataframes = {
         "Summary": summary_df,
         "Parquet Files": parquet_file_df,
         "Row Groups": row_group_df,
-        "Column Chunks": column_chunk_df,
-        "Columns": column_df,
     }

+    if column_stats:
+        dataframes["Column Chunks"] = column_chunk_df
+        dataframes["Columns"] = column_df
+
     save_table = f"{prefix}Summary"

     if export:
@@ -295,11 +397,11 @@ def delta_analyzer(
     for name, df in dataframes.items():
         name = name.replace(" ", "")
         cols = {
-            "
-            "
-            "
-            "
-            "
+            "Workspace Name": workspace_name,
+            "Workspace Id": workspace_id,
+            "Lakehouse Name": lakehouse_name,
+            "Lakehouse Id": lakehouse_id,
+            "Table Name": table_name,
         }
         for i, (col, param) in enumerate(cols.items()):
            df[col] = param
@@ -309,8 +411,10 @@ def delta_analyzer(
         df["Timestamp"] = pd.to_datetime(df["Timestamp"])

         if export:
-            df["
-            df["
+            df["Run Id"] = runId
+            df["Run Id"] = df["Run Id"].astype(int)
+
+            df.columns = df.columns.str.replace(" ", "")
             save_as_delta_table(
                 dataframe=df,
                 delta_table_name=f"{prefix}{name}",
@@ -319,3 +423,51 @@ def delta_analyzer(
             )

     return dataframes
+
+
+@log
+def get_delta_table_history(
+    table_name: str,
+    lakehouse: Optional[str | UUID] = None,
+    workspace: Optional[str | UUID] = None,
+) -> pd.DataFrame:
+    """
+    Returns the history of a delta table as a pandas dataframe.
+
+    Parameters
+    ----------
+    table_name : str
+        The delta table name.
+    lakehouse : str | uuid.UUID, default=None
+        The Fabric lakehouse name or ID.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A dataframe showing the history of the delta table.
+    """
+
+    def camel_to_title(text):
+        return re.sub(r"([a-z])([A-Z])", r"\1 \2", text).title()
+
+    spark = _create_spark_session()
+
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=workspace)
+    (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
+        lakehouse=lakehouse, workspace=workspace
+    )
+    path = create_abfss_path(lakehouse_id, workspace_id, table_name)
+
+    from delta import DeltaTable
+
+    delta_table = DeltaTable.forPath(spark, path)
+    df = delta_table.history().toPandas()
+
+    df.rename(columns=lambda col: camel_to_title(col), inplace=True)
+
+    return df
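The hunks above add two per-table knobs (column_stats, skip_cardinality) and a new get_delta_table_history helper. Below is a minimal usage sketch, assuming a Fabric notebook with an attached lakehouse, an illustrative table named 'sales', and that both functions are re-exported at the package root (the sempy_labs/__init__.py change listed above suggests new exports, but the exact export list is not shown in this diff):

import sempy_labs as labs

# Analyze a Delta table. With skip_cardinality=True (the default) no per-column
# distinct counts are computed; with column_stats=True the result also includes
# the "Column Chunks" and "Columns" dataframes.
results = labs.delta_analyzer(
    table_name="sales",        # illustrative table name
    column_stats=True,
    skip_cardinality=True,
)
print(results["Summary"])
print(results["Row Groups"].head())

# Setting skip_cardinality=False forces column_stats=True and adds a
# "Cardinality" column (approximate by default via approx_distinct_count=True).
detailed = labs.delta_analyzer(table_name="sales", skip_cardinality=False)
print(detailed["Columns"].head())

# New in 0.9.4: the Delta transaction log as a pandas dataframe.
history = labs.get_delta_table_history(table_name="sales")
print(history.head())

Per the diff, when column_stats=False the returned dictionary contains only the Summary, Parquet Files and Row Groups dataframes.
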
sempy_labs/_eventhouses.py
CHANGED
@@ -7,12 +7,18 @@ from sempy_labs._helper_functions import (
     _print_success,
     resolve_item_id,
     _create_dataframe,
+    _conv_b64,
+    _decode_b64,
 )
 from uuid import UUID
+import sempy_labs._icons as icons


 def create_eventhouse(
-    name: str,
+    name: str,
+    definition: Optional[dict],
+    description: Optional[str] = None,
+    workspace: Optional[str | UUID] = None,
 ):
     """
     Creates a Fabric eventhouse.
@@ -23,6 +29,8 @@ def create_eventhouse(
     ----------
     name: str
         Name of the eventhouse.
+    definition : dict
+        The definition (EventhouseProperties.json) of the eventhouse.
     description : str, default=None
         A description of the environment.
     workspace : str | uuid.UUID, default=None
@@ -38,6 +46,20 @@ def create_eventhouse(
     if description:
         payload["description"] = description

+    if definition is not None:
+        if not isinstance(definition, dict):
+            raise ValueError(f"{icons.red_dot} The definition must be a dictionary.")
+
+        payload["definition"] = {
+            "parts": [
+                {
+                    "path": "EventhouseProperties.json",
+                    "payload": _conv_b64(definition),
+                    "payloadType": "InlineBase64",
+                }
+            ]
+        }
+
     _base_api(
         request=f"/v1/workspaces/{workspace_id}/eventhouses",
         method="post",
@@ -123,3 +145,50 @@ def delete_eventhouse(name: str, workspace: Optional[str | UUID] = None):
         workspace_name=workspace_name,
         action="deleted",
     )
+
+
+def get_eventhouse_definition(
+    eventhouse: str | UUID,
+    workspace: Optional[str | UUID] = None,
+    return_dataframe: bool = False,
+) -> dict | pd.DataFrame:
+    """
+    Gets the eventhouse definition.
+
+    This is a wrapper function for the following API: `Items - Get Eventhouse Definition <https://learn.microsoft.com/rest/api/fabric/eventhouse/items/get-eventhouse-definition>`_.
+
+    Parameters
+    ----------
+    eventhouse : str
+        Name of the eventhouse.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID in which the eventhouse resides.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    return_dataframe : bool, default=False
+        If True, returns a dataframe. If False, returns a json dictionary.
+
+    Returns
+    -------
+    dict | pandas.DataFrame
+        The eventhouse definition in .json format or as a pandas dataframe.
+    """
+
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
+    item_id = resolve_item_id(item=eventhouse, type="Eventhouse", workspace=workspace)
+
+    result = _base_api(
+        request=f"/v1/workspaces/{workspace_id}/eventhouses/{item_id}/getDefinition",
+        method="post",
+        status_codes=None,
+        lro_return_json=True,
+    )
+
+    df = pd.json_normalize(result["definition"]["parts"])
+
+    if return_dataframe:
+        return df
+    else:
+        df_filt = df[df["path"] == "EventhouseProperties.json"]
+        payload = df_filt["payload"].iloc[0]
+        return _decode_b64(payload)
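A minimal sketch of the new definition round-trip shown above: create_eventhouse now accepts a definition dictionary that is base64-encoded into an EventhouseProperties.json part, and get_eventhouse_definition reads it back. The imports use the module path from this diff (the functions may also be re-exported at the package root); the eventhouse name and the property key in the example payload are illustrative, since the exact keys Fabric accepts in EventhouseProperties.json are not part of this diff:

from sempy_labs._eventhouses import create_eventhouse, get_eventhouse_definition

# Illustrative EventhouseProperties.json payload; key names are not confirmed by this diff.
definition = {"minimumConsumptionUnits": 2.25}

# Creates the eventhouse; a non-dict definition raises a ValueError, and the dict
# is sent as an InlineBase64 part named "EventhouseProperties.json".
create_eventhouse(
    name="MyEventhouse",                  # illustrative name
    definition=definition,
    description="Created via semantic-link-labs",
)

# Read the definition back. return_dataframe=True returns the raw definition parts
# as a dataframe; False (the default) decodes the EventhouseProperties.json payload.
props = get_eventhouse_definition(eventhouse="MyEventhouse")
print(props)

Note that definition has no default in the new create_eventhouse signature, so it must be passed explicitly; per the diff, None is accepted and simply skips the definition part.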