semantic-link-labs 0.9.2__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of semantic-link-labs might be problematic; see the package registry for more details.
- {semantic_link_labs-0.9.2.dist-info → semantic_link_labs-0.9.4.dist-info}/METADATA +10 -6
- {semantic_link_labs-0.9.2.dist-info → semantic_link_labs-0.9.4.dist-info}/RECORD +54 -44
- {semantic_link_labs-0.9.2.dist-info → semantic_link_labs-0.9.4.dist-info}/WHEEL +1 -1
- sempy_labs/__init__.py +27 -1
- sempy_labs/_ai.py +8 -5
- sempy_labs/_capacity_migration.py +3 -2
- sempy_labs/_connections.py +45 -9
- sempy_labs/_dax.py +17 -3
- sempy_labs/_delta_analyzer.py +308 -138
- sempy_labs/_eventhouses.py +70 -1
- sempy_labs/_gateways.py +56 -8
- sempy_labs/_generate_semantic_model.py +30 -9
- sempy_labs/_helper_functions.py +84 -9
- sempy_labs/_job_scheduler.py +226 -2
- sempy_labs/_list_functions.py +42 -19
- sempy_labs/_ml_experiments.py +1 -1
- sempy_labs/_model_bpa.py +17 -2
- sempy_labs/_model_bpa_rules.py +20 -8
- sempy_labs/_semantic_models.py +117 -0
- sempy_labs/_sql.py +73 -6
- sempy_labs/_sqldatabase.py +227 -0
- sempy_labs/_translations.py +2 -2
- sempy_labs/_vertipaq.py +3 -3
- sempy_labs/_warehouses.py +1 -1
- sempy_labs/admin/__init__.py +49 -8
- sempy_labs/admin/_activities.py +166 -0
- sempy_labs/admin/_apps.py +143 -0
- sempy_labs/admin/_basic_functions.py +32 -652
- sempy_labs/admin/_capacities.py +250 -0
- sempy_labs/admin/_datasets.py +184 -0
- sempy_labs/admin/_domains.py +1 -3
- sempy_labs/admin/_items.py +3 -1
- sempy_labs/admin/_reports.py +165 -0
- sempy_labs/admin/_scanner.py +53 -49
- sempy_labs/admin/_shared.py +74 -0
- sempy_labs/admin/_tenant.py +489 -0
- sempy_labs/directlake/_dl_helper.py +0 -1
- sempy_labs/directlake/_update_directlake_partition_entity.py +6 -0
- sempy_labs/graph/_teams.py +1 -1
- sempy_labs/graph/_users.py +9 -1
- sempy_labs/lakehouse/_get_lakehouse_columns.py +2 -2
- sempy_labs/lakehouse/_get_lakehouse_tables.py +2 -2
- sempy_labs/lakehouse/_lakehouse.py +3 -3
- sempy_labs/lakehouse/_shortcuts.py +29 -16
- sempy_labs/migration/_migrate_calctables_to_lakehouse.py +2 -2
- sempy_labs/migration/_refresh_calc_tables.py +2 -2
- sempy_labs/report/__init__.py +3 -1
- sempy_labs/report/_download_report.py +4 -1
- sempy_labs/report/_export_report.py +272 -0
- sempy_labs/report/_report_functions.py +11 -263
- sempy_labs/report/_report_rebind.py +1 -1
- sempy_labs/tom/_model.py +281 -29
- {semantic_link_labs-0.9.2.dist-info → semantic_link_labs-0.9.4.dist-info}/LICENSE +0 -0
- {semantic_link_labs-0.9.2.dist-info → semantic_link_labs-0.9.4.dist-info}/top_level.txt +0 -0
sempy_labs/_delta_analyzer.py
CHANGED
@@ -1,9 +1,11 @@
 import pandas as pd
-import
-from
+import re
+from datetime import datetime
+import os
+from uuid import UUID
+from typing import Dict, Optional
 import pyarrow.dataset as ds
 import pyarrow.parquet as pq
-from pyspark.sql import SparkSession
 from sempy_labs._helper_functions import (
     create_abfss_path,
     save_as_delta_table,
@@ -12,19 +14,47 @@ from sempy_labs._helper_functions import (
     _update_dataframe_datatypes,
     resolve_workspace_name_and_id,
     resolve_lakehouse_name_and_id,
+    _read_delta_table,
+    _mount,
+    _create_spark_session,
 )
+from sempy._utils._log import log
 from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
 from sempy_labs.lakehouse._lakehouse import lakehouse_attached
 import sempy_labs._icons as icons
+from tqdm.auto import tqdm


+def get_parquet_file_infos(path):
+
+    import notebookutils
+
+    files = []
+    items = notebookutils.fs.ls(path)
+    for item in items:
+        if item.isDir:
+            # Ignore the _delta_log directory
+            if "_delta_log" not in item.path:
+                files.extend(get_parquet_file_infos(item.path))
+        else:
+            # Filter out non-Parquet files and files with size 0
+            if item.path.endswith(".parquet") and item.size > 0:
+                files.append((item.path, item.size))
+    return files
+
+
+@log
 def delta_analyzer(
     table_name: str,
     approx_distinct_count: bool = True,
     export: bool = False,
+    lakehouse: Optional[str | UUID] = None,
+    workspace: Optional[str | UUID] = None,
+    column_stats: bool = True,
+    skip_cardinality: bool = True,
 ) -> Dict[str, pd.DataFrame]:
     """
-    Analyzes a delta table and shows the results in dictionary containing a set of 5 dataframes.
+    Analyzes a delta table and shows the results in dictionary containing a set of 5 dataframes. If 'export' is set to True, the results will be saved to delta tables in the lakehouse attached to the notebook.

     The 5 dataframes returned by this function are:

@@ -44,6 +74,17 @@ def delta_analyzer(
         If True, uses approx_count_distinct to calculate the cardinality of each column. If False, uses COUNT(DISTINCT) instead.
     export : bool, default=False
         If True, exports the resulting dataframes to delta tables in the lakehouse attached to the notebook.
+    lakehouse : str | uuid.UUID, default=None
+        The Fabric lakehouse name or ID.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    column_stats : bool, default=True
+        If True, collects data about column chunks and columns. If False, skips that step and only returns the other 3 dataframes.
+    skip_cardinality : bool, default=True
+        If True, skips the cardinality calculation for each column. If False, calculates the cardinality for each column.

     Returns
     -------
@@ -51,99 +92,139 @@ def delta_analyzer(
         A dictionary of pandas dataframes showing semantic model objects which violated the best practice analyzer rules.
     """

-    if
-
-
-
+    # Must calculate column stats if calculating cardinality
+    if not skip_cardinality:
+        column_stats = True
+
+    # display_toggle = notebookutils.common.configs.pandas_display
+
+    # Turn off notebookutils display
+    # if display_toggle is True:
+    #     notebookutils.common.configs.pandas_display = False

     prefix = "SLL_DeltaAnalyzer_"
-    now = datetime.
-    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=
+    now = datetime.now()
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=workspace)
     (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
-        lakehouse=
+        lakehouse=lakehouse, workspace=workspace
     )
     path = create_abfss_path(lakehouse_id, workspace_id, table_name)
-
+    local_path = _mount(lakehouse=lakehouse, workspace=workspace)
+    table_path = f"{local_path}/Tables/{table_name}"
+    delta_table_path = create_abfss_path(lakehouse_id, workspace_id, table_name)
+
+    # Set back to original value
+    # notebookutils.common.configs.pandas_display = display_toggle

     parquet_file_df_columns = {
-        "
-        "
-        "
+        # "Dataset": "string",
+        "Parquet File": "string",
+        "Row Count": "int",
+        "Row Groups": "int",
+        "Created By": "string",
+        "Total Table Rows": "int",
+        "Total Table Row Groups": "int",
     }
     row_group_df_columns = {
-        "
-        "
-        "
-        "
-        "
-        "
+        # "Dataset": "string",
+        "Parquet File": "string",
+        "Row Group ID": "int",
+        "Row Count": "int",
+        "Compressed Size": "int",
+        "Uncompressed Size": "int",
+        "Compression Ratio": "float",
+        "Total Table Rows": "int",
+        "Ratio Of Total Table Rows": "float",
+        "Total Table Row Groups": "int",
     }
     column_chunk_df_columns = {
-        "
-        "
-        "
-        "
-        "
-        "
-        "
-        "
-        "
+        # "Dataset": "string",
+        "Parquet File": "string",
+        "Column ID": "int",
+        "Column Name": "string",
+        "Column Type": "string",
+        "Compressed Size": "int",
+        "Uncompressed Size": "int",
+        "Has Dict": "bool",
+        "Dict Offset": "int_fillna",
+        "Value Count": "int",
         "Encodings": "string",
+        "Statistics": "string",
+        "Primative Type": "string",
     }

     parquet_file_df = _create_dataframe(columns=parquet_file_df_columns)
     row_group_df = _create_dataframe(columns=row_group_df_columns)
     column_chunk_df = _create_dataframe(columns=column_chunk_df_columns)

-    spark = SparkSession.builder.getOrCreate()
-    # delta_table = DeltaTable.forPath(spark, path)
-    # detail_df = spark.sql(f"DESCRIBE DETAIL `{table_name}`").collect()[0]
-
-    # num_files = detail_df.numFiles
-    # size_in_bytes = detail_df.sizeInBytes
-
-    latest_files = spark.read.format("delta").load(path).inputFiles()
-    file_paths = [f.split("/")[-1] for f in latest_files]
-    row_count = spark.table(table_name).count()
     row_groups = 0
     max_rows_per_row_group = 0
     min_rows_per_row_group = float("inf")
-    # dt = DeltaTable.forPath(spark, path)
-    # schema = dt.toDF().schema
-    # is_vorder = False
-    # if (
-    #     dt.detail()
-    #     .collect()[0]
-    #     .asDict()
-    #     .get("properties")
-    #     .get("delta.parquet.vorder.enabled")
-    #     == "true"
-    # ):
-    #     is_vorder = True

     schema = ds.dataset(table_path).schema.metadata
     is_vorder = any(b"vorder" in key for key in schema.keys())
-
-    #
-
-
-
-
-
+
+    # Get the common details of the Delta table
+    spark = _create_spark_session()
+
+    from delta import DeltaTable
+
+    delta_table = DeltaTable.forPath(spark, delta_table_path)
+    table_df = delta_table.toDF()
+    # total_partition_count = table_df.rdd.getNumPartitions()
+    row_count = table_df.count()
+    table_details = delta_table.detail().collect()[0].asDict()
+    # created_at = table_details.get("createdAt")
+    # last_modified = table_details.get("lastModified")
+    # partition_columns = table_details.get("partitionColumns")
+    # clustering_columns = table_details.get("clusteringColumns")
+    num_latest_files = table_details.get("numFiles", 0)
+    # size_in_bytes = table_details.get("sizeInBytes")
+    # min_reader_version = table_details.get("minReaderVersion")
+    # min_writer_version = table_details.get("minWriterVersion")
+
+    latest_files = _read_delta_table(path).inputFiles()
+    # file_paths = [f.split("/")[-1] for f in latest_files]
+    all_parquet_files = get_parquet_file_infos(delta_table_path)
+    common_file_paths = set(
+        [file_info[0] for file_info in all_parquet_files]
+    ).intersection(set(latest_files))
+    latest_version_files = [
+        file_info
+        for file_info in all_parquet_files
+        if file_info[0] in common_file_paths
+    ]
+
+    for idx, (file_path, file_size) in enumerate(
+        bar := tqdm(latest_version_files), start=1
+    ):
+        file_name = os.path.basename(file_path)
+        bar.set_description(
+            f"Analyzing the '{file_name}' parquet file ({idx}/{num_latest_files})..."
+        )
+
+        relative_path = file_path.split("Tables/")[1]
+        file_system_path = f"{local_path}/Tables/{relative_path}"
+        parquet_file = pq.ParquetFile(file_system_path)
+
         row_groups += parquet_file.num_row_groups

         # Generate rowgroup dataframe
         new_data = {
-            "
-            "
-            "
+            # "Dataset": "Parquet Files",
+            "Parquet File": file_name,
+            "Row Count": parquet_file.metadata.num_rows,
+            "Row Groups": parquet_file.num_row_groups,
+            "Created By": parquet_file.metadata.created_by,
+            "Total Table Rows": -1,
+            "Total Table Row Groups": -1,
         }

         parquet_file_df = pd.concat(
             [parquet_file_df, pd.DataFrame(new_data, index=[0])], ignore_index=True
         )

+        # Loop through the row groups
         for i in range(parquet_file.num_row_groups):
             row_group = parquet_file.metadata.row_group(i)
             num_rows = row_group.num_rows
@@ -154,38 +235,50 @@ def delta_analyzer(
             total_compressed_size = 0
             total_uncompressed_size = 0

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Loop through the columns
+            if column_stats:
+                for j in range(row_group.num_columns):
+                    column_chunk = row_group.column(j)
+                    total_compressed_size += column_chunk.total_compressed_size
+                    total_uncompressed_size += column_chunk.total_uncompressed_size
+
+                    # Generate Column Chunk Dataframe
+                    new_data = {
+                        # "Dataset": "Column Chunks",
+                        "Parquet File": file_name,
+                        "Column ID": j,
+                        "Column Name": column_chunk.path_in_schema,
+                        "Column Type": column_chunk.physical_type,
+                        "Compressed Size": column_chunk.total_compressed_size,
+                        "Uncompressed Size": column_chunk.total_uncompressed_size,
+                        "Has Dict": column_chunk.has_dictionary_page,
+                        "Dict Offset": column_chunk.dictionary_page_offset,
+                        "Value Count": column_chunk.num_values,
+                        "Encodings": str(column_chunk.encodings),
+                        "Statistics": column_chunk.statistics,
+                        "PrimativeType": column_chunk.physical_type,
+                    }
+
+                    column_chunk_df = pd.concat(
+                        [column_chunk_df, pd.DataFrame(new_data, index=[0])],
+                        ignore_index=True,
+                    )

             # Generate rowgroup dataframe
             new_data = {
-                "
-                "
-                "
-                "
-                "
-                "
+                # "Dataset": "Row Groups",
+                "Parquet File": file_name,
+                "Row Group ID": i + 1,
+                "Row Count": num_rows,
+                "Compressed Size": total_compressed_size,
+                "Uncompressed Size": total_uncompressed_size,
+                "Compression Ratio": (
+                    total_compressed_size / total_uncompressed_size
+                    if column_stats
+                    else 0
+                ),
+                "Total Table Rows": -1,
+                "Total Table Row Groups": -1,
             }

             if not row_group_df.empty:
@@ -201,87 +294,114 @@ def delta_analyzer(
     summary_df = pd.DataFrame(
         [
             {
-                "
-                "
-                "
-                "
-                "
-                "
-                "
+                # "Dataset": "Summary",
+                "Row Count": row_count,
+                "Row Groups": row_groups,
+                "Parquet Files": num_latest_files,
+                "Max Rows Per Row Group": max_rows_per_row_group,
+                "Min Rows Per Row Group": min_rows_per_row_group,
+                "Avg Rows Per Row Group": avg_rows_per_row_group,
+                "VOrder Enabled": is_vorder,
                 # "VOrderLevel": v_order_level,
             }
         ]
     )

     # Clean up data types
-    _update_dataframe_datatypes(
-        dataframe=column_chunk_df, column_map=column_chunk_df_columns
-    )
     _update_dataframe_datatypes(dataframe=row_group_df, column_map=row_group_df_columns)
     _update_dataframe_datatypes(
         dataframe=parquet_file_df, column_map=parquet_file_df_columns
     )

     # Generate column dataframe
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if column_stats:
+        _update_dataframe_datatypes(
+            dataframe=column_chunk_df, column_map=column_chunk_df_columns
+        )
+        column_df = column_chunk_df.groupby(
+            ["Column Name", "Column Type"], as_index=False
+        ).agg({"Compressed Size": "sum", "Uncompressed Size": "sum"})
+
+        # Add distinct count to column_df
+        if not skip_cardinality:
+            for ind, r in column_df.iterrows():
+                col_name = r["Column Name"]
+                if approx_distinct_count:
+                    function = "approx"
+                else:
+                    function = "distinctcount"
+                dc = _get_column_aggregate(
+                    table_name=table_name,
+                    column_name=col_name,
+                    function=function,
+                    lakehouse=lakehouse,
+                    workspace=workspace,
+                )

-
-
+                if "Cardinality" not in column_df.columns:
+                    column_df["Cardinality"] = None

-
+                column_df.at[ind, "Cardinality"] = dc

-
-
+    summary_df["Total Size"] = column_df["Compressed Size"].sum()
+
+    parquet_file_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+    parquet_file_df["Total Table Row Groups"] = parquet_file_df["Row Groups"].sum()
+
+    row_group_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+    row_group_df["Total Table Row Groups"] = parquet_file_df["Row Groups"].sum()
+    total_rows = row_group_df["Row Count"].sum()
+    row_group_df["Ratio Of Total Table Rows"] = (
+        row_group_df["Row Count"] / total_rows * 100.0
+    )
+
+    if column_stats:
+        column_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+        column_df["Table Size"] = column_df["Compressed Size"].sum()
+        column_df["Size Percent Of Table"] = (
+            column_df["Compressed Size"] / column_df["Table Size"] * 100.0
+        )
+    if not skip_cardinality and column_stats:
+        column_df["Cardinality"] = column_df["Cardinality"].fillna(0).astype(int)
+        column_df["Cardinality Of Total Rows"] = (
+            column_df["Cardinality"] / column_df["Total Table Rows"] * 100.0
+        )

     dataframes = {
         "Summary": summary_df,
         "Parquet Files": parquet_file_df,
         "Row Groups": row_group_df,
-        "Column Chunks": column_chunk_df,
-        "Columns": column_df,
     }

+    if column_stats:
+        dataframes["Column Chunks"] = column_chunk_df
+        dataframes["Columns"] = column_df
+
     save_table = f"{prefix}Summary"

     if export:
+        if not lakehouse_attached():
+            raise ValueError(
+                f"{icons.red_dot} No lakehouse is attached to this notebook. Please attach a lakehouse to the notebook before running the Delta Analyzer."
+            )
         dfL = get_lakehouse_tables()
         dfL_filt = dfL[dfL["Table Name"] == save_table]
         if dfL_filt.empty:
             runId = 1
         else:
             max_run_id = _get_column_aggregate(
-
+                table_name=save_table,
             )
             runId = max_run_id + 1

     for name, df in dataframes.items():
         name = name.replace(" ", "")
         cols = {
-            "
-            "
-            "
-            "
-            "
+            "Workspace Name": workspace_name,
+            "Workspace Id": workspace_id,
+            "Lakehouse Name": lakehouse_name,
+            "Lakehouse Id": lakehouse_id,
+            "Table Name": table_name,
         }
         for i, (col, param) in enumerate(cols.items()):
             df[col] = param
@@ -291,8 +411,10 @@ def delta_analyzer(
         df["Timestamp"] = pd.to_datetime(df["Timestamp"])

         if export:
-            df["
-            df["
+            df["Run Id"] = runId
+            df["Run Id"] = df["Run Id"].astype(int)
+
+            df.columns = df.columns.str.replace(" ", "")
             save_as_delta_table(
                 dataframe=df,
                 delta_table_name=f"{prefix}{name}",
@@ -301,3 +423,51 @@ def delta_analyzer(
             )

     return dataframes
+
+
+@log
+def get_delta_table_history(
+    table_name: str,
+    lakehouse: Optional[str | UUID] = None,
+    workspace: Optional[str | UUID] = None,
+) -> pd.DataFrame:
+    """
+    Returns the history of a delta table as a pandas dataframe.
+
+    Parameters
+    ----------
+    table_name : str
+        The delta table name.
+    lakehouse : str | uuid.UUID, default=None
+        The Fabric lakehouse name or ID.
+        Defaults to None which resolves to the lakehouse attached to the notebook.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID used by the lakehouse.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A dataframe showing the history of the delta table.
+    """
+
+    def camel_to_title(text):
+        return re.sub(r"([a-z])([A-Z])", r"\1 \2", text).title()
+
+    spark = _create_spark_session()
+
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=workspace)
+    (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
+        lakehouse=lakehouse, workspace=workspace
+    )
+    path = create_abfss_path(lakehouse_id, workspace_id, table_name)
+
+    from delta import DeltaTable
+
+    delta_table = DeltaTable.forPath(spark, path)
+    df = delta_table.history().toPandas()
+
+    df.rename(columns=lambda col: camel_to_title(col), inplace=True)
+
+    return df