semantic-link-labs 0.9.0-py3-none-any.whl → 0.9.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {semantic_link_labs-0.9.0.dist-info → semantic_link_labs-0.9.2.dist-info}/METADATA +68 -7
- {semantic_link_labs-0.9.0.dist-info → semantic_link_labs-0.9.2.dist-info}/RECORD +83 -76
- sempy_labs/__init__.py +14 -12
- sempy_labs/_authentication.py +0 -2
- sempy_labs/_capacities.py +120 -142
- sempy_labs/_capacity_migration.py +61 -94
- sempy_labs/_clear_cache.py +9 -8
- sempy_labs/_connections.py +72 -105
- sempy_labs/_data_pipelines.py +47 -49
- sempy_labs/_dataflows.py +45 -51
- sempy_labs/_dax.py +228 -6
- sempy_labs/_delta_analyzer.py +303 -0
- sempy_labs/_deployment_pipelines.py +72 -66
- sempy_labs/_environments.py +39 -36
- sempy_labs/_eventhouses.py +35 -35
- sempy_labs/_eventstreams.py +38 -39
- sempy_labs/_external_data_shares.py +29 -42
- sempy_labs/_gateways.py +57 -101
- sempy_labs/_generate_semantic_model.py +22 -30
- sempy_labs/_git.py +46 -66
- sempy_labs/_graphQL.py +95 -0
- sempy_labs/_helper_functions.py +175 -30
- sempy_labs/_job_scheduler.py +47 -59
- sempy_labs/_kql_databases.py +27 -34
- sempy_labs/_kql_querysets.py +23 -30
- sempy_labs/_list_functions.py +262 -164
- sempy_labs/_managed_private_endpoints.py +52 -47
- sempy_labs/_mirrored_databases.py +110 -134
- sempy_labs/_mirrored_warehouses.py +13 -13
- sempy_labs/_ml_experiments.py +36 -36
- sempy_labs/_ml_models.py +37 -38
- sempy_labs/_model_dependencies.py +2 -0
- sempy_labs/_notebooks.py +28 -29
- sempy_labs/_one_lake_integration.py +2 -0
- sempy_labs/_query_scale_out.py +63 -81
- sempy_labs/_refresh_semantic_model.py +12 -14
- sempy_labs/_spark.py +54 -79
- sempy_labs/_sql.py +7 -11
- sempy_labs/_vertipaq.py +8 -3
- sempy_labs/_warehouses.py +30 -33
- sempy_labs/_workloads.py +15 -20
- sempy_labs/_workspace_identity.py +13 -17
- sempy_labs/_workspaces.py +49 -48
- sempy_labs/admin/__init__.py +2 -0
- sempy_labs/admin/_basic_functions.py +244 -281
- sempy_labs/admin/_domains.py +188 -103
- sempy_labs/admin/_external_data_share.py +26 -31
- sempy_labs/admin/_git.py +17 -22
- sempy_labs/admin/_items.py +34 -48
- sempy_labs/admin/_scanner.py +20 -13
- sempy_labs/directlake/_directlake_schema_compare.py +2 -0
- sempy_labs/directlake/_dl_helper.py +10 -11
- sempy_labs/directlake/_generate_shared_expression.py +4 -5
- sempy_labs/directlake/_get_directlake_lakehouse.py +1 -0
- sempy_labs/directlake/_list_directlake_model_calc_tables.py +1 -0
- sempy_labs/directlake/_show_unsupported_directlake_objects.py +2 -0
- sempy_labs/directlake/_warm_cache.py +2 -0
- sempy_labs/graph/__init__.py +33 -0
- sempy_labs/graph/_groups.py +402 -0
- sempy_labs/graph/_teams.py +113 -0
- sempy_labs/graph/_users.py +191 -0
- sempy_labs/lakehouse/__init__.py +4 -0
- sempy_labs/lakehouse/_get_lakehouse_columns.py +10 -10
- sempy_labs/lakehouse/_get_lakehouse_tables.py +14 -20
- sempy_labs/lakehouse/_lakehouse.py +101 -4
- sempy_labs/lakehouse/_shortcuts.py +42 -20
- sempy_labs/migration/__init__.py +4 -0
- sempy_labs/migration/_direct_lake_to_import.py +66 -0
- sempy_labs/migration/_migrate_calctables_to_lakehouse.py +1 -0
- sempy_labs/migration/_migrate_calctables_to_semantic_model.py +1 -0
- sempy_labs/migration/_migrate_model_objects_to_semantic_model.py +1 -0
- sempy_labs/migration/_migrate_tables_columns_to_semantic_model.py +2 -0
- sempy_labs/report/_download_report.py +8 -13
- sempy_labs/report/_generate_report.py +49 -46
- sempy_labs/report/_paginated.py +20 -26
- sempy_labs/report/_report_functions.py +50 -45
- sempy_labs/report/_report_list_functions.py +2 -0
- sempy_labs/report/_report_rebind.py +6 -10
- sempy_labs/report/_reportwrapper.py +187 -220
- sempy_labs/tom/_model.py +8 -5
- {semantic_link_labs-0.9.0.dist-info → semantic_link_labs-0.9.2.dist-info}/LICENSE +0 -0
- {semantic_link_labs-0.9.0.dist-info → semantic_link_labs-0.9.2.dist-info}/WHEEL +0 -0
- {semantic_link_labs-0.9.0.dist-info → semantic_link_labs-0.9.2.dist-info}/top_level.txt +0 -0
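
The dist-info entries above are metadata-only. To reproduce this comparison locally, the two wheels can be installed from PyPI with pip; the commands below are ordinary pip usage and are not part of the diff itself.

pip install semantic-link-labs==0.9.0             # old version shown on the left
pip install --upgrade semantic-link-labs==0.9.2   # new version shown on the right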
sempy_labs/_dax.py
CHANGED
@@ -4,12 +4,16 @@ from sempy_labs._helper_functions import (
     resolve_workspace_name_and_id,
     format_dax_object_name,
     resolve_dataset_name_and_id,
+    _base_api,
+    generate_guid,
 )
 from sempy_labs._model_dependencies import get_model_calc_dependencies
-from typing import Optional, List
+from typing import Optional, List, Tuple
 from sempy._utils._log import log
 from uuid import UUID
 from sempy_labs.directlake._warm_cache import _put_columns_into_memory
+import sempy_labs._icons as icons
+import time


 @log
@@ -47,15 +51,15 @@ def evaluate_dax_impersonation(
     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
     (dataset_name, dataset_id) = resolve_dataset_name_and_id(dataset, workspace_id)

-
+    payload = {
         "queries": [{"query": dax_query}],
         "impersonatedUserName": user_name,
     }

-
-
-
-
+    response = _base_api(
+        request=f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/executeQueries",
+        method="post",
+        payload=payload,
     )
     data = response.json()["results"][0]["tables"]
     column_names = data[0]["rows"][0].keys()
@@ -100,6 +104,8 @@ def get_dax_query_dependencies(
     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
     (dataset_name, dataset_id) = resolve_dataset_name_and_id(dataset, workspace_id)

+    fabric.refresh_tom_cache(workspace=workspace)
+
     if isinstance(dax_string, str):
         dax_string = [dax_string]

@@ -257,3 +263,219 @@ def get_dax_query_memory_size(
     )

     return df["Total Size"].sum()
+
+
+@log
+def _dax_perf_test(
+    dataset: str,
+    dax_queries: dict,
+    clear_cache_before_run: bool = False,
+    refresh_type: Optional[str] = None,
+    rest_time: int = 2,
+    workspace: Optional[str] = None,
+) -> Tuple[pd.DataFrame, dict]:
+    """
+    Runs a performance test on a set of DAX queries.
+
+    Parameters
+    ----------
+    dataset : str
+        Name of the semantic model.
+    dax_queries : dict
+        The dax queries to run in a dictionary format. Here is an example:
+        {
+            "Sales Amount Test", """ """ EVALUATE SUMMARIZECOLUMNS("Sales Amount", [Sales Amount]) """ """,
+            "Order Quantity with Product", """ """ EVALUATE SUMMARIZECOLUMNS('Product'[Color], "Order Qty", [Order Qty]) """ """,
+        }
+    clear_cache_before_run : bool, default=False
+    refresh_type : str, default=None
+    rest_time : int, default=2
+        Rest time (in seconds) between the execution of each DAX query.
+    workspace : str, default=None
+        The Fabric workspace name.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+
+    Returns
+    -------
+    Tuple[pandas.DataFrame, dict]
+        A pandas dataframe showing the SQL profiler trace results of the DAX queries.
+        A dictionary of the query results in pandas dataframes.
+    """
+    from sempy_labs._refresh_semantic_model import refresh_semantic_model
+    from sempy_labs._clear_cache import clear_cache
+
+    event_schema = {
+        "QueryBegin": [
+            "EventClass",
+            "EventSubclass",
+            "CurrentTime",
+            "NTUserName",
+            "TextData",
+            "StartTime",
+            "ApplicationName",
+        ],
+        "QueryEnd": [
+            "EventClass",
+            "EventSubclass",
+            "CurrentTime",
+            "NTUserName",
+            "TextData",
+            "StartTime",
+            "EndTime",
+            "Duration",
+            "CpuTime",
+            "Success",
+            "ApplicationName",
+        ],
+        "VertiPaqSEQueryBegin": [
+            "EventClass",
+            "EventSubclass",
+            "CurrentTime",
+            "NTUserName",
+            "TextData",
+            "StartTime",
+        ],
+        "VertiPaqSEQueryEnd": [
+            "EventClass",
+            "EventSubclass",
+            "CurrentTime",
+            "NTUserName",
+            "TextData",
+            "StartTime",
+            "EndTime",
+            "Duration",
+            "CpuTime",
+            "Success",
+        ],
+        "VertiPaqSEQueryCacheMatch": [
+            "EventClass",
+            "EventSubclass",
+            "CurrentTime",
+            "NTUserName",
+            "TextData",
+        ],
+    }
+
+    # Add Execution Metrics
+    event_schema["ExecutionMetrics"] = ["EventClass", "ApplicationName", "TextData"]
+    # Add DAX Query Plan
+    # event_schema["DAXQueryPlan"] = ["EventClass", "EventSubclass", "CurrentTime", "StartTime", "EndTime", "Duration", "CpuTime", "ApplicationName", "TextData"]
+
+    query_results = {}
+
+    # Establish trace connection
+    with fabric.create_trace_connection(
+        dataset=dataset, workspace=workspace
+    ) as trace_connection:
+        with trace_connection.create_trace(event_schema) as trace:
+            trace.start()
+            print(f"{icons.in_progress} Starting performance testing...")
+            # Loop through DAX queries
+            for name, dax in dax_queries.items():
+
+                if clear_cache_before_run:
+                    clear_cache(dataset=dataset, workspace=workspace)
+                if refresh_type is not None:
+                    refresh_semantic_model(
+                        dataset=dataset, workspace=workspace, refresh_type=refresh_type
+                    )
+
+                # EVALUATE {1} is used to initate a warm cache
+                fabric.evaluate_dax(
+                    dataset=dataset, workspace=workspace, dax_string="""EVALUATE {1}"""
+                )
+                # Run DAX Query
+                result = fabric.evaluate_dax(
+                    dataset=dataset, workspace=workspace, dax_string=dax
+                )
+
+                # Add results to output
+                query_results[name] = result
+
+                time.sleep(rest_time)
+                print(f"{icons.green_dot} The '{name}' query has completed.")
+
+            df = trace.stop()
+            # Allow time to collect trace results
+            time.sleep(5)
+
+            # Step 1: Filter out unnecessary operations
+            query_names = list(dax_queries.keys())
+            df = df[
+                ~df["Application Name"].isin(["PowerBI", "PowerBIEIM"])
+                & (~df["Text Data"].str.startswith("EVALUATE {1}"))
+            ]
+            query_begin = df["Event Class"] == "QueryBegin"
+            temp_column_name = "QueryName_INT"
+            df = df.copy()
+            df[temp_column_name] = query_begin.cumsum()
+            df[temp_column_name] = (
+                df[temp_column_name]
+                .where(query_begin, None)  # Assign None to non-query begin rows
+                .ffill()  # Forward fill None values
+                .astype("Int64")  # Use pandas nullable integer type for numeric indices
+            )
+
+            df.loc[df[temp_column_name].notna(), "Query Name"] = (
+                df[temp_column_name]
+                .dropna()
+                .astype(int)
+                .map(lambda x: query_names[x - 1])
+            )
+            df = df[df[temp_column_name] != None]
+            df = df.drop(columns=[temp_column_name])
+
+            query_to_guid = {
+                name: generate_guid() for name in df["Query Name"].unique()
+            }
+            df["Query ID"] = df["Query Name"].map(query_to_guid)
+
+            df = df.reset_index(drop=True)
+
+            return df, query_results
+
+
+def _dax_perf_test_bulk(
+    mapping: dict,
+    clear_cache_before_run: bool = False,
+    refresh_type: Optional[str] = None,
+    rest_time: int = 2,
+):
+    """
+    mapping is something like this:
+
+    mapping = {
+        "Workspace1": {
+            "Dataset1": {
+                "Query1": "EVALUATE ...",
+                "Query2": "EVALUATE ...",
+            },
+            "Dataset2": {
+                "Query3": "EVALUATE ...",
+                "Query4": "EVALUATE ...",
+            }
+        },
+        "Workspace2": {
+            "Dataset3": {
+                "Query5": "EVALUATE ...",
+                "Query6": "EVALUATE ...",
+            },
+            "Dataset4": {
+                "Query7": "EVALUATE ...",
+                "Query8": "EVALUATE ...",
+            }
+        }
+    }
+    """
+
+    for workspace, datasets in mapping.items():
+        for dataset, queries in datasets.items():
+            _dax_perf_test(
+                dataset=dataset,
+                dax_queries=queries,
+                clear_cache_before_run=clear_cache_before_run,
+                refresh_type=refresh_type,
+                rest_time=rest_time,
+                workspace=workspace,
+            )
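
To make the new helper concrete, here is a minimal usage sketch of the private _dax_perf_test function added above, assuming a Fabric notebook session. The model name, workspace name, and DAX measures are hypothetical placeholders, and since the function is underscore-prefixed it is not part of the documented public API.

# Minimal sketch (not from the package docs); names below are placeholders.
from sempy_labs._dax import _dax_perf_test

queries = {
    "Sales Amount Test": """EVALUATE SUMMARIZECOLUMNS("Sales Amount", [Sales Amount])""",
    "Order Qty by Color": """EVALUATE SUMMARIZECOLUMNS('Product'[Color], "Order Qty", [Order Qty])""",
}

# Returns the profiler-trace dataframe and a dict of per-query result dataframes,
# matching the Tuple[pd.DataFrame, dict] return annotation in the diff above.
trace_df, results = _dax_perf_test(
    dataset="AdventureWorks",      # hypothetical semantic model name
    dax_queries=queries,
    clear_cache_before_run=True,   # clear the model cache before each query
    rest_time=2,                   # seconds to pause between queries
    workspace="My Workspace",      # hypothetical workspace name
)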

sempy_labs/_delta_analyzer.py
ADDED
@@ -0,0 +1,303 @@
+import pandas as pd
+import datetime
+from typing import Dict
+import pyarrow.dataset as ds
+import pyarrow.parquet as pq
+from pyspark.sql import SparkSession
+from sempy_labs._helper_functions import (
+    create_abfss_path,
+    save_as_delta_table,
+    _get_column_aggregate,
+    _create_dataframe,
+    _update_dataframe_datatypes,
+    resolve_workspace_name_and_id,
+    resolve_lakehouse_name_and_id,
+)
+from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
+from sempy_labs.lakehouse._lakehouse import lakehouse_attached
+import sempy_labs._icons as icons
+
+
+def delta_analyzer(
+    table_name: str,
+    approx_distinct_count: bool = True,
+    export: bool = False,
+) -> Dict[str, pd.DataFrame]:
+    """
+    Analyzes a delta table and shows the results in dictionary containing a set of 5 dataframes. The table being analyzed must be in the lakehouse attached to the notebook.
+
+    The 5 dataframes returned by this function are:
+
+    * Summary
+    * Parquet Files
+    * Row Groups
+    * Column Chunks
+    * Columns
+
+    Read more about Delta Analyzer `here <https://github.com/microsoft/Analysis-Services/tree/master/DeltaAnalyzer>`_.
+
+    Parameters
+    ----------
+    table_name : str
+        The delta table name.
+    approx_distinct_count: bool, default=True
+        If True, uses approx_count_distinct to calculate the cardinality of each column. If False, uses COUNT(DISTINCT) instead.
+    export : bool, default=False
+        If True, exports the resulting dataframes to delta tables in the lakehouse attached to the notebook.
+
+    Returns
+    -------
+    Dict[str, pandas.DataFrame]
+        A dictionary of pandas dataframes showing semantic model objects which violated the best practice analyzer rules.
+    """
+
+    if not lakehouse_attached():
+        raise ValueError(
+            f"{icons.red_dot} No lakehouse is attached to this notebook. Please attach a lakehouse to the notebook before running the Delta Analyzer."
+        )
+
+    prefix = "SLL_DeltaAnalyzer_"
+    now = datetime.datetime.now()
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=None)
+    (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
+        lakehouse=None, workspace=None
+    )
+    path = create_abfss_path(lakehouse_id, workspace_id, table_name)
+    table_path = f"/lakehouse/default/Tables/{table_name}"
+
+    parquet_file_df_columns = {
+        "ParquetFile": "string",
+        "RowCount": "int",
+        "RowGroups": "int",
+    }
+    row_group_df_columns = {
+        "ParquetFile": "string",
+        "RowGroupID": "int",
+        "RowCount": "int",
+        "CompressedSize": "int",
+        "UncompressedSize": "int",
+        "CompressionRatio": "float",
+    }
+    column_chunk_df_columns = {
+        "ParquetFile": "string",
+        "ColumnID": "int",
+        "ColumnName": "string",
+        "ColumnType": "string",
+        "CompressedSize": "int",
+        "UncompressedSize": "int",
+        "HasDict": "bool",
+        "DictOffset": "int_fillna",
+        "ValueCount": "int",
+        "Encodings": "string",
+    }
+
+    parquet_file_df = _create_dataframe(columns=parquet_file_df_columns)
+    row_group_df = _create_dataframe(columns=row_group_df_columns)
+    column_chunk_df = _create_dataframe(columns=column_chunk_df_columns)
+
+    spark = SparkSession.builder.getOrCreate()
+    # delta_table = DeltaTable.forPath(spark, path)
+    # detail_df = spark.sql(f"DESCRIBE DETAIL `{table_name}`").collect()[0]
+
+    # num_files = detail_df.numFiles
+    # size_in_bytes = detail_df.sizeInBytes
+
+    latest_files = spark.read.format("delta").load(path).inputFiles()
+    file_paths = [f.split("/")[-1] for f in latest_files]
+    row_count = spark.table(table_name).count()
+    row_groups = 0
+    max_rows_per_row_group = 0
+    min_rows_per_row_group = float("inf")
+    # dt = DeltaTable.forPath(spark, path)
+    # schema = dt.toDF().schema
+    # is_vorder = False
+    # if (
+    #     dt.detail()
+    #     .collect()[0]
+    #     .asDict()
+    #     .get("properties")
+    #     .get("delta.parquet.vorder.enabled")
+    #     == "true"
+    # ):
+    #     is_vorder = True
+
+    schema = ds.dataset(table_path).schema.metadata
+    is_vorder = any(b"vorder" in key for key in schema.keys())
+    # v_order_level = (
+    #     int(schema.get(b"com.microsoft.parquet.vorder.level").decode("utf-8"))
+    #     if is_vorder
+    #     else None
+    # )
+
+    for file_name in file_paths:
+        parquet_file = pq.ParquetFile(f"{table_path}/{file_name}")
+        row_groups += parquet_file.num_row_groups
+
+        # Generate rowgroup dataframe
+        new_data = {
+            "ParquetFile": file_name,
+            "RowCount": parquet_file.metadata.num_rows,
+            "RowGroups": parquet_file.num_row_groups,
+        }
+
+        parquet_file_df = pd.concat(
+            [parquet_file_df, pd.DataFrame(new_data, index=[0])], ignore_index=True
+        )
+
+        for i in range(parquet_file.num_row_groups):
+            row_group = parquet_file.metadata.row_group(i)
+            num_rows = row_group.num_rows
+
+            max_rows_per_row_group = max(max_rows_per_row_group, num_rows)
+            min_rows_per_row_group = min(min_rows_per_row_group, num_rows)
+
+            total_compressed_size = 0
+            total_uncompressed_size = 0
+
+            for j in range(row_group.num_columns):
+                column_chunk = row_group.column(j)
+                total_compressed_size += column_chunk.total_compressed_size
+                total_uncompressed_size += column_chunk.total_uncompressed_size
+
+                # Generate Column Chunk Dataframe
+                new_data = {
+                    "ParquetFile": file_name,
+                    "ColumnID": j,
+                    "ColumnName": column_chunk.path_in_schema,
+                    "ColumnType": column_chunk.physical_type,
+                    "CompressedSize": column_chunk.total_compressed_size,
+                    "UncompressedSize": column_chunk.total_uncompressed_size,
+                    "HasDict": column_chunk.has_dictionary_page,
+                    "DictOffset": column_chunk.dictionary_page_offset,
+                    "ValueCount": column_chunk.num_values,
+                    "Encodings": str(column_chunk.encodings),
+                }
+
+                column_chunk_df = pd.concat(
+                    [column_chunk_df, pd.DataFrame(new_data, index=[0])],
+                    ignore_index=True,
+                )
+
+            # Generate rowgroup dataframe
+            new_data = {
+                "ParquetFile": file_name,
+                "RowGroupID": i + 1,
+                "RowCount": num_rows,
+                "CompressedSize": total_compressed_size,
+                "UncompressedSize": total_uncompressed_size,
+                "CompressionRatio": total_compressed_size / total_uncompressed_size,
+            }
+
+            if not row_group_df.empty:
+                row_group_df = pd.concat(
+                    [row_group_df, pd.DataFrame(new_data, index=[0])], ignore_index=True
+                )
+            else:
+                row_group_df = pd.DataFrame(new_data, index=[0])
+
+    avg_rows_per_row_group = row_count / row_groups
+
+    # Generate summary dataframe
+    summary_df = pd.DataFrame(
+        [
+            {
+                "RowCount": row_count,
+                "RowGroups": row_groups,
+                "ParquetFiles": len(file_paths),
+                "MaxRowsPerRowGroup": max_rows_per_row_group,
+                "MinRowsPerRowGroup": min_rows_per_row_group,
+                "AvgRowsPerRowGroup": avg_rows_per_row_group,
+                "VOrderEnabled": is_vorder,
+                # "VOrderLevel": v_order_level,
+            }
+        ]
+    )
+
+    # Clean up data types
+    _update_dataframe_datatypes(
+        dataframe=column_chunk_df, column_map=column_chunk_df_columns
+    )
+    _update_dataframe_datatypes(dataframe=row_group_df, column_map=row_group_df_columns)
+    _update_dataframe_datatypes(
+        dataframe=parquet_file_df, column_map=parquet_file_df_columns
+    )
+
+    # Generate column dataframe
+    column_df = column_chunk_df.groupby(
+        ["ColumnName", "ColumnType"], as_index=False
+    ).agg({"CompressedSize": "sum", "UncompressedSize": "sum"})
+
+    # Add distinct count to column_df
+    for ind, r in column_df.iterrows():
+        col_name = r["ColumnName"]
+        if approx_distinct_count:
+            dc = _get_column_aggregate(
+                table_name=table_name,
+                column_name=col_name,
+                function="approx",
+                lakehouse=lakehouse_name,
+            )
+        else:
+            dc = _get_column_aggregate(
+                table_name=table_name,
+                column_name=col_name,
+                function="distinctcount",
+                lakehouse=lakehouse_name,
+            )
+
+        if "Cardinality" not in column_df.columns:
+            column_df["Cardinality"] = None
+
+        column_df.at[ind, "Cardinality"] = dc
+
+    column_df["Cardinality"] = column_df["Cardinality"].astype(int)
+    summary_df["TotalSize"] = column_df["CompressedSize"].sum()
+
+    dataframes = {
+        "Summary": summary_df,
+        "Parquet Files": parquet_file_df,
+        "Row Groups": row_group_df,
+        "Column Chunks": column_chunk_df,
+        "Columns": column_df,
+    }
+
+    save_table = f"{prefix}Summary"
+
+    if export:
+        dfL = get_lakehouse_tables()
+        dfL_filt = dfL[dfL["Table Name"] == save_table]
+        if dfL_filt.empty:
+            runId = 1
+        else:
+            max_run_id = _get_column_aggregate(
+                lakehouse=lakehouse_name, table_name=save_table
+            )
+            runId = max_run_id + 1
+
+    for name, df in dataframes.items():
+        name = name.replace(" ", "")
+        cols = {
+            "WorkspaceName": workspace_name,
+            "WorkspaceId": workspace_id,
+            "LakehouseName": lakehouse_name,
+            "LakehouseId": lakehouse_id,
+            "TableName": table_name,
+        }
+        for i, (col, param) in enumerate(cols.items()):
+            df[col] = param
+            df.insert(i, col, df.pop(col))
+
+        df["Timestamp"] = now
+        df["Timestamp"] = pd.to_datetime(df["Timestamp"])
+
+        if export:
+            df["RunId"] = runId
+            df["RunId"] = df["RunId"].astype(int)
+            save_as_delta_table(
+                dataframe=df,
+                delta_table_name=f"{prefix}{name}",
+                write_mode="append",
+                merge_schema=True,
+            )
+
+    return dataframes
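
As a usage sketch of the new module above (assuming a Fabric notebook with a default lakehouse attached; the table name is a hypothetical placeholder), delta_analyzer returns the five dataframes described in its docstring and can optionally persist them:

# Minimal sketch; "FactInternetSales" stands in for a delta table in the attached lakehouse.
from sempy_labs._delta_analyzer import delta_analyzer

analysis = delta_analyzer(
    table_name="FactInternetSales",
    approx_distinct_count=True,   # approximate per-column cardinality (faster than COUNT DISTINCT)
    export=False,                 # True would append results to SLL_DeltaAnalyzer_* delta tables
)

# Keys: "Summary", "Parquet Files", "Row Groups", "Column Chunks", "Columns"
for name, df in analysis.items():
    print(name, df.shape)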