semantic-link-labs 0.9.0-py3-none-any.whl → 0.9.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83)
  1. {semantic_link_labs-0.9.0.dist-info → semantic_link_labs-0.9.2.dist-info}/METADATA +68 -7
  2. {semantic_link_labs-0.9.0.dist-info → semantic_link_labs-0.9.2.dist-info}/RECORD +83 -76
  3. sempy_labs/__init__.py +14 -12
  4. sempy_labs/_authentication.py +0 -2
  5. sempy_labs/_capacities.py +120 -142
  6. sempy_labs/_capacity_migration.py +61 -94
  7. sempy_labs/_clear_cache.py +9 -8
  8. sempy_labs/_connections.py +72 -105
  9. sempy_labs/_data_pipelines.py +47 -49
  10. sempy_labs/_dataflows.py +45 -51
  11. sempy_labs/_dax.py +228 -6
  12. sempy_labs/_delta_analyzer.py +303 -0
  13. sempy_labs/_deployment_pipelines.py +72 -66
  14. sempy_labs/_environments.py +39 -36
  15. sempy_labs/_eventhouses.py +35 -35
  16. sempy_labs/_eventstreams.py +38 -39
  17. sempy_labs/_external_data_shares.py +29 -42
  18. sempy_labs/_gateways.py +57 -101
  19. sempy_labs/_generate_semantic_model.py +22 -30
  20. sempy_labs/_git.py +46 -66
  21. sempy_labs/_graphQL.py +95 -0
  22. sempy_labs/_helper_functions.py +175 -30
  23. sempy_labs/_job_scheduler.py +47 -59
  24. sempy_labs/_kql_databases.py +27 -34
  25. sempy_labs/_kql_querysets.py +23 -30
  26. sempy_labs/_list_functions.py +262 -164
  27. sempy_labs/_managed_private_endpoints.py +52 -47
  28. sempy_labs/_mirrored_databases.py +110 -134
  29. sempy_labs/_mirrored_warehouses.py +13 -13
  30. sempy_labs/_ml_experiments.py +36 -36
  31. sempy_labs/_ml_models.py +37 -38
  32. sempy_labs/_model_dependencies.py +2 -0
  33. sempy_labs/_notebooks.py +28 -29
  34. sempy_labs/_one_lake_integration.py +2 -0
  35. sempy_labs/_query_scale_out.py +63 -81
  36. sempy_labs/_refresh_semantic_model.py +12 -14
  37. sempy_labs/_spark.py +54 -79
  38. sempy_labs/_sql.py +7 -11
  39. sempy_labs/_vertipaq.py +8 -3
  40. sempy_labs/_warehouses.py +30 -33
  41. sempy_labs/_workloads.py +15 -20
  42. sempy_labs/_workspace_identity.py +13 -17
  43. sempy_labs/_workspaces.py +49 -48
  44. sempy_labs/admin/__init__.py +2 -0
  45. sempy_labs/admin/_basic_functions.py +244 -281
  46. sempy_labs/admin/_domains.py +188 -103
  47. sempy_labs/admin/_external_data_share.py +26 -31
  48. sempy_labs/admin/_git.py +17 -22
  49. sempy_labs/admin/_items.py +34 -48
  50. sempy_labs/admin/_scanner.py +20 -13
  51. sempy_labs/directlake/_directlake_schema_compare.py +2 -0
  52. sempy_labs/directlake/_dl_helper.py +10 -11
  53. sempy_labs/directlake/_generate_shared_expression.py +4 -5
  54. sempy_labs/directlake/_get_directlake_lakehouse.py +1 -0
  55. sempy_labs/directlake/_list_directlake_model_calc_tables.py +1 -0
  56. sempy_labs/directlake/_show_unsupported_directlake_objects.py +2 -0
  57. sempy_labs/directlake/_warm_cache.py +2 -0
  58. sempy_labs/graph/__init__.py +33 -0
  59. sempy_labs/graph/_groups.py +402 -0
  60. sempy_labs/graph/_teams.py +113 -0
  61. sempy_labs/graph/_users.py +191 -0
  62. sempy_labs/lakehouse/__init__.py +4 -0
  63. sempy_labs/lakehouse/_get_lakehouse_columns.py +10 -10
  64. sempy_labs/lakehouse/_get_lakehouse_tables.py +14 -20
  65. sempy_labs/lakehouse/_lakehouse.py +101 -4
  66. sempy_labs/lakehouse/_shortcuts.py +42 -20
  67. sempy_labs/migration/__init__.py +4 -0
  68. sempy_labs/migration/_direct_lake_to_import.py +66 -0
  69. sempy_labs/migration/_migrate_calctables_to_lakehouse.py +1 -0
  70. sempy_labs/migration/_migrate_calctables_to_semantic_model.py +1 -0
  71. sempy_labs/migration/_migrate_model_objects_to_semantic_model.py +1 -0
  72. sempy_labs/migration/_migrate_tables_columns_to_semantic_model.py +2 -0
  73. sempy_labs/report/_download_report.py +8 -13
  74. sempy_labs/report/_generate_report.py +49 -46
  75. sempy_labs/report/_paginated.py +20 -26
  76. sempy_labs/report/_report_functions.py +50 -45
  77. sempy_labs/report/_report_list_functions.py +2 -0
  78. sempy_labs/report/_report_rebind.py +6 -10
  79. sempy_labs/report/_reportwrapper.py +187 -220
  80. sempy_labs/tom/_model.py +8 -5
  81. {semantic_link_labs-0.9.0.dist-info → semantic_link_labs-0.9.2.dist-info}/LICENSE +0 -0
  82. {semantic_link_labs-0.9.0.dist-info → semantic_link_labs-0.9.2.dist-info}/WHEEL +0 -0
  83. {semantic_link_labs-0.9.0.dist-info → semantic_link_labs-0.9.2.dist-info}/top_level.txt +0 -0
sempy_labs/_dax.py CHANGED
@@ -4,12 +4,16 @@ from sempy_labs._helper_functions import (
      resolve_workspace_name_and_id,
      format_dax_object_name,
      resolve_dataset_name_and_id,
+     _base_api,
+     generate_guid,
  )
  from sempy_labs._model_dependencies import get_model_calc_dependencies
- from typing import Optional, List
+ from typing import Optional, List, Tuple
  from sempy._utils._log import log
  from uuid import UUID
  from sempy_labs.directlake._warm_cache import _put_columns_into_memory
+ import sempy_labs._icons as icons
+ import time


  @log
@@ -47,15 +51,15 @@ def evaluate_dax_impersonation(
      (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
      (dataset_name, dataset_id) = resolve_dataset_name_and_id(dataset, workspace_id)

-     request_body = {
+     payload = {
          "queries": [{"query": dax_query}],
          "impersonatedUserName": user_name,
      }

-     client = fabric.PowerBIRestClient()
-     response = client.post(
-         f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/executeQueries",
-         json=request_body,
+     response = _base_api(
+         request=f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/executeQueries",
+         method="post",
+         payload=payload,
      )
      data = response.json()["results"][0]["tables"]
      column_names = data[0]["rows"][0].keys()
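Note: the refactor above only changes how the request is issued (through the internal _base_api helper); the executeQueries payload is unchanged. A minimal sketch of the same REST call made with the public sempy client that the 0.9.0 code used; the GUIDs, query text, and user name below are placeholders:

import sempy.fabric as fabric

workspace_id = "00000000-0000-0000-0000-000000000000"  # placeholder workspace GUID
dataset_id = "00000000-0000-0000-0000-000000000000"  # placeholder dataset GUID

# Same payload shape the function builds internally
payload = {
    "queries": [{"query": "EVALUATE VALUES('Date'[Year])"}],
    "impersonatedUserName": "user@contoso.com",
}

client = fabric.PowerBIRestClient()
response = client.post(
    f"/v1.0/myorg/groups/{workspace_id}/datasets/{dataset_id}/executeQueries",
    json=payload,
)
rows = response.json()["results"][0]["tables"][0]["rows"]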
@@ -100,6 +104,8 @@ def get_dax_query_dependencies(
      (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
      (dataset_name, dataset_id) = resolve_dataset_name_and_id(dataset, workspace_id)

+     fabric.refresh_tom_cache(workspace=workspace)
+
      if isinstance(dax_string, str):
          dax_string = [dax_string]

@@ -257,3 +263,219 @@ def get_dax_query_memory_size(
      )

      return df["Total Size"].sum()
+
+
+ @log
+ def _dax_perf_test(
+     dataset: str,
+     dax_queries: dict,
+     clear_cache_before_run: bool = False,
+     refresh_type: Optional[str] = None,
+     rest_time: int = 2,
+     workspace: Optional[str] = None,
+ ) -> Tuple[pd.DataFrame, dict]:
+     """
+     Runs a performance test on a set of DAX queries.
+
+     Parameters
+     ----------
+     dataset : str
+         Name of the semantic model.
+     dax_queries : dict
+         The dax queries to run in a dictionary format. Here is an example:
+         {
+             "Sales Amount Test", """ """ EVALUATE SUMMARIZECOLUMNS("Sales Amount", [Sales Amount]) """ """,
+             "Order Quantity with Product", """ """ EVALUATE SUMMARIZECOLUMNS('Product'[Color], "Order Qty", [Order Qty]) """ """,
+         }
+     clear_cache_before_run : bool, default=False
+     refresh_type : str, default=None
+     rest_time : int, default=2
+         Rest time (in seconds) between the execution of each DAX query.
+     workspace : str, default=None
+         The Fabric workspace name.
+         Defaults to None which resolves to the workspace of the attached lakehouse
+         or if no lakehouse attached, resolves to the workspace of the notebook.
+
+     Returns
+     -------
+     Tuple[pandas.DataFrame, dict]
+         A pandas dataframe showing the SQL profiler trace results of the DAX queries.
+         A dictionary of the query results in pandas dataframes.
+     """
+     from sempy_labs._refresh_semantic_model import refresh_semantic_model
+     from sempy_labs._clear_cache import clear_cache
+
+     event_schema = {
+         "QueryBegin": [
+             "EventClass",
+             "EventSubclass",
+             "CurrentTime",
+             "NTUserName",
+             "TextData",
+             "StartTime",
+             "ApplicationName",
+         ],
+         "QueryEnd": [
+             "EventClass",
+             "EventSubclass",
+             "CurrentTime",
+             "NTUserName",
+             "TextData",
+             "StartTime",
+             "EndTime",
+             "Duration",
+             "CpuTime",
+             "Success",
+             "ApplicationName",
+         ],
+         "VertiPaqSEQueryBegin": [
+             "EventClass",
+             "EventSubclass",
+             "CurrentTime",
+             "NTUserName",
+             "TextData",
+             "StartTime",
+         ],
+         "VertiPaqSEQueryEnd": [
+             "EventClass",
+             "EventSubclass",
+             "CurrentTime",
+             "NTUserName",
+             "TextData",
+             "StartTime",
+             "EndTime",
+             "Duration",
+             "CpuTime",
+             "Success",
+         ],
+         "VertiPaqSEQueryCacheMatch": [
+             "EventClass",
+             "EventSubclass",
+             "CurrentTime",
+             "NTUserName",
+             "TextData",
+         ],
+     }
+
+     # Add Execution Metrics
+     event_schema["ExecutionMetrics"] = ["EventClass", "ApplicationName", "TextData"]
+     # Add DAX Query Plan
+     # event_schema["DAXQueryPlan"] = ["EventClass", "EventSubclass", "CurrentTime", "StartTime", "EndTime", "Duration", "CpuTime", "ApplicationName", "TextData"]
+
+     query_results = {}
+
+     # Establish trace connection
+     with fabric.create_trace_connection(
+         dataset=dataset, workspace=workspace
+     ) as trace_connection:
+         with trace_connection.create_trace(event_schema) as trace:
+             trace.start()
+             print(f"{icons.in_progress} Starting performance testing...")
+             # Loop through DAX queries
+             for name, dax in dax_queries.items():
+
+                 if clear_cache_before_run:
+                     clear_cache(dataset=dataset, workspace=workspace)
+                 if refresh_type is not None:
+                     refresh_semantic_model(
+                         dataset=dataset, workspace=workspace, refresh_type=refresh_type
+                     )
+
+                 # EVALUATE {1} is used to initate a warm cache
+                 fabric.evaluate_dax(
+                     dataset=dataset, workspace=workspace, dax_string="""EVALUATE {1}"""
+                 )
+                 # Run DAX Query
+                 result = fabric.evaluate_dax(
+                     dataset=dataset, workspace=workspace, dax_string=dax
+                 )
+
+                 # Add results to output
+                 query_results[name] = result
+
+                 time.sleep(rest_time)
+                 print(f"{icons.green_dot} The '{name}' query has completed.")
+
+             df = trace.stop()
+             # Allow time to collect trace results
+             time.sleep(5)
+
+             # Step 1: Filter out unnecessary operations
+             query_names = list(dax_queries.keys())
+             df = df[
+                 ~df["Application Name"].isin(["PowerBI", "PowerBIEIM"])
+                 & (~df["Text Data"].str.startswith("EVALUATE {1}"))
+             ]
+             query_begin = df["Event Class"] == "QueryBegin"
+             temp_column_name = "QueryName_INT"
+             df = df.copy()
+             df[temp_column_name] = query_begin.cumsum()
+             df[temp_column_name] = (
+                 df[temp_column_name]
+                 .where(query_begin, None)  # Assign None to non-query begin rows
+                 .ffill()  # Forward fill None values
+                 .astype("Int64")  # Use pandas nullable integer type for numeric indices
+             )
+
+             df.loc[df[temp_column_name].notna(), "Query Name"] = (
+                 df[temp_column_name]
+                 .dropna()
+                 .astype(int)
+                 .map(lambda x: query_names[x - 1])
+             )
+             df = df[df[temp_column_name] != None]
+             df = df.drop(columns=[temp_column_name])
+
+             query_to_guid = {
+                 name: generate_guid() for name in df["Query Name"].unique()
+             }
+             df["Query ID"] = df["Query Name"].map(query_to_guid)
+
+             df = df.reset_index(drop=True)
+
+     return df, query_results
+
+
+ def _dax_perf_test_bulk(
+     mapping: dict,
+     clear_cache_before_run: bool = False,
+     refresh_type: Optional[str] = None,
+     rest_time: int = 2,
+ ):
+     """
+     mapping is something like this:
+
+     mapping = {
+         "Workspace1": {
+             "Dataset1": {
+                 "Query1": "EVALUATE ...",
+                 "Query2": "EVALUATE ...",
+             },
+             "Dataset2": {
+                 "Query3": "EVALUATE ...",
+                 "Query4": "EVALUATE ...",
+             }
+         },
+         "Workspace2": {
+             "Dataset3": {
+                 "Query5": "EVALUATE ...",
+                 "Query6": "EVALUATE ...",
+             },
+             "Dataset4": {
+                 "Query7": "EVALUATE ...",
+                 "Query8": "EVALUATE ...",
+             }
+         }
+     }
+     """
+
+     for workspace, datasets in mapping.items():
+         for dataset, queries in datasets.items():
+             _dax_perf_test(
+                 dataset=dataset,
+                 dax_queries=queries,
+                 clear_cache_before_run=clear_cache_before_run,
+                 refresh_type=refresh_type,
+                 rest_time=rest_time,
+                 workspace=workspace,
+             )
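The two helpers above are private (underscore-prefixed) and may change without notice. A minimal usage sketch based on the signatures shown in this diff; the workspace, model, table, and query names are placeholders:

from sempy_labs._dax import _dax_perf_test, _dax_perf_test_bulk

queries = {
    "Total Rows": """ EVALUATE ROW("Rows", COUNTROWS('Sales')) """,
    "Rows by Color": """ EVALUATE SUMMARIZECOLUMNS('Product'[Color], "Rows", COUNTROWS('Sales')) """,
}

# Single model: returns the trace dataframe and a dict of query result dataframes
trace_df, results = _dax_perf_test(
    dataset="AdventureWorks",      # placeholder semantic model name
    dax_queries=queries,
    clear_cache_before_run=True,   # clear the cache before each query
    rest_time=2,                   # seconds to wait between queries
    workspace=None,                # None resolves to the notebook's workspace
)

# Bulk variant: nested {workspace: {dataset: {query name: DAX}}} mapping
_dax_perf_test_bulk({"My Workspace": {"AdventureWorks": queries}})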
sempy_labs/_delta_analyzer.py ADDED
@@ -0,0 +1,303 @@
+ import pandas as pd
+ import datetime
+ from typing import Dict
+ import pyarrow.dataset as ds
+ import pyarrow.parquet as pq
+ from pyspark.sql import SparkSession
+ from sempy_labs._helper_functions import (
+     create_abfss_path,
+     save_as_delta_table,
+     _get_column_aggregate,
+     _create_dataframe,
+     _update_dataframe_datatypes,
+     resolve_workspace_name_and_id,
+     resolve_lakehouse_name_and_id,
+ )
+ from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
+ from sempy_labs.lakehouse._lakehouse import lakehouse_attached
+ import sempy_labs._icons as icons
+
+
+ def delta_analyzer(
+     table_name: str,
+     approx_distinct_count: bool = True,
+     export: bool = False,
+ ) -> Dict[str, pd.DataFrame]:
+     """
+     Analyzes a delta table and shows the results in dictionary containing a set of 5 dataframes. The table being analyzed must be in the lakehouse attached to the notebook.
+
+     The 5 dataframes returned by this function are:
+
+     * Summary
+     * Parquet Files
+     * Row Groups
+     * Column Chunks
+     * Columns
+
+     Read more about Delta Analyzer `here <https://github.com/microsoft/Analysis-Services/tree/master/DeltaAnalyzer>`_.
+
+     Parameters
+     ----------
+     table_name : str
+         The delta table name.
+     approx_distinct_count: bool, default=True
+         If True, uses approx_count_distinct to calculate the cardinality of each column. If False, uses COUNT(DISTINCT) instead.
+     export : bool, default=False
+         If True, exports the resulting dataframes to delta tables in the lakehouse attached to the notebook.
+
+     Returns
+     -------
+     Dict[str, pandas.DataFrame]
+         A dictionary of pandas dataframes showing semantic model objects which violated the best practice analyzer rules.
+     """
+
+     if not lakehouse_attached():
+         raise ValueError(
+             f"{icons.red_dot} No lakehouse is attached to this notebook. Please attach a lakehouse to the notebook before running the Delta Analyzer."
+         )
+
+     prefix = "SLL_DeltaAnalyzer_"
+     now = datetime.datetime.now()
+     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=None)
+     (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
+         lakehouse=None, workspace=None
+     )
+     path = create_abfss_path(lakehouse_id, workspace_id, table_name)
+     table_path = f"/lakehouse/default/Tables/{table_name}"
+
+     parquet_file_df_columns = {
+         "ParquetFile": "string",
+         "RowCount": "int",
+         "RowGroups": "int",
+     }
+     row_group_df_columns = {
+         "ParquetFile": "string",
+         "RowGroupID": "int",
+         "RowCount": "int",
+         "CompressedSize": "int",
+         "UncompressedSize": "int",
+         "CompressionRatio": "float",
+     }
+     column_chunk_df_columns = {
+         "ParquetFile": "string",
+         "ColumnID": "int",
+         "ColumnName": "string",
+         "ColumnType": "string",
+         "CompressedSize": "int",
+         "UncompressedSize": "int",
+         "HasDict": "bool",
+         "DictOffset": "int_fillna",
+         "ValueCount": "int",
+         "Encodings": "string",
+     }
+
+     parquet_file_df = _create_dataframe(columns=parquet_file_df_columns)
+     row_group_df = _create_dataframe(columns=row_group_df_columns)
+     column_chunk_df = _create_dataframe(columns=column_chunk_df_columns)
+
+     spark = SparkSession.builder.getOrCreate()
+     # delta_table = DeltaTable.forPath(spark, path)
+     # detail_df = spark.sql(f"DESCRIBE DETAIL `{table_name}`").collect()[0]
+
+     # num_files = detail_df.numFiles
+     # size_in_bytes = detail_df.sizeInBytes
+
+     latest_files = spark.read.format("delta").load(path).inputFiles()
+     file_paths = [f.split("/")[-1] for f in latest_files]
+     row_count = spark.table(table_name).count()
+     row_groups = 0
+     max_rows_per_row_group = 0
+     min_rows_per_row_group = float("inf")
+     # dt = DeltaTable.forPath(spark, path)
+     # schema = dt.toDF().schema
+     # is_vorder = False
+     # if (
+     #     dt.detail()
+     #     .collect()[0]
+     #     .asDict()
+     #     .get("properties")
+     #     .get("delta.parquet.vorder.enabled")
+     #     == "true"
+     # ):
+     #     is_vorder = True
+
+     schema = ds.dataset(table_path).schema.metadata
+     is_vorder = any(b"vorder" in key for key in schema.keys())
+     # v_order_level = (
+     #     int(schema.get(b"com.microsoft.parquet.vorder.level").decode("utf-8"))
+     #     if is_vorder
+     #     else None
+     # )
+
+     for file_name in file_paths:
+         parquet_file = pq.ParquetFile(f"{table_path}/{file_name}")
+         row_groups += parquet_file.num_row_groups
+
+         # Generate rowgroup dataframe
+         new_data = {
+             "ParquetFile": file_name,
+             "RowCount": parquet_file.metadata.num_rows,
+             "RowGroups": parquet_file.num_row_groups,
+         }
+
+         parquet_file_df = pd.concat(
+             [parquet_file_df, pd.DataFrame(new_data, index=[0])], ignore_index=True
+         )
+
+         for i in range(parquet_file.num_row_groups):
+             row_group = parquet_file.metadata.row_group(i)
+             num_rows = row_group.num_rows
+
+             max_rows_per_row_group = max(max_rows_per_row_group, num_rows)
+             min_rows_per_row_group = min(min_rows_per_row_group, num_rows)
+
+             total_compressed_size = 0
+             total_uncompressed_size = 0
+
+             for j in range(row_group.num_columns):
+                 column_chunk = row_group.column(j)
+                 total_compressed_size += column_chunk.total_compressed_size
+                 total_uncompressed_size += column_chunk.total_uncompressed_size
+
+                 # Generate Column Chunk Dataframe
+                 new_data = {
+                     "ParquetFile": file_name,
+                     "ColumnID": j,
+                     "ColumnName": column_chunk.path_in_schema,
+                     "ColumnType": column_chunk.physical_type,
+                     "CompressedSize": column_chunk.total_compressed_size,
+                     "UncompressedSize": column_chunk.total_uncompressed_size,
+                     "HasDict": column_chunk.has_dictionary_page,
+                     "DictOffset": column_chunk.dictionary_page_offset,
+                     "ValueCount": column_chunk.num_values,
+                     "Encodings": str(column_chunk.encodings),
+                 }
+
+                 column_chunk_df = pd.concat(
+                     [column_chunk_df, pd.DataFrame(new_data, index=[0])],
+                     ignore_index=True,
+                 )
+
+             # Generate rowgroup dataframe
+             new_data = {
+                 "ParquetFile": file_name,
+                 "RowGroupID": i + 1,
+                 "RowCount": num_rows,
+                 "CompressedSize": total_compressed_size,
+                 "UncompressedSize": total_uncompressed_size,
+                 "CompressionRatio": total_compressed_size / total_uncompressed_size,
+             }
+
+             if not row_group_df.empty:
+                 row_group_df = pd.concat(
+                     [row_group_df, pd.DataFrame(new_data, index=[0])], ignore_index=True
+                 )
+             else:
+                 row_group_df = pd.DataFrame(new_data, index=[0])
+
+     avg_rows_per_row_group = row_count / row_groups
+
+     # Generate summary dataframe
+     summary_df = pd.DataFrame(
+         [
+             {
+                 "RowCount": row_count,
+                 "RowGroups": row_groups,
+                 "ParquetFiles": len(file_paths),
+                 "MaxRowsPerRowGroup": max_rows_per_row_group,
+                 "MinRowsPerRowGroup": min_rows_per_row_group,
+                 "AvgRowsPerRowGroup": avg_rows_per_row_group,
+                 "VOrderEnabled": is_vorder,
+                 # "VOrderLevel": v_order_level,
+             }
+         ]
+     )
+
+     # Clean up data types
+     _update_dataframe_datatypes(
+         dataframe=column_chunk_df, column_map=column_chunk_df_columns
+     )
+     _update_dataframe_datatypes(dataframe=row_group_df, column_map=row_group_df_columns)
+     _update_dataframe_datatypes(
+         dataframe=parquet_file_df, column_map=parquet_file_df_columns
+     )
+
+     # Generate column dataframe
+     column_df = column_chunk_df.groupby(
+         ["ColumnName", "ColumnType"], as_index=False
+     ).agg({"CompressedSize": "sum", "UncompressedSize": "sum"})
+
+     # Add distinct count to column_df
+     for ind, r in column_df.iterrows():
+         col_name = r["ColumnName"]
+         if approx_distinct_count:
+             dc = _get_column_aggregate(
+                 table_name=table_name,
+                 column_name=col_name,
+                 function="approx",
+                 lakehouse=lakehouse_name,
+             )
+         else:
+             dc = _get_column_aggregate(
+                 table_name=table_name,
+                 column_name=col_name,
+                 function="distinctcount",
+                 lakehouse=lakehouse_name,
+             )
+
+         if "Cardinality" not in column_df.columns:
+             column_df["Cardinality"] = None
+
+         column_df.at[ind, "Cardinality"] = dc
+
+     column_df["Cardinality"] = column_df["Cardinality"].astype(int)
+     summary_df["TotalSize"] = column_df["CompressedSize"].sum()
+
+     dataframes = {
+         "Summary": summary_df,
+         "Parquet Files": parquet_file_df,
+         "Row Groups": row_group_df,
+         "Column Chunks": column_chunk_df,
+         "Columns": column_df,
+     }
+
+     save_table = f"{prefix}Summary"
+
+     if export:
+         dfL = get_lakehouse_tables()
+         dfL_filt = dfL[dfL["Table Name"] == save_table]
+         if dfL_filt.empty:
+             runId = 1
+         else:
+             max_run_id = _get_column_aggregate(
+                 lakehouse=lakehouse_name, table_name=save_table
+             )
+             runId = max_run_id + 1
+
+     for name, df in dataframes.items():
+         name = name.replace(" ", "")
+         cols = {
+             "WorkspaceName": workspace_name,
+             "WorkspaceId": workspace_id,
+             "LakehouseName": lakehouse_name,
+             "LakehouseId": lakehouse_id,
+             "TableName": table_name,
+         }
+         for i, (col, param) in enumerate(cols.items()):
+             df[col] = param
+             df.insert(i, col, df.pop(col))
+
+         df["Timestamp"] = now
+         df["Timestamp"] = pd.to_datetime(df["Timestamp"])
+
+         if export:
+             df["RunId"] = runId
+             df["RunId"] = df["RunId"].astype(int)
+             save_as_delta_table(
+                 dataframe=df,
+                 delta_table_name=f"{prefix}{name}",
+                 write_mode="append",
+                 merge_schema=True,
+             )
+
+     return dataframes
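delta_analyzer must be run from a Fabric notebook with a lakehouse attached. A minimal usage sketch based on the signature above; "my_table" is a placeholder for a delta table in the attached lakehouse:

from sempy_labs._delta_analyzer import delta_analyzer

analysis = delta_analyzer(
    table_name="my_table",        # placeholder delta table name
    approx_distinct_count=True,   # use approx_count_distinct for column cardinality
    export=False,                 # True appends results to SLL_DeltaAnalyzer_* tables
)

summary = analysis["Summary"]          # row count, row groups, parquet files, V-Order flag
row_groups = analysis["Row Groups"]    # per-row-group sizes and compression ratio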