semantic-link-labs 0.12.8 (semantic_link_labs-0.12.8-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (243)
  1. semantic_link_labs-0.12.8.dist-info/METADATA +354 -0
  2. semantic_link_labs-0.12.8.dist-info/RECORD +243 -0
  3. semantic_link_labs-0.12.8.dist-info/WHEEL +5 -0
  4. semantic_link_labs-0.12.8.dist-info/licenses/LICENSE +21 -0
  5. semantic_link_labs-0.12.8.dist-info/top_level.txt +1 -0
  6. sempy_labs/__init__.py +606 -0
  7. sempy_labs/_a_lib_info.py +2 -0
  8. sempy_labs/_ai.py +437 -0
  9. sempy_labs/_authentication.py +264 -0
  10. sempy_labs/_bpa_translation/_model/_translations_am-ET.po +869 -0
  11. sempy_labs/_bpa_translation/_model/_translations_ar-AE.po +908 -0
  12. sempy_labs/_bpa_translation/_model/_translations_bg-BG.po +968 -0
  13. sempy_labs/_bpa_translation/_model/_translations_ca-ES.po +963 -0
  14. sempy_labs/_bpa_translation/_model/_translations_cs-CZ.po +943 -0
  15. sempy_labs/_bpa_translation/_model/_translations_da-DK.po +945 -0
  16. sempy_labs/_bpa_translation/_model/_translations_de-DE.po +988 -0
  17. sempy_labs/_bpa_translation/_model/_translations_el-GR.po +993 -0
  18. sempy_labs/_bpa_translation/_model/_translations_es-ES.po +971 -0
  19. sempy_labs/_bpa_translation/_model/_translations_fa-IR.po +933 -0
  20. sempy_labs/_bpa_translation/_model/_translations_fi-FI.po +942 -0
  21. sempy_labs/_bpa_translation/_model/_translations_fr-FR.po +994 -0
  22. sempy_labs/_bpa_translation/_model/_translations_ga-IE.po +967 -0
  23. sempy_labs/_bpa_translation/_model/_translations_he-IL.po +902 -0
  24. sempy_labs/_bpa_translation/_model/_translations_hi-IN.po +944 -0
  25. sempy_labs/_bpa_translation/_model/_translations_hu-HU.po +963 -0
  26. sempy_labs/_bpa_translation/_model/_translations_id-ID.po +946 -0
  27. sempy_labs/_bpa_translation/_model/_translations_is-IS.po +939 -0
  28. sempy_labs/_bpa_translation/_model/_translations_it-IT.po +986 -0
  29. sempy_labs/_bpa_translation/_model/_translations_ja-JP.po +846 -0
  30. sempy_labs/_bpa_translation/_model/_translations_ko-KR.po +839 -0
  31. sempy_labs/_bpa_translation/_model/_translations_mt-MT.po +967 -0
  32. sempy_labs/_bpa_translation/_model/_translations_nl-NL.po +978 -0
  33. sempy_labs/_bpa_translation/_model/_translations_pl-PL.po +962 -0
  34. sempy_labs/_bpa_translation/_model/_translations_pt-BR.po +962 -0
  35. sempy_labs/_bpa_translation/_model/_translations_pt-PT.po +957 -0
  36. sempy_labs/_bpa_translation/_model/_translations_ro-RO.po +968 -0
  37. sempy_labs/_bpa_translation/_model/_translations_ru-RU.po +964 -0
  38. sempy_labs/_bpa_translation/_model/_translations_sk-SK.po +952 -0
  39. sempy_labs/_bpa_translation/_model/_translations_sl-SL.po +950 -0
  40. sempy_labs/_bpa_translation/_model/_translations_sv-SE.po +942 -0
  41. sempy_labs/_bpa_translation/_model/_translations_ta-IN.po +976 -0
  42. sempy_labs/_bpa_translation/_model/_translations_te-IN.po +947 -0
  43. sempy_labs/_bpa_translation/_model/_translations_th-TH.po +924 -0
  44. sempy_labs/_bpa_translation/_model/_translations_tr-TR.po +953 -0
  45. sempy_labs/_bpa_translation/_model/_translations_uk-UA.po +961 -0
  46. sempy_labs/_bpa_translation/_model/_translations_zh-CN.po +804 -0
  47. sempy_labs/_bpa_translation/_model/_translations_zu-ZA.po +969 -0
  48. sempy_labs/_capacities.py +1198 -0
  49. sempy_labs/_capacity_migration.py +660 -0
  50. sempy_labs/_clear_cache.py +351 -0
  51. sempy_labs/_connections.py +610 -0
  52. sempy_labs/_dashboards.py +69 -0
  53. sempy_labs/_data_access_security.py +98 -0
  54. sempy_labs/_data_pipelines.py +162 -0
  55. sempy_labs/_dataflows.py +668 -0
  56. sempy_labs/_dax.py +501 -0
  57. sempy_labs/_daxformatter.py +80 -0
  58. sempy_labs/_delta_analyzer.py +467 -0
  59. sempy_labs/_delta_analyzer_history.py +301 -0
  60. sempy_labs/_dictionary_diffs.py +221 -0
  61. sempy_labs/_documentation.py +147 -0
  62. sempy_labs/_domains.py +51 -0
  63. sempy_labs/_eventhouses.py +182 -0
  64. sempy_labs/_external_data_shares.py +230 -0
  65. sempy_labs/_gateways.py +521 -0
  66. sempy_labs/_generate_semantic_model.py +521 -0
  67. sempy_labs/_get_connection_string.py +84 -0
  68. sempy_labs/_git.py +543 -0
  69. sempy_labs/_graphQL.py +90 -0
  70. sempy_labs/_helper_functions.py +2833 -0
  71. sempy_labs/_icons.py +149 -0
  72. sempy_labs/_job_scheduler.py +609 -0
  73. sempy_labs/_kql_databases.py +149 -0
  74. sempy_labs/_kql_querysets.py +124 -0
  75. sempy_labs/_kusto.py +137 -0
  76. sempy_labs/_labels.py +124 -0
  77. sempy_labs/_list_functions.py +1720 -0
  78. sempy_labs/_managed_private_endpoints.py +253 -0
  79. sempy_labs/_mirrored_databases.py +416 -0
  80. sempy_labs/_mirrored_warehouses.py +60 -0
  81. sempy_labs/_ml_experiments.py +113 -0
  82. sempy_labs/_model_auto_build.py +140 -0
  83. sempy_labs/_model_bpa.py +557 -0
  84. sempy_labs/_model_bpa_bulk.py +378 -0
  85. sempy_labs/_model_bpa_rules.py +859 -0
  86. sempy_labs/_model_dependencies.py +343 -0
  87. sempy_labs/_mounted_data_factories.py +123 -0
  88. sempy_labs/_notebooks.py +441 -0
  89. sempy_labs/_one_lake_integration.py +151 -0
  90. sempy_labs/_onelake.py +131 -0
  91. sempy_labs/_query_scale_out.py +433 -0
  92. sempy_labs/_refresh_semantic_model.py +435 -0
  93. sempy_labs/_semantic_models.py +468 -0
  94. sempy_labs/_spark.py +455 -0
  95. sempy_labs/_sql.py +241 -0
  96. sempy_labs/_sql_audit_settings.py +207 -0
  97. sempy_labs/_sql_endpoints.py +214 -0
  98. sempy_labs/_tags.py +201 -0
  99. sempy_labs/_translations.py +43 -0
  100. sempy_labs/_user_delegation_key.py +44 -0
  101. sempy_labs/_utils.py +79 -0
  102. sempy_labs/_vertipaq.py +1021 -0
  103. sempy_labs/_vpax.py +388 -0
  104. sempy_labs/_warehouses.py +234 -0
  105. sempy_labs/_workloads.py +140 -0
  106. sempy_labs/_workspace_identity.py +72 -0
  107. sempy_labs/_workspaces.py +595 -0
  108. sempy_labs/admin/__init__.py +170 -0
  109. sempy_labs/admin/_activities.py +167 -0
  110. sempy_labs/admin/_apps.py +145 -0
  111. sempy_labs/admin/_artifacts.py +65 -0
  112. sempy_labs/admin/_basic_functions.py +463 -0
  113. sempy_labs/admin/_capacities.py +508 -0
  114. sempy_labs/admin/_dataflows.py +45 -0
  115. sempy_labs/admin/_datasets.py +186 -0
  116. sempy_labs/admin/_domains.py +522 -0
  117. sempy_labs/admin/_external_data_share.py +100 -0
  118. sempy_labs/admin/_git.py +72 -0
  119. sempy_labs/admin/_items.py +265 -0
  120. sempy_labs/admin/_labels.py +211 -0
  121. sempy_labs/admin/_reports.py +241 -0
  122. sempy_labs/admin/_scanner.py +118 -0
  123. sempy_labs/admin/_shared.py +82 -0
  124. sempy_labs/admin/_sharing_links.py +110 -0
  125. sempy_labs/admin/_tags.py +131 -0
  126. sempy_labs/admin/_tenant.py +503 -0
  127. sempy_labs/admin/_tenant_keys.py +89 -0
  128. sempy_labs/admin/_users.py +140 -0
  129. sempy_labs/admin/_workspaces.py +236 -0
  130. sempy_labs/deployment_pipeline/__init__.py +23 -0
  131. sempy_labs/deployment_pipeline/_items.py +580 -0
  132. sempy_labs/directlake/__init__.py +57 -0
  133. sempy_labs/directlake/_autosync.py +58 -0
  134. sempy_labs/directlake/_directlake_schema_compare.py +120 -0
  135. sempy_labs/directlake/_directlake_schema_sync.py +161 -0
  136. sempy_labs/directlake/_dl_helper.py +274 -0
  137. sempy_labs/directlake/_generate_shared_expression.py +94 -0
  138. sempy_labs/directlake/_get_directlake_lakehouse.py +62 -0
  139. sempy_labs/directlake/_get_shared_expression.py +34 -0
  140. sempy_labs/directlake/_guardrails.py +96 -0
  141. sempy_labs/directlake/_list_directlake_model_calc_tables.py +70 -0
  142. sempy_labs/directlake/_show_unsupported_directlake_objects.py +90 -0
  143. sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py +239 -0
  144. sempy_labs/directlake/_update_directlake_partition_entity.py +259 -0
  145. sempy_labs/directlake/_warm_cache.py +236 -0
  146. sempy_labs/dotnet_lib/dotnet.runtime.config.json +10 -0
  147. sempy_labs/environment/__init__.py +23 -0
  148. sempy_labs/environment/_items.py +212 -0
  149. sempy_labs/environment/_pubstage.py +223 -0
  150. sempy_labs/eventstream/__init__.py +37 -0
  151. sempy_labs/eventstream/_items.py +263 -0
  152. sempy_labs/eventstream/_topology.py +652 -0
  153. sempy_labs/graph/__init__.py +59 -0
  154. sempy_labs/graph/_groups.py +651 -0
  155. sempy_labs/graph/_sensitivity_labels.py +120 -0
  156. sempy_labs/graph/_teams.py +125 -0
  157. sempy_labs/graph/_user_licenses.py +96 -0
  158. sempy_labs/graph/_users.py +516 -0
  159. sempy_labs/graph_model/__init__.py +15 -0
  160. sempy_labs/graph_model/_background_jobs.py +63 -0
  161. sempy_labs/graph_model/_items.py +149 -0
  162. sempy_labs/lakehouse/__init__.py +67 -0
  163. sempy_labs/lakehouse/_blobs.py +247 -0
  164. sempy_labs/lakehouse/_get_lakehouse_columns.py +102 -0
  165. sempy_labs/lakehouse/_get_lakehouse_tables.py +274 -0
  166. sempy_labs/lakehouse/_helper.py +250 -0
  167. sempy_labs/lakehouse/_lakehouse.py +351 -0
  168. sempy_labs/lakehouse/_livy_sessions.py +143 -0
  169. sempy_labs/lakehouse/_materialized_lake_views.py +157 -0
  170. sempy_labs/lakehouse/_partitioning.py +165 -0
  171. sempy_labs/lakehouse/_schemas.py +217 -0
  172. sempy_labs/lakehouse/_shortcuts.py +440 -0
  173. sempy_labs/migration/__init__.py +35 -0
  174. sempy_labs/migration/_create_pqt_file.py +238 -0
  175. sempy_labs/migration/_direct_lake_to_import.py +105 -0
  176. sempy_labs/migration/_migrate_calctables_to_lakehouse.py +398 -0
  177. sempy_labs/migration/_migrate_calctables_to_semantic_model.py +148 -0
  178. sempy_labs/migration/_migrate_model_objects_to_semantic_model.py +533 -0
  179. sempy_labs/migration/_migrate_tables_columns_to_semantic_model.py +172 -0
  180. sempy_labs/migration/_migration_validation.py +71 -0
  181. sempy_labs/migration/_refresh_calc_tables.py +131 -0
  182. sempy_labs/mirrored_azure_databricks_catalog/__init__.py +15 -0
  183. sempy_labs/mirrored_azure_databricks_catalog/_discover.py +213 -0
  184. sempy_labs/mirrored_azure_databricks_catalog/_refresh_catalog_metadata.py +45 -0
  185. sempy_labs/ml_model/__init__.py +23 -0
  186. sempy_labs/ml_model/_functions.py +427 -0
  187. sempy_labs/report/_BPAReportTemplate.json +232 -0
  188. sempy_labs/report/__init__.py +55 -0
  189. sempy_labs/report/_bpareporttemplate/.pbi/localSettings.json +9 -0
  190. sempy_labs/report/_bpareporttemplate/.platform +11 -0
  191. sempy_labs/report/_bpareporttemplate/StaticResources/SharedResources/BaseThemes/CY24SU06.json +710 -0
  192. sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/page.json +11 -0
  193. sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/1b08bce3bebabb0a27a8/visual.json +191 -0
  194. sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/2f22ddb70c301693c165/visual.json +438 -0
  195. sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/3b1182230aa6c600b43a/visual.json +127 -0
  196. sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/58577ba6380c69891500/visual.json +576 -0
  197. sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/a2a8fa5028b3b776c96c/visual.json +207 -0
  198. sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/adfd47ef30652707b987/visual.json +506 -0
  199. sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/b6a80ee459e716e170b1/visual.json +127 -0
  200. sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/ce3130a721c020cc3d81/visual.json +513 -0
  201. sempy_labs/report/_bpareporttemplate/definition/pages/92735ae19b31712208ad/page.json +8 -0
  202. sempy_labs/report/_bpareporttemplate/definition/pages/92735ae19b31712208ad/visuals/66e60dfb526437cd78d1/visual.json +112 -0
  203. sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/page.json +11 -0
  204. sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/07deb8bce824e1be37d7/visual.json +513 -0
  205. sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/0b1c68838818b32ad03b/visual.json +352 -0
  206. sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/0c171de9d2683d10b930/visual.json +37 -0
  207. sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/0efa01be0510e40a645e/visual.json +542 -0
  208. sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/6bf2f0eb830ab53cc668/visual.json +221 -0
  209. sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/88d8141cb8500b60030c/visual.json +127 -0
  210. sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/a753273590beed656a03/visual.json +576 -0
  211. sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/b8fdc82cddd61ac447bc/visual.json +127 -0
  212. sempy_labs/report/_bpareporttemplate/definition/pages/d37dce724a0ccc30044b/page.json +9 -0
  213. sempy_labs/report/_bpareporttemplate/definition/pages/d37dce724a0ccc30044b/visuals/ce8532a7e25020271077/visual.json +38 -0
  214. sempy_labs/report/_bpareporttemplate/definition/pages/pages.json +10 -0
  215. sempy_labs/report/_bpareporttemplate/definition/report.json +176 -0
  216. sempy_labs/report/_bpareporttemplate/definition/version.json +4 -0
  217. sempy_labs/report/_bpareporttemplate/definition.pbir +14 -0
  218. sempy_labs/report/_download_report.py +76 -0
  219. sempy_labs/report/_export_report.py +257 -0
  220. sempy_labs/report/_generate_report.py +427 -0
  221. sempy_labs/report/_paginated.py +76 -0
  222. sempy_labs/report/_report_bpa.py +354 -0
  223. sempy_labs/report/_report_bpa_rules.py +115 -0
  224. sempy_labs/report/_report_functions.py +581 -0
  225. sempy_labs/report/_report_helper.py +227 -0
  226. sempy_labs/report/_report_list_functions.py +110 -0
  227. sempy_labs/report/_report_rebind.py +149 -0
  228. sempy_labs/report/_reportwrapper.py +3100 -0
  229. sempy_labs/report/_save_report.py +147 -0
  230. sempy_labs/snowflake_database/__init__.py +10 -0
  231. sempy_labs/snowflake_database/_items.py +105 -0
  232. sempy_labs/sql_database/__init__.py +21 -0
  233. sempy_labs/sql_database/_items.py +201 -0
  234. sempy_labs/sql_database/_mirroring.py +79 -0
  235. sempy_labs/theme/__init__.py +12 -0
  236. sempy_labs/theme/_org_themes.py +129 -0
  237. sempy_labs/tom/__init__.py +3 -0
  238. sempy_labs/tom/_model.py +5977 -0
  239. sempy_labs/variable_library/__init__.py +19 -0
  240. sempy_labs/variable_library/_functions.py +403 -0
  241. sempy_labs/warehouse/__init__.py +28 -0
  242. sempy_labs/warehouse/_items.py +234 -0
  243. sempy_labs/warehouse/_restore_points.py +309 -0
sempy_labs/_delta_analyzer.py
@@ -0,0 +1,467 @@
+ import pandas as pd
+ import re
+ from datetime import datetime
+ import os
+ from uuid import UUID
+ from typing import Dict, Optional
+ import pyarrow.parquet as pq
+ from sempy_labs._helper_functions import (
+     create_abfss_path,
+     save_as_delta_table,
+     _get_column_aggregate,
+     _create_dataframe,
+     _update_dataframe_datatypes,
+     resolve_workspace_name_and_id,
+     resolve_lakehouse_name_and_id,
+     _read_delta_table,
+     _mount,
+     _read_delta_table_history,
+     resolve_workspace_id,
+     resolve_lakehouse_id,
+     _get_delta_table,
+ )
+ from sempy._utils._log import log
+ from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
+ from sempy_labs.lakehouse._lakehouse import (
+     lakehouse_attached,
+ )
+ from sempy_labs.lakehouse._helper import (
+     is_v_ordered,
+ )
+ import sempy_labs._icons as icons
+ from tqdm.auto import tqdm
+
+
+ @log
+ def get_parquet_file_infos(path):
+
+     import notebookutils
+
+     files = []
+     items = notebookutils.fs.ls(path)
+     for item in items:
+         if item.isDir:
+             # Ignore the _delta_log directory
+             if "_delta_log" not in item.path:
+                 files.extend(get_parquet_file_infos(item.path))
+         else:
+             # Filter out non-Parquet files and files with size 0
+             if item.path.endswith(".parquet") and item.size > 0:
+                 files.append((item.path, item.size))
+     return files
+
+
+ @log
+ def delta_analyzer(
+     table_name: str,
+     approx_distinct_count: bool = True,
+     export: bool = False,
+     lakehouse: Optional[str | UUID] = None,
+     workspace: Optional[str | UUID] = None,
+     column_stats: bool = True,
+     skip_cardinality: bool = True,
+     schema: Optional[str] = None,
+ ) -> Dict[str, pd.DataFrame]:
+     """
+     Analyzes a delta table and shows the results in a dictionary containing a set of 5 dataframes. If 'export' is set to True, the results will be saved to delta tables in the lakehouse attached to the notebook.
+
+     The 5 dataframes returned by this function are:
+
+     * Summary
+     * Parquet Files
+     * Row Groups
+     * Column Chunks
+     * Columns
+
+     Read more about Delta Analyzer `here <https://github.com/microsoft/Analysis-Services/tree/master/DeltaAnalyzer>`_.
+
+     Parameters
+     ----------
+     table_name : str
+         The delta table name.
+     approx_distinct_count : bool, default=True
+         If True, uses approx_count_distinct to calculate the cardinality of each column. If False, uses COUNT(DISTINCT) instead.
+     export : bool, default=False
+         If True, exports the resulting dataframes to delta tables in the lakehouse attached to the notebook.
+     lakehouse : str | uuid.UUID, default=None
+         The Fabric lakehouse name or ID.
+         Defaults to None which resolves to the lakehouse attached to the notebook.
+     workspace : str | uuid.UUID, default=None
+         The Fabric workspace name or ID used by the lakehouse.
+         Defaults to None which resolves to the workspace of the attached lakehouse
+         or if no lakehouse attached, resolves to the workspace of the notebook.
+     column_stats : bool, default=True
+         If True, collects data about column chunks and columns. If False, skips that step and only returns the other 3 dataframes.
+     skip_cardinality : bool, default=True
+         If True, skips the cardinality calculation for each column. If False, calculates the cardinality for each column.
+     schema : str, default=None
+         The name of the schema to which the table belongs (for schema-enabled lakehouses). If None, the default schema is used.
+
+     Returns
+     -------
+     Dict[str, pandas.DataFrame]
+         A dictionary of pandas dataframes containing the Delta Analyzer results for the table.
+     """
+
+     # Must calculate column stats if calculating cardinality
+     if not skip_cardinality:
+         column_stats = True
+
+     prefix = "SLL_DeltaAnalyzer_"
+     now = datetime.now()
+     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace=workspace)
+     (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
+         lakehouse=lakehouse, workspace=workspace
+     )
+
+     delta_table_path = create_abfss_path(
+         lakehouse_id, workspace_id, table_name, schema=schema
+     )
+     local_path = _mount(lakehouse=lakehouse, workspace=workspace)
+
+     parquet_file_df_columns = {
+         # "Dataset": "string",
+         "Parquet File": "string",
+         "Row Count": "int",
+         "Row Groups": "int",
+         "Created By": "string",
+         "Total Table Rows": "int",
+         "Total Table Row Groups": "int",
+     }
+     row_group_df_columns = {
+         # "Dataset": "string",
+         "Parquet File": "string",
+         "Row Group ID": "int",
+         "Row Count": "int",
+         "Compressed Size": "int",
+         "Uncompressed Size": "int",
+         "Compression Ratio": "float",
+         "Total Table Rows": "int",
+         "Ratio Of Total Table Rows": "float",
+         "Total Table Row Groups": "int",
+     }
+     column_chunk_df_columns = {
+         # "Dataset": "string",
+         "Parquet File": "string",
+         "Column ID": "int",
+         "Column Name": "string",
+         "Column Type": "string",
+         "Compressed Size": "int",
+         "Uncompressed Size": "int",
+         "Has Dict": "bool",
+         "Dict Offset": "int_fillna",
+         "Value Count": "int",
+         "Encodings": "string",
+         "Statistics": "string",
+         "Primative Type": "string",
+     }
+
+     parquet_file_df = _create_dataframe(columns=parquet_file_df_columns)
+     row_group_df = _create_dataframe(columns=row_group_df_columns)
+     column_chunk_df = _create_dataframe(columns=column_chunk_df_columns)
+
+     row_groups = 0
+     max_rows_per_row_group = 0
+     min_rows_per_row_group = float("inf")
+
+     is_vorder = is_v_ordered(
+         table_name=table_name, lakehouse=lakehouse, workspace=workspace, schema=schema
+     )
+
+     # Get the common details of the Delta table
+     delta_table = _get_delta_table(delta_table_path)
+     table_df = delta_table.toDF()
+     # total_partition_count = table_df.rdd.getNumPartitions()
+     row_count = table_df.count()
+     table_details = delta_table.detail().collect()[0].asDict()
+     # created_at = table_details.get("createdAt")
+     # last_modified = table_details.get("lastModified")
+     # partition_columns = table_details.get("partitionColumns")
+     # clustering_columns = table_details.get("clusteringColumns")
+     num_latest_files = table_details.get("numFiles", 0)
+     # size_in_bytes = table_details.get("sizeInBytes")
+     # min_reader_version = table_details.get("minReaderVersion")
+     # min_writer_version = table_details.get("minWriterVersion")
+
+     latest_files = _read_delta_table(delta_table_path).inputFiles()
+     # file_paths = [f.split("/")[-1] for f in latest_files]
+     all_parquet_files = get_parquet_file_infos(delta_table_path)
+     common_file_paths = set(
+         [file_info[0] for file_info in all_parquet_files]
+     ).intersection(set(latest_files))
+     latest_version_files = [
+         file_info
+         for file_info in all_parquet_files
+         if file_info[0] in common_file_paths
+     ]
+
+     for idx, (file_path, file_size) in enumerate(
+         bar := tqdm(latest_version_files), start=1
+     ):
+         file_name = os.path.basename(file_path)
+         bar.set_description(
+             f"Analyzing the '{file_name}' parquet file ({idx}/{num_latest_files})..."
+         )
+
+         relative_path = file_path.split("Tables/")[1]
+         file_system_path = f"{local_path}/Tables/{relative_path}"
+         parquet_file = pq.ParquetFile(file_system_path)
+
+         row_groups += parquet_file.num_row_groups
+
+         # Generate parquet file dataframe
+         new_data = {
+             # "Dataset": "Parquet Files",
+             "Parquet File": file_name,
+             "Row Count": parquet_file.metadata.num_rows,
+             "Row Groups": parquet_file.num_row_groups,
+             "Created By": parquet_file.metadata.created_by,
+             "Total Table Rows": -1,
+             "Total Table Row Groups": -1,
+         }
+
+         parquet_file_df = pd.concat(
+             [parquet_file_df, pd.DataFrame(new_data, index=[0])], ignore_index=True
+         )
+
+         # Loop through the row groups
+         for i in range(parquet_file.num_row_groups):
+             row_group = parquet_file.metadata.row_group(i)
+             num_rows = row_group.num_rows
+
+             max_rows_per_row_group = max(max_rows_per_row_group, num_rows)
+             min_rows_per_row_group = min(min_rows_per_row_group, num_rows)
+
+             total_compressed_size = 0
+             total_uncompressed_size = 0
+
+             # Loop through the columns
+             if column_stats:
+                 for j in range(row_group.num_columns):
+                     column_chunk = row_group.column(j)
+                     total_compressed_size += column_chunk.total_compressed_size
+                     total_uncompressed_size += column_chunk.total_uncompressed_size
+
+                     # Generate Column Chunk Dataframe
+                     new_data = {
+                         # "Dataset": "Column Chunks",
+                         "Parquet File": file_name,
+                         "Column ID": j,
+                         "Column Name": column_chunk.path_in_schema,
+                         "Column Type": column_chunk.physical_type,
+                         "Compressed Size": column_chunk.total_compressed_size,
+                         "Uncompressed Size": column_chunk.total_uncompressed_size,
+                         "Has Dict": column_chunk.has_dictionary_page,
+                         "Dict Offset": column_chunk.dictionary_page_offset,
+                         "Value Count": column_chunk.num_values,
+                         "Encodings": str(column_chunk.encodings),
+                         "Statistics": column_chunk.statistics,
+                         "Primative Type": column_chunk.physical_type,
+                     }
+
+                     column_chunk_df = pd.concat(
+                         [column_chunk_df, pd.DataFrame(new_data, index=[0])],
+                         ignore_index=True,
+                     )
+
+             # Generate rowgroup dataframe
+             new_data = {
+                 # "Dataset": "Row Groups",
+                 "Parquet File": file_name,
+                 "Row Group ID": i + 1,
+                 "Row Count": num_rows,
+                 "Compressed Size": total_compressed_size,
+                 "Uncompressed Size": total_uncompressed_size,
+                 "Compression Ratio": (
+                     total_compressed_size / total_uncompressed_size
+                     if column_stats
+                     else 0
+                 ),
+                 "Total Table Rows": -1,
+                 "Total Table Row Groups": -1,
+             }
+
+             if not row_group_df.empty:
+                 row_group_df = pd.concat(
+                     [row_group_df, pd.DataFrame(new_data, index=[0])], ignore_index=True
+                 )
+             else:
+                 row_group_df = pd.DataFrame(new_data, index=[0])
+
+     avg_rows_per_row_group = row_count / row_groups
+
+     # Generate summary dataframe
+     summary_df = pd.DataFrame(
+         [
+             {
+                 # "Dataset": "Summary",
+                 "Row Count": row_count,
+                 "Row Groups": row_groups,
+                 "Parquet Files": num_latest_files,
+                 "Max Rows Per Row Group": max_rows_per_row_group,
+                 "Min Rows Per Row Group": min_rows_per_row_group,
+                 "Avg Rows Per Row Group": avg_rows_per_row_group,
+                 "VOrder Enabled": is_vorder,
+                 # "VOrderLevel": v_order_level,
+             }
+         ]
+     )
+
+     # Clean up data types
+     _update_dataframe_datatypes(dataframe=row_group_df, column_map=row_group_df_columns)
+     _update_dataframe_datatypes(
+         dataframe=parquet_file_df, column_map=parquet_file_df_columns
+     )
+
+     # Generate column dataframe
+     if column_stats:
+         _update_dataframe_datatypes(
+             dataframe=column_chunk_df, column_map=column_chunk_df_columns
+         )
+         column_df = column_chunk_df.groupby(
+             ["Column Name", "Column Type"], as_index=False
+         ).agg({"Compressed Size": "sum", "Uncompressed Size": "sum"})
+
+         # Add distinct count to column_df
+         if not skip_cardinality:
+             for ind, r in column_df.iterrows():
+                 col_name = r["Column Name"]
+                 if approx_distinct_count:
+                     function = "approx"
+                 else:
+                     function = "distinctcount"
+                 dc = _get_column_aggregate(
+                     table_name=table_name,
+                     column_name=col_name,
+                     function=function,
+                     lakehouse=lakehouse,
+                     workspace=workspace,
+                 )
+
+                 if "Cardinality" not in column_df.columns:
+                     column_df["Cardinality"] = None
+
+                 column_df.at[ind, "Cardinality"] = dc
+
+         summary_df["Total Size"] = column_df["Compressed Size"].sum()
+
+     parquet_file_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+     parquet_file_df["Total Table Row Groups"] = parquet_file_df["Row Groups"].sum()
+
+     row_group_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+     row_group_df["Total Table Row Groups"] = parquet_file_df["Row Groups"].sum()
+     total_rows = row_group_df["Row Count"].sum()
+     row_group_df["Ratio Of Total Table Rows"] = (
+         row_group_df["Row Count"] / total_rows * 100.0
+     )
+
+     if column_stats:
+         column_df["Total Table Rows"] = parquet_file_df["Row Count"].sum()
+         column_df["Table Size"] = column_df["Compressed Size"].sum()
+         column_df["Size Percent Of Table"] = (
+             column_df["Compressed Size"] / column_df["Table Size"] * 100.0
+         )
+     if not skip_cardinality and column_stats:
+         column_df["Cardinality"] = column_df["Cardinality"].fillna(0).astype(int)
+         column_df["Cardinality Of Total Rows"] = (
+             column_df["Cardinality"] / column_df["Total Table Rows"] * 100.0
+         )
+
+     dataframes = {
+         "Summary": summary_df,
+         "Parquet Files": parquet_file_df,
+         "Row Groups": row_group_df,
+     }
+
+     if column_stats:
+         dataframes["Column Chunks"] = column_chunk_df
+         dataframes["Columns"] = column_df
+
+     save_table = f"{prefix}Summary"
+
+     if export:
+         if not lakehouse_attached():
+             raise ValueError(
+                 f"{icons.red_dot} No lakehouse is attached to this notebook. Please attach a lakehouse to the notebook before running the Delta Analyzer."
+             )
+         dfL = get_lakehouse_tables()
+         dfL_filt = dfL[dfL["Table Name"] == save_table]
+         if dfL_filt.empty:
+             runId = 1
+         else:
+             max_run_id = _get_column_aggregate(
+                 table_name=save_table,
+             )
+             runId = max_run_id + 1
+
+     for name, df in dataframes.items():
+         name = name.replace(" ", "")
+         cols = {
+             "Workspace Name": workspace_name,
+             "Workspace Id": workspace_id,
+             "Lakehouse Name": lakehouse_name,
+             "Lakehouse Id": lakehouse_id,
+             "Table Name": table_name,
+         }
+         for i, (col, param) in enumerate(cols.items()):
+             df[col] = param
+             df.insert(i, col, df.pop(col))
+
+         df["Timestamp"] = now
+         df["Timestamp"] = pd.to_datetime(df["Timestamp"])
+
+         if export:
+             df["Run Id"] = runId
+             df["Run Id"] = df["Run Id"].astype(int)
+
+             df.columns = df.columns.str.replace(" ", "")
+             save_as_delta_table(
+                 dataframe=df,
+                 delta_table_name=f"{prefix}{name}",
+                 write_mode="append",
+                 merge_schema=True,
+             )
+
+     return dataframes
+
+
+ @log
+ def get_delta_table_history(
+     table_name: str,
+     lakehouse: Optional[str | UUID] = None,
+     workspace: Optional[str | UUID] = None,
+     schema: Optional[str] = None,
+ ) -> pd.DataFrame:
+     """
+     Returns the history of a delta table as a pandas dataframe.
+
+     Parameters
+     ----------
+     table_name : str
+         The delta table name.
+     lakehouse : str | uuid.UUID, default=None
+         The Fabric lakehouse name or ID.
+         Defaults to None which resolves to the lakehouse attached to the notebook.
+     workspace : str | uuid.UUID, default=None
+         The Fabric workspace name or ID used by the lakehouse.
+         Defaults to None which resolves to the workspace of the attached lakehouse
+         or if no lakehouse attached, resolves to the workspace of the notebook.
+     schema : str, default=None
+         The name of the schema to which the table belongs (for schema-enabled lakehouses). If None, the default schema is used.
+
+     Returns
+     -------
+     pandas.DataFrame
+         A dataframe showing the history of the delta table.
+     """
+
+     def camel_to_title(text):
+         return re.sub(r"([a-z])([A-Z])", r"\1 \2", text).title()
+
+     workspace_id = resolve_workspace_id(workspace=workspace)
+     lakehouse_id = resolve_lakehouse_id(lakehouse=lakehouse, workspace=workspace_id)
+     path = create_abfss_path(lakehouse_id, workspace_id, table_name, schema)
+     df = _read_delta_table_history(path=path)
+     df.rename(columns=lambda col: camel_to_title(col), inplace=True)
+
+     return df
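
For orientation, below is a minimal usage sketch of the two functions introduced in this file, as they would be called from a Microsoft Fabric notebook with a lakehouse attached. It is illustrative only: the table name "sales" is a hypothetical placeholder, the imports use the private module shown in this diff (the package root typically re-exports these functions), and the keyword arguments are simply the defaults documented in the docstrings above.

    from sempy_labs._delta_analyzer import delta_analyzer, get_delta_table_history

    # Analyze the 'sales' delta table in the attached lakehouse.
    # Returns a dict of dataframes keyed "Summary", "Parquet Files", "Row Groups",
    # and, when column_stats=True, "Column Chunks" and "Columns".
    results = delta_analyzer(
        table_name="sales",          # hypothetical table name
        approx_distinct_count=True,  # approximate distinct counts when cardinality is collected
        export=False,                # True appends results to SLL_DeltaAnalyzer_* tables
        column_stats=True,
        skip_cardinality=True,
    )
    print(results["Summary"])

    # Transaction history of the same table, with camelCase column names
    # rewritten to title case by the helper shown above.
    history = get_delta_table_history(table_name="sales")
    print(history)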