semantic-link-labs 0.12.8 (semantic_link_labs-0.12.8-py3-none-any.whl)
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- semantic_link_labs-0.12.8.dist-info/METADATA +354 -0
- semantic_link_labs-0.12.8.dist-info/RECORD +243 -0
- semantic_link_labs-0.12.8.dist-info/WHEEL +5 -0
- semantic_link_labs-0.12.8.dist-info/licenses/LICENSE +21 -0
- semantic_link_labs-0.12.8.dist-info/top_level.txt +1 -0
- sempy_labs/__init__.py +606 -0
- sempy_labs/_a_lib_info.py +2 -0
- sempy_labs/_ai.py +437 -0
- sempy_labs/_authentication.py +264 -0
- sempy_labs/_bpa_translation/_model/_translations_am-ET.po +869 -0
- sempy_labs/_bpa_translation/_model/_translations_ar-AE.po +908 -0
- sempy_labs/_bpa_translation/_model/_translations_bg-BG.po +968 -0
- sempy_labs/_bpa_translation/_model/_translations_ca-ES.po +963 -0
- sempy_labs/_bpa_translation/_model/_translations_cs-CZ.po +943 -0
- sempy_labs/_bpa_translation/_model/_translations_da-DK.po +945 -0
- sempy_labs/_bpa_translation/_model/_translations_de-DE.po +988 -0
- sempy_labs/_bpa_translation/_model/_translations_el-GR.po +993 -0
- sempy_labs/_bpa_translation/_model/_translations_es-ES.po +971 -0
- sempy_labs/_bpa_translation/_model/_translations_fa-IR.po +933 -0
- sempy_labs/_bpa_translation/_model/_translations_fi-FI.po +942 -0
- sempy_labs/_bpa_translation/_model/_translations_fr-FR.po +994 -0
- sempy_labs/_bpa_translation/_model/_translations_ga-IE.po +967 -0
- sempy_labs/_bpa_translation/_model/_translations_he-IL.po +902 -0
- sempy_labs/_bpa_translation/_model/_translations_hi-IN.po +944 -0
- sempy_labs/_bpa_translation/_model/_translations_hu-HU.po +963 -0
- sempy_labs/_bpa_translation/_model/_translations_id-ID.po +946 -0
- sempy_labs/_bpa_translation/_model/_translations_is-IS.po +939 -0
- sempy_labs/_bpa_translation/_model/_translations_it-IT.po +986 -0
- sempy_labs/_bpa_translation/_model/_translations_ja-JP.po +846 -0
- sempy_labs/_bpa_translation/_model/_translations_ko-KR.po +839 -0
- sempy_labs/_bpa_translation/_model/_translations_mt-MT.po +967 -0
- sempy_labs/_bpa_translation/_model/_translations_nl-NL.po +978 -0
- sempy_labs/_bpa_translation/_model/_translations_pl-PL.po +962 -0
- sempy_labs/_bpa_translation/_model/_translations_pt-BR.po +962 -0
- sempy_labs/_bpa_translation/_model/_translations_pt-PT.po +957 -0
- sempy_labs/_bpa_translation/_model/_translations_ro-RO.po +968 -0
- sempy_labs/_bpa_translation/_model/_translations_ru-RU.po +964 -0
- sempy_labs/_bpa_translation/_model/_translations_sk-SK.po +952 -0
- sempy_labs/_bpa_translation/_model/_translations_sl-SL.po +950 -0
- sempy_labs/_bpa_translation/_model/_translations_sv-SE.po +942 -0
- sempy_labs/_bpa_translation/_model/_translations_ta-IN.po +976 -0
- sempy_labs/_bpa_translation/_model/_translations_te-IN.po +947 -0
- sempy_labs/_bpa_translation/_model/_translations_th-TH.po +924 -0
- sempy_labs/_bpa_translation/_model/_translations_tr-TR.po +953 -0
- sempy_labs/_bpa_translation/_model/_translations_uk-UA.po +961 -0
- sempy_labs/_bpa_translation/_model/_translations_zh-CN.po +804 -0
- sempy_labs/_bpa_translation/_model/_translations_zu-ZA.po +969 -0
- sempy_labs/_capacities.py +1198 -0
- sempy_labs/_capacity_migration.py +660 -0
- sempy_labs/_clear_cache.py +351 -0
- sempy_labs/_connections.py +610 -0
- sempy_labs/_dashboards.py +69 -0
- sempy_labs/_data_access_security.py +98 -0
- sempy_labs/_data_pipelines.py +162 -0
- sempy_labs/_dataflows.py +668 -0
- sempy_labs/_dax.py +501 -0
- sempy_labs/_daxformatter.py +80 -0
- sempy_labs/_delta_analyzer.py +467 -0
- sempy_labs/_delta_analyzer_history.py +301 -0
- sempy_labs/_dictionary_diffs.py +221 -0
- sempy_labs/_documentation.py +147 -0
- sempy_labs/_domains.py +51 -0
- sempy_labs/_eventhouses.py +182 -0
- sempy_labs/_external_data_shares.py +230 -0
- sempy_labs/_gateways.py +521 -0
- sempy_labs/_generate_semantic_model.py +521 -0
- sempy_labs/_get_connection_string.py +84 -0
- sempy_labs/_git.py +543 -0
- sempy_labs/_graphQL.py +90 -0
- sempy_labs/_helper_functions.py +2833 -0
- sempy_labs/_icons.py +149 -0
- sempy_labs/_job_scheduler.py +609 -0
- sempy_labs/_kql_databases.py +149 -0
- sempy_labs/_kql_querysets.py +124 -0
- sempy_labs/_kusto.py +137 -0
- sempy_labs/_labels.py +124 -0
- sempy_labs/_list_functions.py +1720 -0
- sempy_labs/_managed_private_endpoints.py +253 -0
- sempy_labs/_mirrored_databases.py +416 -0
- sempy_labs/_mirrored_warehouses.py +60 -0
- sempy_labs/_ml_experiments.py +113 -0
- sempy_labs/_model_auto_build.py +140 -0
- sempy_labs/_model_bpa.py +557 -0
- sempy_labs/_model_bpa_bulk.py +378 -0
- sempy_labs/_model_bpa_rules.py +859 -0
- sempy_labs/_model_dependencies.py +343 -0
- sempy_labs/_mounted_data_factories.py +123 -0
- sempy_labs/_notebooks.py +441 -0
- sempy_labs/_one_lake_integration.py +151 -0
- sempy_labs/_onelake.py +131 -0
- sempy_labs/_query_scale_out.py +433 -0
- sempy_labs/_refresh_semantic_model.py +435 -0
- sempy_labs/_semantic_models.py +468 -0
- sempy_labs/_spark.py +455 -0
- sempy_labs/_sql.py +241 -0
- sempy_labs/_sql_audit_settings.py +207 -0
- sempy_labs/_sql_endpoints.py +214 -0
- sempy_labs/_tags.py +201 -0
- sempy_labs/_translations.py +43 -0
- sempy_labs/_user_delegation_key.py +44 -0
- sempy_labs/_utils.py +79 -0
- sempy_labs/_vertipaq.py +1021 -0
- sempy_labs/_vpax.py +388 -0
- sempy_labs/_warehouses.py +234 -0
- sempy_labs/_workloads.py +140 -0
- sempy_labs/_workspace_identity.py +72 -0
- sempy_labs/_workspaces.py +595 -0
- sempy_labs/admin/__init__.py +170 -0
- sempy_labs/admin/_activities.py +167 -0
- sempy_labs/admin/_apps.py +145 -0
- sempy_labs/admin/_artifacts.py +65 -0
- sempy_labs/admin/_basic_functions.py +463 -0
- sempy_labs/admin/_capacities.py +508 -0
- sempy_labs/admin/_dataflows.py +45 -0
- sempy_labs/admin/_datasets.py +186 -0
- sempy_labs/admin/_domains.py +522 -0
- sempy_labs/admin/_external_data_share.py +100 -0
- sempy_labs/admin/_git.py +72 -0
- sempy_labs/admin/_items.py +265 -0
- sempy_labs/admin/_labels.py +211 -0
- sempy_labs/admin/_reports.py +241 -0
- sempy_labs/admin/_scanner.py +118 -0
- sempy_labs/admin/_shared.py +82 -0
- sempy_labs/admin/_sharing_links.py +110 -0
- sempy_labs/admin/_tags.py +131 -0
- sempy_labs/admin/_tenant.py +503 -0
- sempy_labs/admin/_tenant_keys.py +89 -0
- sempy_labs/admin/_users.py +140 -0
- sempy_labs/admin/_workspaces.py +236 -0
- sempy_labs/deployment_pipeline/__init__.py +23 -0
- sempy_labs/deployment_pipeline/_items.py +580 -0
- sempy_labs/directlake/__init__.py +57 -0
- sempy_labs/directlake/_autosync.py +58 -0
- sempy_labs/directlake/_directlake_schema_compare.py +120 -0
- sempy_labs/directlake/_directlake_schema_sync.py +161 -0
- sempy_labs/directlake/_dl_helper.py +274 -0
- sempy_labs/directlake/_generate_shared_expression.py +94 -0
- sempy_labs/directlake/_get_directlake_lakehouse.py +62 -0
- sempy_labs/directlake/_get_shared_expression.py +34 -0
- sempy_labs/directlake/_guardrails.py +96 -0
- sempy_labs/directlake/_list_directlake_model_calc_tables.py +70 -0
- sempy_labs/directlake/_show_unsupported_directlake_objects.py +90 -0
- sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py +239 -0
- sempy_labs/directlake/_update_directlake_partition_entity.py +259 -0
- sempy_labs/directlake/_warm_cache.py +236 -0
- sempy_labs/dotnet_lib/dotnet.runtime.config.json +10 -0
- sempy_labs/environment/__init__.py +23 -0
- sempy_labs/environment/_items.py +212 -0
- sempy_labs/environment/_pubstage.py +223 -0
- sempy_labs/eventstream/__init__.py +37 -0
- sempy_labs/eventstream/_items.py +263 -0
- sempy_labs/eventstream/_topology.py +652 -0
- sempy_labs/graph/__init__.py +59 -0
- sempy_labs/graph/_groups.py +651 -0
- sempy_labs/graph/_sensitivity_labels.py +120 -0
- sempy_labs/graph/_teams.py +125 -0
- sempy_labs/graph/_user_licenses.py +96 -0
- sempy_labs/graph/_users.py +516 -0
- sempy_labs/graph_model/__init__.py +15 -0
- sempy_labs/graph_model/_background_jobs.py +63 -0
- sempy_labs/graph_model/_items.py +149 -0
- sempy_labs/lakehouse/__init__.py +67 -0
- sempy_labs/lakehouse/_blobs.py +247 -0
- sempy_labs/lakehouse/_get_lakehouse_columns.py +102 -0
- sempy_labs/lakehouse/_get_lakehouse_tables.py +274 -0
- sempy_labs/lakehouse/_helper.py +250 -0
- sempy_labs/lakehouse/_lakehouse.py +351 -0
- sempy_labs/lakehouse/_livy_sessions.py +143 -0
- sempy_labs/lakehouse/_materialized_lake_views.py +157 -0
- sempy_labs/lakehouse/_partitioning.py +165 -0
- sempy_labs/lakehouse/_schemas.py +217 -0
- sempy_labs/lakehouse/_shortcuts.py +440 -0
- sempy_labs/migration/__init__.py +35 -0
- sempy_labs/migration/_create_pqt_file.py +238 -0
- sempy_labs/migration/_direct_lake_to_import.py +105 -0
- sempy_labs/migration/_migrate_calctables_to_lakehouse.py +398 -0
- sempy_labs/migration/_migrate_calctables_to_semantic_model.py +148 -0
- sempy_labs/migration/_migrate_model_objects_to_semantic_model.py +533 -0
- sempy_labs/migration/_migrate_tables_columns_to_semantic_model.py +172 -0
- sempy_labs/migration/_migration_validation.py +71 -0
- sempy_labs/migration/_refresh_calc_tables.py +131 -0
- sempy_labs/mirrored_azure_databricks_catalog/__init__.py +15 -0
- sempy_labs/mirrored_azure_databricks_catalog/_discover.py +213 -0
- sempy_labs/mirrored_azure_databricks_catalog/_refresh_catalog_metadata.py +45 -0
- sempy_labs/ml_model/__init__.py +23 -0
- sempy_labs/ml_model/_functions.py +427 -0
- sempy_labs/report/_BPAReportTemplate.json +232 -0
- sempy_labs/report/__init__.py +55 -0
- sempy_labs/report/_bpareporttemplate/.pbi/localSettings.json +9 -0
- sempy_labs/report/_bpareporttemplate/.platform +11 -0
- sempy_labs/report/_bpareporttemplate/StaticResources/SharedResources/BaseThemes/CY24SU06.json +710 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/page.json +11 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/1b08bce3bebabb0a27a8/visual.json +191 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/2f22ddb70c301693c165/visual.json +438 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/3b1182230aa6c600b43a/visual.json +127 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/58577ba6380c69891500/visual.json +576 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/a2a8fa5028b3b776c96c/visual.json +207 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/adfd47ef30652707b987/visual.json +506 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/b6a80ee459e716e170b1/visual.json +127 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/01d72098bda5055bd500/visuals/ce3130a721c020cc3d81/visual.json +513 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/92735ae19b31712208ad/page.json +8 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/92735ae19b31712208ad/visuals/66e60dfb526437cd78d1/visual.json +112 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/page.json +11 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/07deb8bce824e1be37d7/visual.json +513 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/0b1c68838818b32ad03b/visual.json +352 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/0c171de9d2683d10b930/visual.json +37 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/0efa01be0510e40a645e/visual.json +542 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/6bf2f0eb830ab53cc668/visual.json +221 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/88d8141cb8500b60030c/visual.json +127 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/a753273590beed656a03/visual.json +576 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/c597da16dc7e63222a82/visuals/b8fdc82cddd61ac447bc/visual.json +127 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/d37dce724a0ccc30044b/page.json +9 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/d37dce724a0ccc30044b/visuals/ce8532a7e25020271077/visual.json +38 -0
- sempy_labs/report/_bpareporttemplate/definition/pages/pages.json +10 -0
- sempy_labs/report/_bpareporttemplate/definition/report.json +176 -0
- sempy_labs/report/_bpareporttemplate/definition/version.json +4 -0
- sempy_labs/report/_bpareporttemplate/definition.pbir +14 -0
- sempy_labs/report/_download_report.py +76 -0
- sempy_labs/report/_export_report.py +257 -0
- sempy_labs/report/_generate_report.py +427 -0
- sempy_labs/report/_paginated.py +76 -0
- sempy_labs/report/_report_bpa.py +354 -0
- sempy_labs/report/_report_bpa_rules.py +115 -0
- sempy_labs/report/_report_functions.py +581 -0
- sempy_labs/report/_report_helper.py +227 -0
- sempy_labs/report/_report_list_functions.py +110 -0
- sempy_labs/report/_report_rebind.py +149 -0
- sempy_labs/report/_reportwrapper.py +3100 -0
- sempy_labs/report/_save_report.py +147 -0
- sempy_labs/snowflake_database/__init__.py +10 -0
- sempy_labs/snowflake_database/_items.py +105 -0
- sempy_labs/sql_database/__init__.py +21 -0
- sempy_labs/sql_database/_items.py +201 -0
- sempy_labs/sql_database/_mirroring.py +79 -0
- sempy_labs/theme/__init__.py +12 -0
- sempy_labs/theme/_org_themes.py +129 -0
- sempy_labs/tom/__init__.py +3 -0
- sempy_labs/tom/_model.py +5977 -0
- sempy_labs/variable_library/__init__.py +19 -0
- sempy_labs/variable_library/_functions.py +403 -0
- sempy_labs/warehouse/__init__.py +28 -0
- sempy_labs/warehouse/_items.py +234 -0
- sempy_labs/warehouse/_restore_points.py +309 -0
sempy_labs/lakehouse/_get_lakehouse_tables.py

@@ -0,0 +1,274 @@

```python
import os
import pandas as pd
import pyarrow.parquet as pq
from datetime import datetime
from sempy_labs._helper_functions import (
    _get_column_aggregate,
    resolve_lakehouse_name_and_id,
    save_as_delta_table,
    resolve_workspace_id,
    _read_delta_table,
    _get_delta_table,
    _mount,
    create_abfss_path,
    _pure_python_notebook,
)
from sempy_labs.directlake._guardrails import (
    get_sku_size,
    get_directlake_guardrails_for_sku,
)
from sempy_labs.lakehouse._lakehouse import lakehouse_attached
from typing import Optional
import sempy_labs._icons as icons
from sempy._utils._log import log
from uuid import UUID
from sempy_labs.lakehouse._schemas import list_tables


@log
def get_lakehouse_tables(
    lakehouse: Optional[str | UUID] = None,
    workspace: Optional[str | UUID] = None,
    extended: bool = False,
    count_rows: bool = False,
    export: bool = False,
    exclude_shortcuts: bool = False,
) -> pd.DataFrame:
    """
    Shows the tables of a lakehouse and their respective properties. Option to include additional properties relevant to Direct Lake guardrails.

    This function can be executed in either a PySpark or pure Python notebook.

    This is a wrapper function for the following API: `Tables - List Tables <https://learn.microsoft.com/rest/api/fabric/lakehouse/tables/list-tables>`_ plus extended capabilities.
    However, the above mentioned API does not support Lakehouse schemas (Preview) until it is in GA (General Availability). This version also supports schema
    enabled Lakehouses.

    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).

    Parameters
    ----------
    lakehouse : str | uuid.UUID, default=None
        The Fabric lakehouse name or ID.
        Defaults to None which resolves to the lakehouse attached to the notebook.
    workspace : str | uuid.UUID, default=None
        The Fabric workspace name or ID used by the lakehouse.
        Defaults to None which resolves to the workspace of the attached lakehouse
        or if no lakehouse attached, resolves to the workspace of the notebook.
    extended : bool, default=False
        Obtains additional columns relevant to the size of each table.
    count_rows : bool, default=False
        Obtains a row count for each lakehouse table.
    export : bool, default=False
        Exports the resulting dataframe to a delta table in the lakehouse.
    exclude_shortcuts : bool, default=False
        If True, excludes shortcuts.

    Returns
    -------
    pandas.DataFrame
        Shows the tables/columns within a lakehouse and their properties.
    """

    workspace_id = resolve_workspace_id(workspace)
    (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
        lakehouse=lakehouse, workspace=workspace_id
    )

    df = list_tables(lakehouse=lakehouse, workspace=workspace)

    local_path = _mount(lakehouse=lakehouse_id, workspace=workspace_id)

    if extended:
        sku_value = get_sku_size(workspace_id)
        guardrail = get_directlake_guardrails_for_sku(sku_value)
        # Avoid mounting the lakehouse if is already mounted
        if not local_path:
            local_path = _mount(lakehouse=lakehouse_id, workspace=workspace_id)

        df["Files"], df["Row Groups"], df["Table Size"] = None, None, None
        if count_rows:
            df["Row Count"] = None

        for i, r in df.iterrows():
            schema_name = r["Schema Name"]
            table_name = r["Table Name"]
            if r["Type"] == "Managed" and r["Format"] == "delta":
                delta_table_path = (
                    create_abfss_path(
                        lakehouse_id, workspace_id, table_name, schema_name
                    )
                    .replace("//", "/")  # When schema_name = ""
                    .replace("abfss:/", "abfss://")  # Put back the // after abfss:
                )

                if _pure_python_notebook():
                    from deltalake import DeltaTable

                    delta_table = DeltaTable(delta_table_path)
                    latest_files = [
                        file["path"]
                        for file in delta_table.get_add_actions().to_pylist()
                    ]
                    size_in_bytes = 0
                    for f in latest_files:
                        local_file_path = os.path.join(
                            local_path, "Tables", schema_name, table_name, f
                        )

                        if os.path.exists(local_file_path):
                            size_in_bytes += os.path.getsize(local_file_path)
                    num_latest_files = len(latest_files)
                else:
                    delta_table = _get_delta_table(delta_table_path)

                    latest_files = _read_delta_table(delta_table_path).inputFiles()
                    table_df = delta_table.toDF()
                    table_details = delta_table.detail().collect()[0].asDict()
                    size_in_bytes = table_details.get("sizeInBytes", 0)
                    num_latest_files = table_details.get("numFiles", 0)

                table_path = os.path.join(local_path, "Tables", schema_name, table_name)

                file_paths = []
                for file in latest_files:
                    if _pure_python_notebook():
                        file_paths.append(file)
                    else:
                        # Append the <Partition folder>/<filename> or <filename>
                        find_table = file.find(table_name)
                        len_file = len(file)
                        len_table = len(table_name)
                        last_chars = len_file - (find_table + len_table + 1)
                        file_paths.append(file[-last_chars:])

                num_rowgroups = 0
                for filename in file_paths:
                    parquet_file_path = f"{table_path}/{filename}"
                    if os.path.exists(parquet_file_path):
                        parquet_file = pq.ParquetFile(parquet_file_path)
                        num_rowgroups += parquet_file.num_row_groups

                df.at[i, "Files"] = num_latest_files
                df.at[i, "Row Groups"] = num_rowgroups
                df.at[i, "Table Size"] = size_in_bytes

                if count_rows:
                    if _pure_python_notebook():
                        row_count = delta_table.to_pyarrow_table().num_rows
                    else:
                        row_count = table_df.count()
                    df.at[i, "Row Count"] = row_count

    # Set "Schema Name" = "dbo" when it is ""
    df.loc[df["Schema Name"] == "", "Schema Name"] = "dbo"

    if extended:
        intColumns = ["Files", "Row Groups", "Table Size"]
        df[intColumns] = df[intColumns].astype(int)

        col_name = guardrail.columns[0]
        df["SKU"] = guardrail[col_name].iloc[0]
        df["Parquet File Guardrail"] = guardrail["Parquet files per table"].iloc[0]
        df["Row Group Guardrail"] = guardrail["Row groups per table"].iloc[0]
        df["Row Count Guardrail"] = (
            guardrail["Rows per table (millions)"].iloc[0] * 1000000
        )

        df["Parquet File Guardrail Hit"] = df["Files"] > df["Parquet File Guardrail"]
        df["Row Group Guardrail Hit"] = df["Row Groups"] > df["Row Group Guardrail"]
        if count_rows:
            df["Row Count"] = df["Row Count"].astype(int)
            df["Row Count Guardrail Hit"] = df["Row Count"] > df["Row Count Guardrail"]

    if exclude_shortcuts:
        from sempy_labs.lakehouse._shortcuts import list_shortcuts

        # Exclude shortcuts
        shortcuts = (
            list_shortcuts(lakehouse=lakehouse, workspace=workspace)
            .query("`Shortcut Path`.str.startswith('/Tables')", engine="python")
            .assign(
                FullPath=lambda df: df["Shortcut Path"].str.rstrip("/")
                + "/"
                + df["Shortcut Name"]
            )["FullPath"]
            .tolist()
        )

        df["FullPath"] = df.apply(
            lambda x: (
                f"/Tables/{x['Table Name']}"
                if pd.isna(x["Schema Name"]) or x["Schema Name"] == ""
                else f"/Tables/{x['Schema Name']}/{x['Table Name']}"
            ),
            axis=1,
        )

        df = df[~df["FullPath"].isin(shortcuts)].reset_index(drop=True)

    if export:
        if not lakehouse_attached():
            raise ValueError(
                f"{icons.red_dot} In order to save the dataframe, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook."
            )

        lake_table_name = "lakehouse_table_details"
        df_filt = df[df["Table Name"] == lake_table_name]

        if df_filt.empty:
            run_id = 1
        else:
            max_run_id = _get_column_aggregate(table_name=lake_table_name)
            run_id = max_run_id + 1

        export_df = df.copy()

        cols = [
            "Files",
            "Row Groups",
            "Row Count",
            "Table Size",
            "SKU",
            "Parquet File Guardrail",
            "Row Group Guardrail",
            "Row Count Guardrail",
            "Parquet File Guardrail Hit",
            "Row Group Guardrail Hit",
            "Row Count Guardrail Hit",
        ]

        for c in cols:
            if c not in export_df:
                if c in [
                    "Files",
                    "Row Groups",
                    "Row Count",
                    "Table Size",
                    "Parquet File Guardrail",
                    "Row Group Guardrail",
                    "Row Count Guardrail",
                ]:
                    export_df[c] = 0
                    export_df[c] = export_df[c].astype(int)
                elif c in ["SKU"]:
                    export_df[c] = None
                    export_df[c] = export_df[c].astype(str)
                elif c in [
                    "Parquet File Guardrail Hit",
                    "Row Group Guardrail Hit",
                    "Row Count Guardrail Hit",
                ]:
                    export_df[c] = False
                    export_df[c] = export_df[c].astype(bool)

        print(
            f"{icons.in_progress} Saving Lakehouse table properties to the '{lake_table_name}' table in the lakehouse...\n"
        )
        export_df["Timestamp"] = datetime.now()
        export_df["RunId"] = run_id

        save_as_delta_table(
            dataframe=export_df, delta_table_name=lake_table_name, write_mode="append"
        )

    return df
```
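For context, a minimal usage sketch of `get_lakehouse_tables` follows. It assumes the function is re-exported from `sempy_labs.lakehouse` (as suggested by the `lakehouse/__init__.py` in the file listing); the lakehouse name, workspace name, and column selection are illustrative placeholders, not values taken from this package.

```python
# Hedged usage sketch (not part of the package): inspect Direct Lake guardrails
# for the tables of a lakehouse. "Sales" and "My Workspace" are placeholders.
from sempy_labs.lakehouse import get_lakehouse_tables  # assumes re-export in __init__.py

tables_df = get_lakehouse_tables(
    lakehouse="Sales",          # or a UUID; None uses the lakehouse attached to the notebook
    workspace="My Workspace",   # or a UUID; None resolves from the notebook context
    extended=True,              # adds Files / Row Groups / Table Size plus guardrail columns
    count_rows=True,            # adds a Row Count column (scans each table, so slower)
)

# Tables that exceed any Direct Lake guardrail for the workspace's SKU
over_limit = tables_df[
    tables_df["Parquet File Guardrail Hit"]
    | tables_df["Row Group Guardrail Hit"]
    | tables_df["Row Count Guardrail Hit"]
]
print(over_limit[["Schema Name", "Table Name", "Files", "Row Groups", "Row Count"]])
```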
sempy_labs/lakehouse/_helper.py

@@ -0,0 +1,250 @@

```python
from uuid import UUID
from typing import Optional, Literal
import pyarrow.dataset as ds
from sempy_labs._helper_functions import (
    _mount,
    delete_item,
    _base_api,
    resolve_workspace_name_and_id,
    resolve_lakehouse_name_and_id,
)
from sempy._utils._log import log
import sempy_labs._icons as icons
import os
import json


@log
def is_v_ordered(
    table_name: str,
    lakehouse: Optional[str | UUID] = None,
    workspace: Optional[str | UUID] = None,
    schema: Optional[str] = None,
) -> bool:
    """
    Checks if a delta table is v-ordered.

    Parameters
    ----------
    table_name : str
        The name of the table to check.
    lakehouse : str | uuid.UUID, default=None
        The Fabric lakehouse name or ID.
        Defaults to None which resolves to the lakehouse attached to the notebook.
    workspace : str | uuid.UUID, default=None
        The Fabric workspace name or ID used by the lakehouse.
        Defaults to None which resolves to the workspace of the attached lakehouse
        or if no lakehouse attached, resolves to the workspace of the notebook.
    schema : str, optional
        The schema of the table to check. If not provided, the default schema is used.

    Returns
    -------
    bool
        True if the table is v-ordered, False otherwise.
    """

    local_path = _mount(lakehouse=lakehouse, workspace=workspace)
    table_path = (
        f"{local_path}/Tables/{schema}/{table_name}"
        if schema
        else f"{local_path}/Tables/{table_name}"
    )
    ds_schema = ds.dataset(table_path).schema.metadata

    if ds_schema:
        return any(b"vorder" in key for key in ds_schema.keys())

    delta_log_path = os.path.join(table_path, "_delta_log")

    def read_vorder_tag(delta_log_path):
        json_files = sorted(
            [f for f in os.listdir(delta_log_path) if f.endswith(".json")], reverse=True
        )

        if not json_files:
            return False

        latest_file = os.path.join(delta_log_path, json_files[0])

        with open(latest_file, "r") as f:
            all_data = [
                json.loads(line) for line in f if line.strip()
            ]  # one dict per line
        for data in all_data:
            if "metaData" in data:
                return (
                    data.get("metaData", {})
                    .get("configuration", {})
                    .get("delta.parquet.vorder.enabled", "false")
                    == "true"
                )

        # If no metaData, fall back to commitInfo
        for data in all_data:
            if "commitInfo" in data:
                tags = data["commitInfo"].get("tags", {})
                return tags.get("VORDER", "false").lower() == "true"

        return False  # Default if not found

    return read_vorder_tag(delta_log_path)


@log
def delete_lakehouse(
    lakehouse: str | UUID, workspace: Optional[str | UUID] = None
) -> None:
    """
    Deletes a lakehouse.

    This is a wrapper function for the following API: `Items - Delete Lakehouse <https://learn.microsoft.com/rest/api/fabric/lakehouse/items/delete-lakehouse>`_.

    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).

    Parameters
    ----------
    lakehouse : str | uuid.UUID
        The name or ID of the lakehouse to delete.
    workspace : str | uuid.UUID, default=None
        The Fabric workspace name or ID used by the lakehouse.
        Defaults to None which resolves to the workspace of the attached lakehouse
        or if no lakehouse attached, resolves to the workspace of the notebook.
    """

    delete_item(item=lakehouse, item_type="lakehouse", workspace=workspace)


@log
def update_lakehouse(
    name: Optional[str] = None,
    description: Optional[str] = None,
    lakehouse: Optional[str | UUID] = None,
    workspace: Optional[str | UUID] = None,
):
    """
    Updates a lakehouse.

    This is a wrapper function for the following API: `Items - Update Lakehouse <https://learn.microsoft.com/rest/api/fabric/lakehouse/items/update-lakehouse>`_.

    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).

    Parameters
    ----------
    name: str, default=None
        The new name of the lakehouse.
        Defaults to None which does not update the name.
    description: str, default=None
        The new description of the lakehouse.
        Defaults to None which does not update the description.
    lakehouse : str | uuid.UUID, default=None
        The name or ID of the lakehouse to update.
        Defaults to None which resolves to the lakehouse attached to the notebook.
    workspace : str | uuid.UUID, default=None
        The Fabric workspace name or ID used by the lakehouse.
        Defaults to None which resolves to the workspace of the attached lakehouse
        or if no lakehouse attached, resolves to the workspace of the notebook.
    """

    if not name and not description:
        raise ValueError(
            f"{icons.red_dot} Either name or description must be provided."
        )

    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
    (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
        lakehouse, workspace_id
    )

    payload = {}
    if name:
        payload["displayName"] = name
    if description:
        payload["description"] = description

    _base_api(
        request=f"/v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}",
        method="patch",
        client="fabric_sp",
        payload=payload,
    )

    print(
        f"{icons.green_dot} The '{lakehouse_name}' lakehouse within the '{workspace_name}' workspace has been updated accordingly."
    )


@log
def load_table(
    table_name: str,
    file_path: str,
    mode: Literal["Overwrite", "Append"],
    lakehouse: Optional[str | UUID] = None,
    workspace: Optional[str | UUID] = None,
):
    """
    Loads a table into a lakehouse. Currently only files are supported, not folders.

    This is a wrapper function for the following API: `Tables - Load Table <https://learn.microsoft.com/rest/api/fabric/lakehouse/tables/load-table>`_.

    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).

    Parameters
    ----------
    table_name : str
        The name of the table to load.
    file_path : str
        The path to the data to load.
    mode : Literal["Overwrite", "Append"]
        The mode to use when loading the data.
        "Overwrite" will overwrite the existing data.
        "Append" will append the data to the existing data.
    lakehouse : str | uuid.UUID, default=None
        The name or ID of the lakehouse to load the table into.
        Defaults to None which resolves to the lakehouse attached to the notebook.
    workspace : str | uuid.UUID, default=None
        The Fabric workspace name or ID used by the lakehouse.
        Defaults to None which resolves to the workspace of the attached lakehouse
        or if no lakehouse attached, resolves to the workspace of the notebook.
    """

    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
    (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
        lakehouse, workspace_id
    )

    file_extension = os.path.splitext(file_path)[1]

    payload = {
        "relativePath": file_path,
        "pathType": "File",
        "mode": mode,
        "formatOptions": {},
    }

    if file_extension == ".csv":
        payload["formatOptions"] = {"format": "Csv", "header": True, "delimiter": ","}
    elif file_extension == ".parquet":
        payload["formatOptions"] = {
            "format": "Parquet",
            "header": True,
        }
    # Solve for loading folders
    # elif file_extension == '':
    #     payload['pathType'] = "Folder"
    #     payload["recursive"] = recursive
    #     payload['formatOptions']
    else:
        raise NotImplementedError()

    _base_api(
        request=f"/v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/tables/{table_name}/load",
        client="fabric_sp",
        method="post",
        status_codes=202,
        lro_return_status_code=True,
    )

    print(
        f"{icons.green_dot} The '{table_name}' table has been loaded into the '{lakehouse_name}' lakehouse within the '{workspace_name}' workspace."
    )
```
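A short hedged sketch of how these helpers might be used together follows, assuming `load_table` and `is_v_ordered` are re-exported from `sempy_labs.lakehouse`; the table name and file path are placeholders, not values from this package.

```python
# Hedged usage sketch (not part of the package): load a CSV into a delta table,
# then check whether the resulting table is V-Order optimized.
from sempy_labs.lakehouse import load_table, is_v_ordered  # assumes re-export in __init__.py

load_table(
    table_name="sales_raw",            # placeholder table name
    file_path="Files/raw/sales.csv",   # placeholder path relative to the lakehouse root
    mode="Overwrite",                  # or "Append"
)

if not is_v_ordered(table_name="sales_raw"):
    print("Table 'sales_raw' is not V-Ordered; consider optimizing it before using Direct Lake.")
```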