semantic-link-labs 0.9.9__py3-none-any.whl → 0.9.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of semantic-link-labs might be problematic.
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/METADATA +30 -22
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/RECORD +47 -40
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/WHEEL +1 -1
- sempy_labs/__init__.py +28 -1
- sempy_labs/_clear_cache.py +12 -0
- sempy_labs/_dax.py +8 -2
- sempy_labs/_delta_analyzer.py +17 -26
- sempy_labs/_environments.py +19 -1
- sempy_labs/_generate_semantic_model.py +7 -8
- sempy_labs/_helper_functions.py +351 -151
- sempy_labs/_kql_databases.py +18 -0
- sempy_labs/_kusto.py +137 -0
- sempy_labs/_list_functions.py +18 -36
- sempy_labs/_model_bpa_rules.py +13 -3
- sempy_labs/_notebooks.py +44 -11
- sempy_labs/_semantic_models.py +93 -1
- sempy_labs/_sql.py +3 -2
- sempy_labs/_tags.py +194 -0
- sempy_labs/_variable_libraries.py +89 -0
- sempy_labs/_vertipaq.py +6 -6
- sempy_labs/_vpax.py +386 -0
- sempy_labs/_warehouses.py +3 -3
- sempy_labs/admin/__init__.py +14 -0
- sempy_labs/admin/_artifacts.py +3 -3
- sempy_labs/admin/_capacities.py +161 -1
- sempy_labs/admin/_dataflows.py +45 -0
- sempy_labs/admin/_items.py +16 -11
- sempy_labs/admin/_tags.py +126 -0
- sempy_labs/admin/_tenant.py +5 -5
- sempy_labs/directlake/_generate_shared_expression.py +29 -26
- sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py +55 -5
- sempy_labs/dotnet_lib/dotnet.runtime.config.json +10 -0
- sempy_labs/lakehouse/__init__.py +16 -0
- sempy_labs/lakehouse/_blobs.py +115 -63
- sempy_labs/lakehouse/_get_lakehouse_columns.py +41 -18
- sempy_labs/lakehouse/_get_lakehouse_tables.py +62 -47
- sempy_labs/lakehouse/_helper.py +211 -0
- sempy_labs/lakehouse/_lakehouse.py +45 -36
- sempy_labs/lakehouse/_livy_sessions.py +137 -0
- sempy_labs/migration/_migrate_calctables_to_lakehouse.py +7 -12
- sempy_labs/migration/_refresh_calc_tables.py +7 -6
- sempy_labs/report/_download_report.py +1 -1
- sempy_labs/report/_generate_report.py +5 -1
- sempy_labs/report/_reportwrapper.py +31 -18
- sempy_labs/tom/_model.py +104 -35
- sempy_labs/report/_bpareporttemplate/.pbi/localSettings.json +0 -9
- sempy_labs/report/_bpareporttemplate/.platform +0 -11
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/licenses/LICENSE +0 -0
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/top_level.txt +0 -0
sempy_labs/_helper_functions.py
CHANGED
@@ -8,7 +8,7 @@ from sempy.fabric.exceptions import FabricHTTPException, WorkspaceNotFoundExcept
 import pandas as pd
 from functools import wraps
 import datetime
-from typing import Optional, Tuple, List
+from typing import Optional, Tuple, List, Dict
 from uuid import UUID
 import sempy_labs._icons as icons
 from azure.core.credentials import TokenCredential, AccessToken
@@ -74,6 +74,15 @@ def create_abfss_path(
     return path


+def create_abfss_path_from_path(
+    lakehouse_id: UUID, workspace_id: UUID, file_path: str
+) -> str:
+
+    fp = _get_default_file_path()
+
+    return f"abfss://{workspace_id}@{fp}/{lakehouse_id}/{file_path}"
+
+
 def _get_default_file_path() -> str:

     default_file_storage = _get_fabric_context_setting(name="fs.defaultFS")
@@ -266,7 +275,7 @@ def create_item(
         lro_return_status_code=True,
     )
     print(
-        f"{icons.green_dot} The '{name}' {item_type} has been successfully created within the
+        f"{icons.green_dot} The '{name}' {item_type} has been successfully created within the '{workspace_name}' workspace."
     )


@@ -278,10 +287,9 @@ def get_item_definition(
     return_dataframe: bool = True,
     decode: bool = True,
 ):
-
     from sempy_labs._utils import item_types

-
+    workspace_id = resolve_workspace_id(workspace)
     item_id = resolve_item_id(item, type, workspace_id)
     item_type_url = item_types.get(type)[1]
     path = item_types.get(type)[2]
@@ -304,92 +312,11 @@ def get_item_definition(
         p.get("payload") for p in result["definition"]["parts"] if p.get("path") == path
     )
     if decode:
-        json.loads(_decode_b64(value))
+        return json.loads(_decode_b64(value))
     else:
         return value


-def resolve_item_id(
-    item: str | UUID, type: Optional[str] = None, workspace: Optional[str | UUID] = None
-) -> UUID:
-
-    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
-    item_id = None
-
-    if _is_valid_uuid(item):
-        # Check (optional)
-        item_id = item
-        try:
-            _base_api(
-                request=f"/v1/workspaces/{workspace_id}/items/{item_id}",
-                client="fabric_sp",
-            )
-        except FabricHTTPException:
-            raise ValueError(
-                f"{icons.red_dot} The '{item_id}' item was not found in the '{workspace_name}' workspace."
-            )
-    else:
-        if type is None:
-            raise ValueError(
-                f"{icons.red_dot} The 'type' parameter is required if specifying an item name."
-            )
-        responses = _base_api(
-            request=f"/v1/workspaces/{workspace_id}/items?type={type}",
-            client="fabric_sp",
-            uses_pagination=True,
-        )
-        for r in responses:
-            for v in r.get("value", []):
-                display_name = v.get("displayName")
-                if display_name == item:
-                    item_id = v.get("id")
-                    break
-
-        if item_id is None:
-            raise ValueError(
-                f"{icons.red_dot} There's no item '{item}' of type '{type}' in the '{workspace_name}' workspace."
-            )
-
-    return item_id
-
-
-def resolve_item_name_and_id(
-    item: str | UUID, type: Optional[str] = None, workspace: Optional[str | UUID] = None
-) -> Tuple[str, UUID]:
-
-    workspace_id = resolve_workspace_id(workspace)
-    item_id = resolve_item_id(item=item, type=type, workspace=workspace_id)
-    item_name = (
-        _base_api(
-            request=f"/v1/workspaces/{workspace_id}/items/{item_id}", client="fabric_sp"
-        )
-        .json()
-        .get("displayName")
-    )
-
-    return item_name, item_id
-
-
-def resolve_item_name(item_id: UUID, workspace: Optional[str | UUID] = None) -> str:
-
-    workspace_id = resolve_workspace_id(workspace)
-    try:
-        item_name = (
-            _base_api(
-                request=f"/v1/workspaces/{workspace_id}/items/{item_id}",
-                client="fabric_sp",
-            )
-            .json()
-            .get("displayName")
-        )
-    except FabricHTTPException:
-        raise ValueError(
-            f"{icons.red_dot} The '{item_id}' item was not found in the '{workspace_id}' workspace."
-        )
-
-    return item_name
-
-
 def resolve_lakehouse_name_and_id(
     lakehouse: Optional[str | UUID] = None, workspace: Optional[str | UUID] = None
 ) -> Tuple[str, UUID]:
@@ -663,11 +590,13 @@ def save_as_delta_table(
     workspace: Optional[str | UUID] = None,
 ):
     """
-    Saves a pandas dataframe as a delta table in a Fabric lakehouse.
+    Saves a pandas or spark dataframe as a delta table in a Fabric lakehouse.
+
+    This function may be executed in either a PySpark or pure Python notebook. If executing in a pure Python notebook, the dataframe must be a pandas dataframe.

     Parameters
     ----------
-    dataframe : pandas.DataFrame
+    dataframe : pandas.DataFrame | spark.Dataframe
         The dataframe to be saved as a delta table.
     delta_table_name : str
         The name of the delta table.
@@ -686,19 +615,6 @@ def save_as_delta_table(
         or if no lakehouse attached, resolves to the workspace of the notebook.
     """

-    from pyspark.sql.types import (
-        StringType,
-        IntegerType,
-        FloatType,
-        DateType,
-        StructType,
-        StructField,
-        BooleanType,
-        LongType,
-        DoubleType,
-        TimestampType,
-    )
-
     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
     (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
         lakehouse=lakehouse, workspace=workspace_id
@@ -717,52 +633,101 @@ def save_as_delta_table(
             f"{icons.red_dot} Invalid 'delta_table_name'. Delta tables in the lakehouse cannot have spaces in their names."
         )

-
+    import pyarrow as pa
+    from pyspark.sql.types import (
+        StringType,
+        IntegerType,
+        FloatType,
+        DateType,
+        StructType,
+        StructField,
+        BooleanType,
+        LongType,
+        DoubleType,
+        TimestampType,
+    )

-
-
-
-
-
-
-
-
-
-
-
-
-
+    def get_type_mapping(pure_python):
+        common_mapping = {
+            "string": ("pa", pa.string(), StringType()),
+            "str": ("pa", pa.string(), StringType()),
+            "integer": ("pa", pa.int32(), IntegerType()),
+            "int": ("pa", pa.int32(), IntegerType()),
+            "float": ("pa", pa.float32(), FloatType()),
+            "double": ("pa", pa.float64(), DoubleType()),
+            "long": ("pa", pa.int64(), LongType()),
+            "bool": ("pa", pa.bool_(), BooleanType()),
+            "boolean": ("pa", pa.bool_(), BooleanType()),
+            "date": ("pa", pa.date32(), DateType()),
+            "timestamp": ("pa", pa.timestamp("us"), TimestampType()),
+        }
+        return {k: v[1] if pure_python else v[2] for k, v in common_mapping.items()}

-
-
-
-
+    def build_schema(schema_dict, type_mapping, use_arrow=True):
+        if use_arrow:
+            fields = [
+                pa.field(name, type_mapping.get(dtype.lower()))
+                for name, dtype in schema_dict.items()
+            ]
+            return pa.schema(fields)
         else:
-
+            return StructType(
                 [
-                    StructField(
-                    for
+                    StructField(name, type_mapping.get(dtype.lower()), True)
+                    for name, dtype in schema_dict.items()
                 ]
             )
-
+
+    # Main logic
+    schema_map = None
+    if schema is not None:
+        use_arrow = _pure_python_notebook()
+        type_mapping = get_type_mapping(use_arrow)
+        schema_map = build_schema(schema, type_mapping, use_arrow)
+
+    if isinstance(dataframe, pd.DataFrame):
+        dataframe.columns = [col.replace(" ", "_") for col in dataframe.columns]
+        if _pure_python_notebook():
+            spark_df = dataframe
+        else:
+            spark = _create_spark_session()
+            if schema is None:
+                spark_df = spark.createDataFrame(dataframe)
+            else:
+                spark_df = spark.createDataFrame(dataframe, schema_map)
     else:
         for col_name in dataframe.columns:
             new_name = col_name.replace(" ", "_")
             dataframe = dataframe.withColumnRenamed(col_name, new_name)
         spark_df = dataframe

-
+    file_path = create_abfss_path(
         lakehouse_id=lakehouse_id,
         lakehouse_workspace_id=workspace_id,
         delta_table_name=delta_table_name,
     )

-    if
-
-
-
+    if _pure_python_notebook():
+        from deltalake import write_deltalake
+
+        write_args = {
+            "table_or_uri": file_path,
+            "data": spark_df,
+            "mode": write_mode,
+            "schema": schema_map,
+        }
+
+        if merge_schema:
+            write_args["schema_mode"] = "merge"
+
+        write_deltalake(**write_args)
     else:
-        spark_df.write.mode(write_mode).format("delta")
+        writer = spark_df.write.mode(write_mode).format("delta")
+        if merge_schema:
+            writer = writer.option("mergeSchema", "true")
+
+        writer.save(file_path)
+
     print(
         f"{icons.green_dot} The dataframe has been saved as the '{delta_table_name}' table in the '{lakehouse_name}' lakehouse within the '{workspace_name}' workspace."
     )
@@ -898,6 +863,87 @@ def resolve_workspace_name_and_id(
     return workspace_name, workspace_id


+def resolve_item_id(
+    item: str | UUID, type: Optional[str] = None, workspace: Optional[str | UUID] = None
+) -> UUID:
+
+    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
+    item_id = None
+
+    if _is_valid_uuid(item):
+        # Check (optional)
+        item_id = item
+        try:
+            _base_api(
+                request=f"/v1/workspaces/{workspace_id}/items/{item_id}",
+                client="fabric_sp",
+            )
+        except FabricHTTPException:
+            raise ValueError(
+                f"{icons.red_dot} The '{item_id}' item was not found in the '{workspace_name}' workspace."
+            )
+    else:
+        if type is None:
+            raise ValueError(
+                f"{icons.red_dot} The 'type' parameter is required if specifying an item name."
+            )
+        responses = _base_api(
+            request=f"/v1/workspaces/{workspace_id}/items?type={type}",
+            client="fabric_sp",
+            uses_pagination=True,
+        )
+        for r in responses:
+            for v in r.get("value", []):
+                display_name = v.get("displayName")
+                if display_name == item:
+                    item_id = v.get("id")
+                    break
+
+        if item_id is None:
+            raise ValueError(
+                f"{icons.red_dot} There's no item '{item}' of type '{type}' in the '{workspace_name}' workspace."
+            )
+
+    return item_id
+
+
+def resolve_item_name_and_id(
+    item: str | UUID, type: Optional[str] = None, workspace: Optional[str | UUID] = None
+) -> Tuple[str, UUID]:
+
+    workspace_id = resolve_workspace_id(workspace)
+    item_id = resolve_item_id(item=item, type=type, workspace=workspace_id)
+    item_name = (
+        _base_api(
+            request=f"/v1/workspaces/{workspace_id}/items/{item_id}", client="fabric_sp"
+        )
+        .json()
+        .get("displayName")
+    )
+
+    return item_name, item_id
+
+
+def resolve_item_name(item_id: UUID, workspace: Optional[str | UUID] = None) -> str:
+
+    workspace_id = resolve_workspace_id(workspace)
+    try:
+        item_name = (
+            _base_api(
+                request=f"/v1/workspaces/{workspace_id}/items/{item_id}",
+                client="fabric_sp",
+            )
+            .json()
+            .get("displayName")
+        )
+    except FabricHTTPException:
+        raise ValueError(
+            f"{icons.red_dot} The '{item_id}' item was not found in the '{workspace_id}' workspace."
+        )
+
+    return item_name
+
+
 def _extract_json(dataframe: pd.DataFrame) -> dict:

     payload = dataframe["payload"].iloc[0]
@@ -1497,32 +1543,108 @@ def generate_guid():

 def _get_column_aggregate(
     table_name: str,
-    column_name: str = "RunId",
+    column_name: str | List[str] = "RunId",
     lakehouse: Optional[str | UUID] = None,
     workspace: Optional[str | UUID] = None,
     function: str = "max",
     default_value: int = 0,
-
+    schema_name: Optional[str] = None,
+) -> int | Dict[str, int]:

-
-
+    workspace_id = resolve_workspace_id(workspace)
+    lakehouse_id = resolve_lakehouse_id(lakehouse, workspace_id)
+    path = create_abfss_path(lakehouse_id, workspace_id, table_name, schema_name)
+    df = _read_delta_table(path)

-    function = function.
-    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
-    lakehouse_id = resolve_lakehouse_id(lakehouse, workspace)
-    path = create_abfss_path(lakehouse_id, workspace_id, table_name)
+    function = function.lower()

-
-
+    if isinstance(column_name, str):
+        column_name = [column_name]
+
+    if _pure_python_notebook():
+        import polars as pl
+
+        if not isinstance(df, pd.DataFrame):
+            df.to_pandas()
+
+        df = pl.from_pandas(df)

-
-
-
-
+        def get_expr(col):
+            col_dtype = df.schema[col]
+
+            if "approx" in function:
+                return pl.col(col).unique().count().alias(col)
+            elif "distinct" in function:
+                if col_dtype == pl.Decimal:
+                    return pl.col(col).cast(pl.Float64).n_unique().alias(col)
+                else:
+                    return pl.col(col).n_unique().alias(col)
+            elif function == "sum":
+                return pl.col(col).sum().alias(col)
+            elif function == "min":
+                return pl.col(col).min().alias(col)
+            elif function == "max":
+                return pl.col(col).max().alias(col)
+            elif function == "count":
+                return pl.col(col).count().alias(col)
+            elif function in {"avg", "mean"}:
+                return pl.col(col).mean().alias(col)
+            else:
+                raise ValueError(f"Unsupported function: {function}")
+
+        exprs = [get_expr(col) for col in column_name]
+        aggs = df.select(exprs).to_dict(as_series=False)
+
+        if len(column_name) == 1:
+            result = aggs[column_name[0]][0] or default_value
+        else:
+            result = {col: aggs[col][0] for col in column_name}
     else:
-
+        from pyspark.sql.functions import (
+            count,
+            sum,
+            min,
+            max,
+            avg,
+            approx_count_distinct,
+            countDistinct,
+        )
+
+        result = None
+        if "approx" in function:
+            spark_func = approx_count_distinct
+        elif "distinct" in function:
+            spark_func = countDistinct
+        elif function == "count":
+            spark_func = count
+        elif function == "sum":
+            spark_func = sum
+        elif function == "min":
+            spark_func = min
+        elif function == "max":
+            spark_func = max
+        elif function == "avg":
+            spark_func = avg
+        else:
+            raise ValueError(f"Unsupported function: {function}")

-
+        agg_exprs = []
+        for col in column_name:
+            agg_exprs.append(spark_func(col).alias(col))
+
+        aggs = df.agg(*agg_exprs).collect()[0]
+        if len(column_name) == 1:
+            result = aggs[0] or default_value
+        else:
+            result = {col: aggs[col] for col in column_name}
+
+    return result
+
+
+def _create_spark_dataframe(df: pd.DataFrame):
+
+    spark = _create_spark_session()
+    return spark.createDataFrame(df)


 def _make_list_unique(my_list):
@@ -1617,6 +1739,9 @@ def _process_and_display_chart(df, title, widget):
     df["Start"] = df["Start"] - Offset
     df["End"] = df["End"] - Offset

+    unique_objects = df["Object Name"].nunique()
+    height = min(max(400, unique_objects * 30), 1000)
+
     # Vega-Lite spec for Gantt chart
     spec = (
         """{
@@ -1626,7 +1751,9 @@ def _process_and_display_chart(df, title, widget):
         + df.to_json(orient="records")
         + """ },
         "width": 700,
-        "height":
+        "height": """
+        + str(height)
+        + """,
         "mark": "bar",
         "encoding": {
             "y": {
@@ -1687,6 +1814,7 @@ def _convert_data_type(input_data_type: str) -> str:
         "double": "Double",
         "float": "Double",
         "binary": "Boolean",
+        "long": "Int64",
     }

     if "decimal" in input_data_type:
@@ -1842,6 +1970,18 @@ def _update_dataframe_datatypes(dataframe: pd.DataFrame, column_map: dict):
             dataframe[column] = dataframe[column].fillna(0).astype(int)
         elif data_type in ["str", "string"]:
             dataframe[column] = dataframe[column].astype(str)
+        # Avoid having empty lists or lists with a value of None.
+        elif data_type in ["list"]:
+            dataframe[column] = dataframe[column].apply(
+                lambda x: (
+                    None
+                    if (type(x) == list and len(x) == 1 and x[0] == None)
+                    or (type(x) == list and len(x) == 0)
+                    else x
+                )
+            )
+        elif data_type in ["dict"]:
+            dataframe[column] = dataframe[column]
         else:
             raise NotImplementedError

@@ -1878,18 +2018,58 @@ def _create_spark_session():
     return SparkSession.builder.getOrCreate()


-def
+def _get_delta_table(path: str) -> str:
+
+    from delta import DeltaTable

     spark = _create_spark_session()

-    return
+    return DeltaTable.forPath(spark, path)


-def
+def _read_delta_table(path: str, to_pandas: bool = True, to_df: bool = False):

-
+    if _pure_python_notebook():
+        from deltalake import DeltaTable
+
+        df = DeltaTable(table_uri=path)
+        if to_pandas:
+            df = df.to_pandas()
+    else:
+        spark = _create_spark_session()
+        df = spark.read.format("delta").load(path)
+        if to_df:
+            df = df.toDF()
+
+    return df
+
+
+def _read_delta_table_history(path) -> pd.DataFrame:
+
+    if _pure_python_notebook():
+        from deltalake import DeltaTable
+
+        df = pd.DataFrame(DeltaTable(table_uri=path).history())
+    else:
+        from delta import DeltaTable
+
+        spark = _create_spark_session()
+        delta_table = DeltaTable.forPath(spark, path)
+        df = delta_table.history().toPandas()
+
+    return df
+
+
+def _delta_table_row_count(path: str) -> int:

-
+    if _pure_python_notebook():
+        from deltalake import DeltaTable
+
+        dt = DeltaTable(path)
+        arrow_table = dt.to_pyarrow_table()
+        return arrow_table.num_rows
+    else:
+        return _read_delta_table(path).count()


 def _run_spark_sql_query(query):
@@ -2070,3 +2250,23 @@ def _xml_to_dict(element):
         element.text.strip() if element.text and element.text.strip() else None
     )
     return data
+
+
+def file_exists(file_path: str) -> bool:
+    """
+    Check if a file exists in the given path.
+
+    Parameters
+    ----------
+    file_path : str
+        The path to the file.
+
+    Returns
+    -------
+    bool
+        True if the file exists, False otherwise.
+    """
+
+    import notebookutils
+
+    return len(notebookutils.fs.ls(file_path)) > 0
sempy_labs/_kql_databases.py
CHANGED
@@ -6,6 +6,8 @@ from sempy_labs._helper_functions import (
     _create_dataframe,
     delete_item,
     create_item,
+    resolve_item_id,
+    resolve_workspace_id,
 )
 from uuid import UUID
 import sempy_labs._icons as icons
@@ -121,3 +123,19 @@ def delete_kql_database(
     )

     delete_item(item=kql_database, type="KQLDatabase", workspace=workspace)
+
+
+def _resolve_cluster_uri(
+    kql_database: str | UUID, workspace: Optional[str | UUID] = None
+) -> str:
+
+    workspace_id = resolve_workspace_id(workspace=workspace)
+    item_id = resolve_item_id(
+        item=kql_database, type="KQLDatabase", workspace=workspace
+    )
+    response = _base_api(
+        request=f"/v1/workspaces/{workspace_id}/kqlDatabases/{item_id}",
+        client="fabric_sp",
+    )
+
+    return response.json().get("properties", {}).get("queryServiceUri")
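The new _resolve_cluster_uri helper is a thin wrapper around a single item GET. A rough equivalent using the public FabricRestClient from sempy, shown only to illustrate the REST call and response field the helper relies on; the workspace and database IDs below are placeholders:

import sempy.fabric as fabric

# Placeholder GUIDs; substitute a real workspace and KQL database.
workspace_id = "00000000-0000-0000-0000-000000000000"
kql_database_id = "11111111-1111-1111-1111-111111111111"

client = fabric.FabricRestClient()
response = client.get(f"/v1/workspaces/{workspace_id}/kqlDatabases/{kql_database_id}")

# The Kusto cluster query endpoint is exposed under properties.queryServiceUri.
query_service_uri = response.json().get("properties", {}).get("queryServiceUri")
print(query_service_uri)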
|