semantic-link-labs 0.9.9__py3-none-any.whl → 0.9.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of semantic-link-labs has been flagged as potentially problematic.

Files changed (49)
  1. {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/METADATA +30 -22
  2. {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/RECORD +47 -40
  3. {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/WHEEL +1 -1
  4. sempy_labs/__init__.py +28 -1
  5. sempy_labs/_clear_cache.py +12 -0
  6. sempy_labs/_dax.py +8 -2
  7. sempy_labs/_delta_analyzer.py +17 -26
  8. sempy_labs/_environments.py +19 -1
  9. sempy_labs/_generate_semantic_model.py +7 -8
  10. sempy_labs/_helper_functions.py +351 -151
  11. sempy_labs/_kql_databases.py +18 -0
  12. sempy_labs/_kusto.py +137 -0
  13. sempy_labs/_list_functions.py +18 -36
  14. sempy_labs/_model_bpa_rules.py +13 -3
  15. sempy_labs/_notebooks.py +44 -11
  16. sempy_labs/_semantic_models.py +93 -1
  17. sempy_labs/_sql.py +3 -2
  18. sempy_labs/_tags.py +194 -0
  19. sempy_labs/_variable_libraries.py +89 -0
  20. sempy_labs/_vertipaq.py +6 -6
  21. sempy_labs/_vpax.py +386 -0
  22. sempy_labs/_warehouses.py +3 -3
  23. sempy_labs/admin/__init__.py +14 -0
  24. sempy_labs/admin/_artifacts.py +3 -3
  25. sempy_labs/admin/_capacities.py +161 -1
  26. sempy_labs/admin/_dataflows.py +45 -0
  27. sempy_labs/admin/_items.py +16 -11
  28. sempy_labs/admin/_tags.py +126 -0
  29. sempy_labs/admin/_tenant.py +5 -5
  30. sempy_labs/directlake/_generate_shared_expression.py +29 -26
  31. sempy_labs/directlake/_update_directlake_model_lakehouse_connection.py +55 -5
  32. sempy_labs/dotnet_lib/dotnet.runtime.config.json +10 -0
  33. sempy_labs/lakehouse/__init__.py +16 -0
  34. sempy_labs/lakehouse/_blobs.py +115 -63
  35. sempy_labs/lakehouse/_get_lakehouse_columns.py +41 -18
  36. sempy_labs/lakehouse/_get_lakehouse_tables.py +62 -47
  37. sempy_labs/lakehouse/_helper.py +211 -0
  38. sempy_labs/lakehouse/_lakehouse.py +45 -36
  39. sempy_labs/lakehouse/_livy_sessions.py +137 -0
  40. sempy_labs/migration/_migrate_calctables_to_lakehouse.py +7 -12
  41. sempy_labs/migration/_refresh_calc_tables.py +7 -6
  42. sempy_labs/report/_download_report.py +1 -1
  43. sempy_labs/report/_generate_report.py +5 -1
  44. sempy_labs/report/_reportwrapper.py +31 -18
  45. sempy_labs/tom/_model.py +104 -35
  46. sempy_labs/report/_bpareporttemplate/.pbi/localSettings.json +0 -9
  47. sempy_labs/report/_bpareporttemplate/.platform +0 -11
  48. {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/licenses/LICENSE +0 -0
  49. {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.11.dist-info}/top_level.txt +0 -0
sempy_labs/_helper_functions.py

@@ -8,7 +8,7 @@ from sempy.fabric.exceptions import FabricHTTPException, WorkspaceNotFoundExcept
  import pandas as pd
  from functools import wraps
  import datetime
- from typing import Optional, Tuple, List
+ from typing import Optional, Tuple, List, Dict
  from uuid import UUID
  import sempy_labs._icons as icons
  from azure.core.credentials import TokenCredential, AccessToken
@@ -74,6 +74,15 @@ def create_abfss_path(
      return path


+ def create_abfss_path_from_path(
+     lakehouse_id: UUID, workspace_id: UUID, file_path: str
+ ) -> str:
+
+     fp = _get_default_file_path()
+
+     return f"abfss://{workspace_id}@{fp}/{lakehouse_id}/{file_path}"
+
+
  def _get_default_file_path() -> str:

      default_file_storage = _get_fabric_context_setting(name="fs.defaultFS")
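
For orientation, a minimal sketch of how the new create_abfss_path_from_path helper could be called. The GUIDs below are placeholders, and onelake.dfs.fabric.microsoft.com is only the host that _get_default_file_path typically resolves to inside a Fabric notebook session; outside a notebook the Fabric context is not available.

    from uuid import UUID
    from sempy_labs._helper_functions import create_abfss_path_from_path

    # Placeholder IDs -- substitute real workspace and lakehouse GUIDs.
    workspace_id = UUID("11111111-1111-1111-1111-111111111111")
    lakehouse_id = UUID("22222222-2222-2222-2222-222222222222")

    # Builds a URI such as (host depends on the notebook's fs.defaultFS setting):
    # abfss://<workspace_id>@onelake.dfs.fabric.microsoft.com/<lakehouse_id>/Files/data.csv
    path = create_abfss_path_from_path(lakehouse_id, workspace_id, "Files/data.csv")
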
@@ -266,7 +275,7 @@ def create_item(
          lro_return_status_code=True,
      )
      print(
-         f"{icons.green_dot} The '{name}' {item_type} has been successfully created within the in the '{workspace_name}' workspace."
+         f"{icons.green_dot} The '{name}' {item_type} has been successfully created within the '{workspace_name}' workspace."
      )


@@ -278,10 +287,9 @@ def get_item_definition(
      return_dataframe: bool = True,
      decode: bool = True,
  ):
-
      from sempy_labs._utils import item_types

-     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
+     workspace_id = resolve_workspace_id(workspace)
      item_id = resolve_item_id(item, type, workspace_id)
      item_type_url = item_types.get(type)[1]
      path = item_types.get(type)[2]
@@ -304,92 +312,11 @@ def get_item_definition(
          p.get("payload") for p in result["definition"]["parts"] if p.get("path") == path
      )
      if decode:
-         json.loads(_decode_b64(value))
+         return json.loads(_decode_b64(value))
      else:
          return value


- def resolve_item_id(
-     item: str | UUID, type: Optional[str] = None, workspace: Optional[str | UUID] = None
- ) -> UUID:
-
-     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
-     item_id = None
-
-     if _is_valid_uuid(item):
-         # Check (optional)
-         item_id = item
-         try:
-             _base_api(
-                 request=f"/v1/workspaces/{workspace_id}/items/{item_id}",
-                 client="fabric_sp",
-             )
-         except FabricHTTPException:
-             raise ValueError(
-                 f"{icons.red_dot} The '{item_id}' item was not found in the '{workspace_name}' workspace."
-             )
-     else:
-         if type is None:
-             raise ValueError(
-                 f"{icons.red_dot} The 'type' parameter is required if specifying an item name."
-             )
-         responses = _base_api(
-             request=f"/v1/workspaces/{workspace_id}/items?type={type}",
-             client="fabric_sp",
-             uses_pagination=True,
-         )
-         for r in responses:
-             for v in r.get("value", []):
-                 display_name = v.get("displayName")
-                 if display_name == item:
-                     item_id = v.get("id")
-                     break
-
-     if item_id is None:
-         raise ValueError(
-             f"{icons.red_dot} There's no item '{item}' of type '{type}' in the '{workspace_name}' workspace."
-         )
-
-     return item_id
-
-
- def resolve_item_name_and_id(
-     item: str | UUID, type: Optional[str] = None, workspace: Optional[str | UUID] = None
- ) -> Tuple[str, UUID]:
-
-     workspace_id = resolve_workspace_id(workspace)
-     item_id = resolve_item_id(item=item, type=type, workspace=workspace_id)
-     item_name = (
-         _base_api(
-             request=f"/v1/workspaces/{workspace_id}/items/{item_id}", client="fabric_sp"
-         )
-         .json()
-         .get("displayName")
-     )
-
-     return item_name, item_id
-
-
- def resolve_item_name(item_id: UUID, workspace: Optional[str | UUID] = None) -> str:
-
-     workspace_id = resolve_workspace_id(workspace)
-     try:
-         item_name = (
-             _base_api(
-                 request=f"/v1/workspaces/{workspace_id}/items/{item_id}",
-                 client="fabric_sp",
-             )
-             .json()
-             .get("displayName")
-         )
-     except FabricHTTPException:
-         raise ValueError(
-             f"{icons.red_dot} The '{item_id}' item was not found in the '{workspace_id}' workspace."
-         )
-
-     return item_name
-
-
  def resolve_lakehouse_name_and_id(
      lakehouse: Optional[str | UUID] = None, workspace: Optional[str | UUID] = None
  ) -> Tuple[str, UUID]:
@@ -663,11 +590,13 @@ def save_as_delta_table(
      workspace: Optional[str | UUID] = None,
  ):
      """
-     Saves a pandas dataframe as a delta table in a Fabric lakehouse.
+     Saves a pandas or spark dataframe as a delta table in a Fabric lakehouse.
+
+     This function may be executed in either a PySpark or pure Python notebook. If executing in a pure Python notebook, the dataframe must be a pandas dataframe.

      Parameters
      ----------
-     dataframe : pandas.DataFrame
+     dataframe : pandas.DataFrame | spark.Dataframe
          The dataframe to be saved as a delta table.
      delta_table_name : str
          The name of the delta table.
@@ -686,19 +615,6 @@ def save_as_delta_table(
          or if no lakehouse attached, resolves to the workspace of the notebook.
      """

-     from pyspark.sql.types import (
-         StringType,
-         IntegerType,
-         FloatType,
-         DateType,
-         StructType,
-         StructField,
-         BooleanType,
-         LongType,
-         DoubleType,
-         TimestampType,
-     )
-
      (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
      (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
          lakehouse=lakehouse, workspace=workspace_id
@@ -717,52 +633,101 @@ def save_as_delta_table(
              f"{icons.red_dot} Invalid 'delta_table_name'. Delta tables in the lakehouse cannot have spaces in their names."
          )

-     spark = _create_spark_session()
+     import pyarrow as pa
+     from pyspark.sql.types import (
+         StringType,
+         IntegerType,
+         FloatType,
+         DateType,
+         StructType,
+         StructField,
+         BooleanType,
+         LongType,
+         DoubleType,
+         TimestampType,
+     )

-     type_mapping = {
-         "string": StringType(),
-         "str": StringType(),
-         "integer": IntegerType(),
-         "int": IntegerType(),
-         "float": FloatType(),
-         "date": DateType(),
-         "bool": BooleanType(),
-         "boolean": BooleanType(),
-         "long": LongType(),
-         "double": DoubleType(),
-         "timestamp": TimestampType(),
-     }
+     def get_type_mapping(pure_python):
+         common_mapping = {
+             "string": ("pa", pa.string(), StringType()),
+             "str": ("pa", pa.string(), StringType()),
+             "integer": ("pa", pa.int32(), IntegerType()),
+             "int": ("pa", pa.int32(), IntegerType()),
+             "float": ("pa", pa.float32(), FloatType()),
+             "double": ("pa", pa.float64(), DoubleType()),
+             "long": ("pa", pa.int64(), LongType()),
+             "bool": ("pa", pa.bool_(), BooleanType()),
+             "boolean": ("pa", pa.bool_(), BooleanType()),
+             "date": ("pa", pa.date32(), DateType()),
+             "timestamp": ("pa", pa.timestamp("us"), TimestampType()),
+         }
+         return {k: v[1] if pure_python else v[2] for k, v in common_mapping.items()}

-     if isinstance(dataframe, pd.DataFrame):
-         dataframe.columns = [col.replace(" ", "_") for col in dataframe.columns]
-         if schema is None:
-             spark_df = spark.createDataFrame(dataframe)
+     def build_schema(schema_dict, type_mapping, use_arrow=True):
+         if use_arrow:
+             fields = [
+                 pa.field(name, type_mapping.get(dtype.lower()))
+                 for name, dtype in schema_dict.items()
+             ]
+             return pa.schema(fields)
          else:
-             schema_map = StructType(
+             return StructType(
                  [
-                     StructField(column_name, type_mapping[data_type], True)
-                     for column_name, data_type in schema.items()
+                     StructField(name, type_mapping.get(dtype.lower()), True)
+                     for name, dtype in schema_dict.items()
                  ]
              )
-             spark_df = spark.createDataFrame(dataframe, schema_map)
+
+     # Main logic
+     schema_map = None
+     if schema is not None:
+         use_arrow = _pure_python_notebook()
+         type_mapping = get_type_mapping(use_arrow)
+         schema_map = build_schema(schema, type_mapping, use_arrow)
+
+     if isinstance(dataframe, pd.DataFrame):
+         dataframe.columns = [col.replace(" ", "_") for col in dataframe.columns]
+         if _pure_python_notebook():
+             spark_df = dataframe
+         else:
+             spark = _create_spark_session()
+             if schema is None:
+                 spark_df = spark.createDataFrame(dataframe)
+             else:
+                 spark_df = spark.createDataFrame(dataframe, schema_map)
      else:
          for col_name in dataframe.columns:
              new_name = col_name.replace(" ", "_")
              dataframe = dataframe.withColumnRenamed(col_name, new_name)
          spark_df = dataframe

-     filePath = create_abfss_path(
+     file_path = create_abfss_path(
          lakehouse_id=lakehouse_id,
          lakehouse_workspace_id=workspace_id,
          delta_table_name=delta_table_name,
      )

-     if merge_schema:
-         spark_df.write.mode(write_mode).format("delta").option(
-             "mergeSchema", "true"
-         ).save(filePath)
+     if _pure_python_notebook():
+         from deltalake import write_deltalake
+
+         write_args = {
+             "table_or_uri": file_path,
+             "data": spark_df,
+             "mode": write_mode,
+             "schema": schema_map,
+         }
+
+         if merge_schema:
+             write_args["schema_mode"] = "merge"
+
+         write_deltalake(**write_args)
      else:
-         spark_df.write.mode(write_mode).format("delta").save(filePath)
+         writer = spark_df.write.mode(write_mode).format("delta")
+         if merge_schema:
+             writer = writer.option("mergeSchema", "true")
+
+         writer.save(file_path)
+
      print(
          f"{icons.green_dot} The dataframe has been saved as the '{delta_table_name}' table in the '{lakehouse_name}' lakehouse within the '{workspace_name}' workspace."
      )
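
A hedged usage sketch of the reworked save_as_delta_table: in a pure Python notebook the write is routed through deltalake.write_deltalake (with a pyarrow schema when one is supplied), while in a PySpark notebook it still goes through the Spark Delta writer. The lakehouse and workspace names below are placeholders, and the keyword arguments are taken from the parameters visible in the hunks above.

    import pandas as pd
    from sempy_labs import save_as_delta_table

    df = pd.DataFrame({"RunId": [1, 2, 3], "Status": ["ok", "ok", "failed"]})

    save_as_delta_table(
        dataframe=df,                                  # pandas (or Spark) dataframe
        delta_table_name="run_log",
        write_mode="overwrite",                        # or "append"
        schema={"RunId": "int", "Status": "string"},   # optional explicit schema
        merge_schema=False,
        lakehouse="MyLakehouse",                       # placeholder name
        workspace="My Workspace",                      # placeholder name
    )
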
@@ -898,6 +863,87 @@ def resolve_workspace_name_and_id(
      return workspace_name, workspace_id


+ def resolve_item_id(
+     item: str | UUID, type: Optional[str] = None, workspace: Optional[str | UUID] = None
+ ) -> UUID:
+
+     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
+     item_id = None
+
+     if _is_valid_uuid(item):
+         # Check (optional)
+         item_id = item
+         try:
+             _base_api(
+                 request=f"/v1/workspaces/{workspace_id}/items/{item_id}",
+                 client="fabric_sp",
+             )
+         except FabricHTTPException:
+             raise ValueError(
+                 f"{icons.red_dot} The '{item_id}' item was not found in the '{workspace_name}' workspace."
+             )
+     else:
+         if type is None:
+             raise ValueError(
+                 f"{icons.red_dot} The 'type' parameter is required if specifying an item name."
+             )
+         responses = _base_api(
+             request=f"/v1/workspaces/{workspace_id}/items?type={type}",
+             client="fabric_sp",
+             uses_pagination=True,
+         )
+         for r in responses:
+             for v in r.get("value", []):
+                 display_name = v.get("displayName")
+                 if display_name == item:
+                     item_id = v.get("id")
+                     break
+
+     if item_id is None:
+         raise ValueError(
+             f"{icons.red_dot} There's no item '{item}' of type '{type}' in the '{workspace_name}' workspace."
+         )
+
+     return item_id
+
+
+ def resolve_item_name_and_id(
+     item: str | UUID, type: Optional[str] = None, workspace: Optional[str | UUID] = None
+ ) -> Tuple[str, UUID]:
+
+     workspace_id = resolve_workspace_id(workspace)
+     item_id = resolve_item_id(item=item, type=type, workspace=workspace_id)
+     item_name = (
+         _base_api(
+             request=f"/v1/workspaces/{workspace_id}/items/{item_id}", client="fabric_sp"
+         )
+         .json()
+         .get("displayName")
+     )
+
+     return item_name, item_id
+
+
+ def resolve_item_name(item_id: UUID, workspace: Optional[str | UUID] = None) -> str:
+
+     workspace_id = resolve_workspace_id(workspace)
+     try:
+         item_name = (
+             _base_api(
+                 request=f"/v1/workspaces/{workspace_id}/items/{item_id}",
+                 client="fabric_sp",
+             )
+             .json()
+             .get("displayName")
+         )
+     except FabricHTTPException:
+         raise ValueError(
+             f"{icons.red_dot} The '{item_id}' item was not found in the '{workspace_id}' workspace."
+         )
+
+     return item_name
+
+
  def _extract_json(dataframe: pd.DataFrame) -> dict:

      payload = dataframe["payload"].iloc[0]
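
Note that the three resolve_item_* helpers are relocated within the module rather than removed (compare the deletion hunk earlier in this file). A minimal sketch of resolve_item_name_and_id; the item and workspace names are placeholders:

    from sempy_labs._helper_functions import resolve_item_name_and_id

    # Accepts a display name or a UUID; 'type' is required when a name is given.
    name, item_id = resolve_item_name_and_id(
        item="Sales Report", type="Report", workspace="My Workspace"
    )
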
@@ -1497,32 +1543,108 @@ def generate_guid():

  def _get_column_aggregate(
      table_name: str,
-     column_name: str = "RunId",
+     column_name: str | List[str] = "RunId",
      lakehouse: Optional[str | UUID] = None,
      workspace: Optional[str | UUID] = None,
      function: str = "max",
      default_value: int = 0,
- ) -> int:
+     schema_name: Optional[str] = None,
+ ) -> int | Dict[str, int]:

-     from pyspark.sql.functions import approx_count_distinct
-     from pyspark.sql import functions as F
+     workspace_id = resolve_workspace_id(workspace)
+     lakehouse_id = resolve_lakehouse_id(lakehouse, workspace_id)
+     path = create_abfss_path(lakehouse_id, workspace_id, table_name, schema_name)
+     df = _read_delta_table(path)

-     function = function.upper()
-     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
-     lakehouse_id = resolve_lakehouse_id(lakehouse, workspace)
-     path = create_abfss_path(lakehouse_id, workspace_id, table_name)
+     function = function.lower()

-     spark = _create_spark_session()
-     df = spark.read.format("delta").load(path)
+     if isinstance(column_name, str):
+         column_name = [column_name]
+
+     if _pure_python_notebook():
+         import polars as pl
+
+         if not isinstance(df, pd.DataFrame):
+             df.to_pandas()
+
+         df = pl.from_pandas(df)

-     if function in {"COUNTDISTINCT", "DISTINCTCOUNT"}:
-         result = df.select(F.count_distinct(F.col(column_name)))
-     elif "APPROX" in function:
-         result = df.select(approx_count_distinct(column_name))
+         def get_expr(col):
+             col_dtype = df.schema[col]
+
+             if "approx" in function:
+                 return pl.col(col).unique().count().alias(col)
+             elif "distinct" in function:
+                 if col_dtype == pl.Decimal:
+                     return pl.col(col).cast(pl.Float64).n_unique().alias(col)
+                 else:
+                     return pl.col(col).n_unique().alias(col)
+             elif function == "sum":
+                 return pl.col(col).sum().alias(col)
+             elif function == "min":
+                 return pl.col(col).min().alias(col)
+             elif function == "max":
+                 return pl.col(col).max().alias(col)
+             elif function == "count":
+                 return pl.col(col).count().alias(col)
+             elif function in {"avg", "mean"}:
+                 return pl.col(col).mean().alias(col)
+             else:
+                 raise ValueError(f"Unsupported function: {function}")
+
+         exprs = [get_expr(col) for col in column_name]
+         aggs = df.select(exprs).to_dict(as_series=False)
+
+         if len(column_name) == 1:
+             result = aggs[column_name[0]][0] or default_value
+         else:
+             result = {col: aggs[col][0] for col in column_name}
      else:
-         result = df.selectExpr(f"{function}({column_name})")
+         from pyspark.sql.functions import (
+             count,
+             sum,
+             min,
+             max,
+             avg,
+             approx_count_distinct,
+             countDistinct,
+         )
+
+         result = None
+         if "approx" in function:
+             spark_func = approx_count_distinct
+         elif "distinct" in function:
+             spark_func = countDistinct
+         elif function == "count":
+             spark_func = count
+         elif function == "sum":
+             spark_func = sum
+         elif function == "min":
+             spark_func = min
+         elif function == "max":
+             spark_func = max
+         elif function == "avg":
+             spark_func = avg
+         else:
+             raise ValueError(f"Unsupported function: {function}")

-     return result.collect()[0][0] or default_value
+         agg_exprs = []
+         for col in column_name:
+             agg_exprs.append(spark_func(col).alias(col))
+
+         aggs = df.agg(*agg_exprs).collect()[0]
+         if len(column_name) == 1:
+             result = aggs[0] or default_value
+         else:
+             result = {col: aggs[col] for col in column_name}
+
+     return result
+
+
+ def _create_spark_dataframe(df: pd.DataFrame):
+
+     spark = _create_spark_session()
+     return spark.createDataFrame(df)


  def _make_list_unique(my_list):
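
A sketch of the extended _get_column_aggregate (a private helper, so the calls below are illustrative only). A single column still returns a scalar, while a list of columns now returns a dict keyed by column name; table, lakehouse, and workspace names are placeholders.

    from sempy_labs._helper_functions import _get_column_aggregate

    # Scalar: max RunId, falling back to default_value (0) for an empty table.
    last_run = _get_column_aggregate(
        table_name="run_log",
        column_name="RunId",
        function="max",
        lakehouse="MyLakehouse",
        workspace="My Workspace",
    )

    # Dict: distinct counts per column, e.g. {"RunId": 3, "Status": 2}.
    stats = _get_column_aggregate(
        table_name="run_log",
        column_name=["RunId", "Status"],
        function="distinctcount",
        lakehouse="MyLakehouse",
        workspace="My Workspace",
    )
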
@@ -1617,6 +1739,9 @@ def _process_and_display_chart(df, title, widget):
      df["Start"] = df["Start"] - Offset
      df["End"] = df["End"] - Offset

+     unique_objects = df["Object Name"].nunique()
+     height = min(max(400, unique_objects * 30), 1000)
+
      # Vega-Lite spec for Gantt chart
      spec = (
          """{
@@ -1626,7 +1751,9 @@ def _process_and_display_chart(df, title, widget):
          + df.to_json(orient="records")
          + """ },
      "width": 700,
-     "height": 400,
+     "height": """
+         + str(height)
+         + """,
      "mark": "bar",
      "encoding": {
          "y": {
@@ -1687,6 +1814,7 @@ def _convert_data_type(input_data_type: str) -> str:
          "double": "Double",
          "float": "Double",
          "binary": "Boolean",
+         "long": "Int64",
      }

      if "decimal" in input_data_type:
@@ -1842,6 +1970,18 @@ def _update_dataframe_datatypes(dataframe: pd.DataFrame, column_map: dict):
              dataframe[column] = dataframe[column].fillna(0).astype(int)
          elif data_type in ["str", "string"]:
              dataframe[column] = dataframe[column].astype(str)
+         # Avoid having empty lists or lists with a value of None.
+         elif data_type in ["list"]:
+             dataframe[column] = dataframe[column].apply(
+                 lambda x: (
+                     None
+                     if (type(x) == list and len(x) == 1 and x[0] == None)
+                     or (type(x) == list and len(x) == 0)
+                     else x
+                 )
+             )
+         elif data_type in ["dict"]:
+             dataframe[column] = dataframe[column]
          else:
              raise NotImplementedError

@@ -1878,18 +2018,58 @@ def _create_spark_session():
      return SparkSession.builder.getOrCreate()


- def _read_delta_table(path: str):
+ def _get_delta_table(path: str) -> str:
+
+     from delta import DeltaTable

      spark = _create_spark_session()

-     return spark.read.format("delta").load(path)
+     return DeltaTable.forPath(spark, path)


- def _delta_table_row_count(table_name: str) -> int:
+ def _read_delta_table(path: str, to_pandas: bool = True, to_df: bool = False):

-     spark = _create_spark_session()
+     if _pure_python_notebook():
+         from deltalake import DeltaTable
+
+         df = DeltaTable(table_uri=path)
+         if to_pandas:
+             df = df.to_pandas()
+     else:
+         spark = _create_spark_session()
+         df = spark.read.format("delta").load(path)
+         if to_df:
+             df = df.toDF()
+
+     return df
+
+
+ def _read_delta_table_history(path) -> pd.DataFrame:
+
+     if _pure_python_notebook():
+         from deltalake import DeltaTable
+
+         df = pd.DataFrame(DeltaTable(table_uri=path).history())
+     else:
+         from delta import DeltaTable
+
+         spark = _create_spark_session()
+         delta_table = DeltaTable.forPath(spark, path)
+         df = delta_table.history().toPandas()
+
+     return df
+
+
+ def _delta_table_row_count(path: str) -> int:

-     return spark.table(table_name).count()
+     if _pure_python_notebook():
+         from deltalake import DeltaTable
+
+         dt = DeltaTable(path)
+         arrow_table = dt.to_pyarrow_table()
+         return arrow_table.num_rows
+     else:
+         return _read_delta_table(path).count()


  def _run_spark_sql_query(query):
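
These are private helpers, but a rough sketch of the new dispatch behaviour under placeholder IDs: _read_delta_table returns a pandas DataFrame via the deltalake package in a pure Python notebook and a Spark DataFrame otherwise, and _delta_table_row_count (which now takes an abfss path instead of a table name) follows the same split.

    from sempy_labs._helper_functions import (
        create_abfss_path,
        _read_delta_table,
        _delta_table_row_count,
    )

    workspace_id = "11111111-1111-1111-1111-111111111111"   # placeholder
    lakehouse_id = "22222222-2222-2222-2222-222222222222"   # placeholder
    path = create_abfss_path(lakehouse_id, workspace_id, "run_log")

    df = _read_delta_table(path)         # pandas DataFrame (pure Python) or Spark DataFrame
    rows = _delta_table_row_count(path)  # deltalake/pyarrow row count or Spark .count()
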
@@ -2070,3 +2250,23 @@ def _xml_to_dict(element):
              element.text.strip() if element.text and element.text.strip() else None
          )
      return data
+
+
+ def file_exists(file_path: str) -> bool:
+     """
+     Check if a file exists in the given path.
+
+     Parameters
+     ----------
+     file_path : str
+         The path to the file.
+
+     Returns
+     -------
+     bool
+         True if the file exists, False otherwise.
+     """
+
+     import notebookutils
+
+     return len(notebookutils.fs.ls(file_path)) > 0
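
A short usage sketch of the new file_exists helper; the abfss path is a placeholder, and because the check goes through notebookutils it only works inside a Fabric notebook session.

    from sempy_labs._helper_functions import file_exists

    path = (
        "abfss://11111111-1111-1111-1111-111111111111@onelake.dfs.fabric.microsoft.com/"
        "22222222-2222-2222-2222-222222222222/Files/data.csv"
    )  # placeholder OneLake path

    if file_exists(path):
        print("File is present")
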
sempy_labs/_kql_databases.py

@@ -6,6 +6,8 @@ from sempy_labs._helper_functions import (
      _create_dataframe,
      delete_item,
      create_item,
+     resolve_item_id,
+     resolve_workspace_id,
  )
  from uuid import UUID
  import sempy_labs._icons as icons
@@ -121,3 +123,19 @@ def delete_kql_database(
      )

      delete_item(item=kql_database, type="KQLDatabase", workspace=workspace)
+
+
+ def _resolve_cluster_uri(
+     kql_database: str | UUID, workspace: Optional[str | UUID] = None
+ ) -> str:
+
+     workspace_id = resolve_workspace_id(workspace=workspace)
+     item_id = resolve_item_id(
+         item=kql_database, type="KQLDatabase", workspace=workspace
+     )
+     response = _base_api(
+         request=f"/v1/workspaces/{workspace_id}/kqlDatabases/{item_id}",
+         client="fabric_sp",
+     )
+
+     return response.json().get("properties", {}).get("queryServiceUri")
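
A minimal sketch of the new private _resolve_cluster_uri helper, which returns the KQL database's queryServiceUri property (its Kusto query endpoint). The database and workspace names below are placeholders.

    from sempy_labs._kql_databases import _resolve_cluster_uri

    cluster_uri = _resolve_cluster_uri(
        kql_database="MyEventhouseDB", workspace="My Workspace"
    )
    print(cluster_uri)  # the database's query service URI
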