semantic-link-labs 0.9.9__py3-none-any.whl → 0.9.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of semantic-link-labs might be problematic.
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.10.dist-info}/METADATA +5 -3
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.10.dist-info}/RECORD +29 -27
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.10.dist-info}/WHEEL +1 -1
- sempy_labs/__init__.py +6 -0
- sempy_labs/_clear_cache.py +12 -0
- sempy_labs/_dax.py +8 -2
- sempy_labs/_delta_analyzer.py +8 -18
- sempy_labs/_generate_semantic_model.py +6 -7
- sempy_labs/_helper_functions.py +205 -64
- sempy_labs/_kql_databases.py +18 -0
- sempy_labs/_kusto.py +135 -0
- sempy_labs/_list_functions.py +5 -1
- sempy_labs/_vertipaq.py +6 -6
- sempy_labs/_warehouses.py +3 -3
- sempy_labs/admin/__init__.py +6 -0
- sempy_labs/admin/_artifacts.py +3 -3
- sempy_labs/admin/_capacities.py +161 -1
- sempy_labs/admin/_dataflows.py +45 -0
- sempy_labs/admin/_items.py +16 -11
- sempy_labs/admin/_tenant.py +5 -5
- sempy_labs/directlake/_generate_shared_expression.py +25 -26
- sempy_labs/lakehouse/_get_lakehouse_columns.py +41 -18
- sempy_labs/lakehouse/_get_lakehouse_tables.py +66 -39
- sempy_labs/lakehouse/_lakehouse.py +44 -35
- sempy_labs/migration/_migrate_calctables_to_lakehouse.py +7 -12
- sempy_labs/migration/_refresh_calc_tables.py +7 -6
- sempy_labs/tom/_model.py +21 -14
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.10.dist-info}/licenses/LICENSE +0 -0
- {semantic_link_labs-0.9.9.dist-info → semantic_link_labs-0.9.10.dist-info}/top_level.txt +0 -0
sempy_labs/_helper_functions.py
CHANGED
@@ -8,7 +8,7 @@ from sempy.fabric.exceptions import FabricHTTPException, WorkspaceNotFoundException
 import pandas as pd
 from functools import wraps
 import datetime
-from typing import Optional, Tuple, List
+from typing import Optional, Tuple, List, Dict
 from uuid import UUID
 import sempy_labs._icons as icons
 from azure.core.credentials import TokenCredential, AccessToken
@@ -663,11 +663,13 @@ def save_as_delta_table(
     workspace: Optional[str | UUID] = None,
 ):
     """
-    Saves a pandas dataframe as a delta table in a Fabric lakehouse.
+    Saves a pandas or spark dataframe as a delta table in a Fabric lakehouse.
+
+    This function may be executed in either a PySpark or pure Python notebook. If executing in a pure Python notebook, the dataframe must be a pandas dataframe.
 
     Parameters
     ----------
-    dataframe : pandas.DataFrame
+    dataframe : pandas.DataFrame | spark.Dataframe
         The dataframe to be saved as a delta table.
     delta_table_name : str
         The name of the delta table.
@@ -686,19 +688,6 @@ def save_as_delta_table(
         or if no lakehouse attached, resolves to the workspace of the notebook.
     """
 
-    from pyspark.sql.types import (
-        StringType,
-        IntegerType,
-        FloatType,
-        DateType,
-        StructType,
-        StructField,
-        BooleanType,
-        LongType,
-        DoubleType,
-        TimestampType,
-    )
-
     (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
     (lakehouse_name, lakehouse_id) = resolve_lakehouse_name_and_id(
         lakehouse=lakehouse, workspace=workspace_id
@@ -717,52 +706,101 @@ def save_as_delta_table(
             f"{icons.red_dot} Invalid 'delta_table_name'. Delta tables in the lakehouse cannot have spaces in their names."
         )
 
-
+    import pyarrow as pa
+    from pyspark.sql.types import (
+        StringType,
+        IntegerType,
+        FloatType,
+        DateType,
+        StructType,
+        StructField,
+        BooleanType,
+        LongType,
+        DoubleType,
+        TimestampType,
+    )
 
-
-
-
-
-
-
-
-
-
-
-
-
+    def get_type_mapping(pure_python):
+        common_mapping = {
+            "string": ("pa", pa.string(), StringType()),
+            "str": ("pa", pa.string(), StringType()),
+            "integer": ("pa", pa.int32(), IntegerType()),
+            "int": ("pa", pa.int32(), IntegerType()),
+            "float": ("pa", pa.float32(), FloatType()),
+            "double": ("pa", pa.float64(), DoubleType()),
+            "long": ("pa", pa.int64(), LongType()),
+            "bool": ("pa", pa.bool_(), BooleanType()),
+            "boolean": ("pa", pa.bool_(), BooleanType()),
+            "date": ("pa", pa.date32(), DateType()),
+            "timestamp": ("pa", pa.timestamp("ms"), TimestampType()),
+        }
+        return {k: v[1] if pure_python else v[2] for k, v in common_mapping.items()}
 
-
-
-
-
+    def build_schema(schema_dict, type_mapping, use_arrow=True):
+        if use_arrow:
+            fields = [
+                pa.field(name, type_mapping.get(dtype.lower()))
+                for name, dtype in schema_dict.items()
+            ]
+            return pa.schema(fields)
         else:
-
+            return StructType(
                 [
-                    StructField(
-                    for
+                    StructField(name, type_mapping.get(dtype.lower()), True)
+                    for name, dtype in schema_dict.items()
                 ]
             )
-
+
+    # Main logic
+    schema_map = None
+    if schema is not None:
+        use_arrow = _pure_python_notebook()
+        type_mapping = get_type_mapping(use_arrow)
+        schema_map = build_schema(schema, type_mapping, use_arrow)
+
+    if isinstance(dataframe, pd.DataFrame):
+        dataframe.columns = [col.replace(" ", "_") for col in dataframe.columns]
+        if _pure_python_notebook():
+            spark_df = dataframe
+        else:
+            spark = _create_spark_session()
+            if schema is None:
+                spark_df = spark.createDataFrame(dataframe)
+            else:
+                spark_df = spark.createDataFrame(dataframe, schema_map)
     else:
         for col_name in dataframe.columns:
             new_name = col_name.replace(" ", "_")
             dataframe = dataframe.withColumnRenamed(col_name, new_name)
         spark_df = dataframe
 
-
+    file_path = create_abfss_path(
         lakehouse_id=lakehouse_id,
         lakehouse_workspace_id=workspace_id,
         delta_table_name=delta_table_name,
     )
 
-    if
-
-
-
+    if _pure_python_notebook():
+        from deltalake import write_deltalake
+
+        write_args = {
+            "table_or_uri": file_path,
+            "data": spark_df,
+            "mode": write_mode,
+            "schema": schema_map,
+        }
+
+        if merge_schema:
+            write_args["schema_mode"] = "merge"
+
+        write_deltalake(**write_args)
     else:
-        spark_df.write.mode(write_mode).format("delta")
+        writer = spark_df.write.mode(write_mode).format("delta")
+        if merge_schema:
+            writer = writer.option("mergeSchema", "true")
+
+        writer.save(file_path)
+
     print(
         f"{icons.green_dot} The dataframe has been saved as the '{delta_table_name}' table in the '{lakehouse_name}' lakehouse within the '{workspace_name}' workspace."
     )
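Illustrative usage (not part of the diff): based on the rewritten function above, a call like the one below should work in either a PySpark or pure Python Fabric notebook. The lakehouse and workspace names are placeholders, and the write_mode, merge_schema, and schema parameter names are inferred from the function body shown in the diff rather than from a published signature.

import pandas as pd
import sempy_labs as labs  # assumes save_as_delta_table remains exported at the package root

df = pd.DataFrame({"RunId": [1, 2], "Status": ["ok", "failed"]})

# The schema dict maps column names to the type names handled by get_type_mapping().
labs.save_as_delta_table(
    dataframe=df,
    delta_table_name="run_log",       # spaces are rejected by the check above
    lakehouse="MyLakehouse",          # placeholder lakehouse name
    workspace="MyWorkspace",          # placeholder workspace name
    write_mode="overwrite",
    merge_schema=False,
    schema={"RunId": "long", "Status": "string"},
)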
@@ -1497,32 +1535,82 @@ def generate_guid():
 
 def _get_column_aggregate(
     table_name: str,
-    column_name: str = "RunId",
+    column_name: str | List[str] = "RunId",
     lakehouse: Optional[str | UUID] = None,
     workspace: Optional[str | UUID] = None,
     function: str = "max",
     default_value: int = 0,
-) -> int:
+) -> int | Dict[str, int]:
+
+    workspace_id = resolve_workspace_id(workspace)
+    lakehouse_id = resolve_lakehouse_id(lakehouse, workspace_id)
+    path = create_abfss_path(lakehouse_id, workspace_id, table_name)
+    df = _read_delta_table(path)
+
+    if isinstance(column_name, str):
+        result = _get_aggregate(
+            df=df,
+            column_name=column_name,
+            function=function,
+            default_value=default_value,
+        )
+    elif isinstance(column_name, list):
+        result = {}
+        for col in column_name:
+            result[col] = _get_aggregate(
+                df=df,
+                column_name=col,
+                function=function,
+                default_value=default_value,
+            )
+    else:
+        raise TypeError("column_name must be a string or a list of strings.")
+
+    return result
+
 
-
-    from pyspark.sql import functions as F
+def _get_aggregate(df, column_name, function, default_value: int = 0) -> int:
 
     function = function.upper()
-    (workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace)
-    lakehouse_id = resolve_lakehouse_id(lakehouse, workspace)
-    path = create_abfss_path(lakehouse_id, workspace_id, table_name)
 
-
-
+    if _pure_python_notebook():
+        import polars as pl
+
+        if not isinstance(df, pd.DataFrame):
+            df.to_pandas()
 
-
-
-
-
+        df = pl.from_pandas(df)
+
+        # Perform aggregation
+        if "DISTINCT" in function:
+            if isinstance(df[column_name].dtype, pl.Decimal):
+                result = df[column_name].cast(pl.Float64).n_unique()
+            else:
+                result = df[column_name].n_unique()
+        elif "APPROX" in function:
+            result = df[column_name].unique().shape[0]
+        else:
+            try:
+                result = getattr(df[column_name], function.lower())()
+            except AttributeError:
+                raise ValueError(f"Unsupported function: {function}")
+
+        return result if result is not None else default_value
     else:
-
+        from pyspark.sql.functions import approx_count_distinct
+        from pyspark.sql import functions as F
+
+        if isinstance(df, pd.DataFrame):
+            df = _create_spark_dataframe(df)
 
-
+        if "DISTINCT" in function:
+            result = df.select(F.count_distinct(F.col(column_name)))
+        elif "APPROX" in function:
+            result = df.select(approx_count_distinct(column_name))
+        else:
+            result = df.selectExpr(f"{function}({column_name})")
+
+        return result.collect()[0][0] or default_value
 
 
 def _make_list_unique(my_list):
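Sketch of the new behaviour (not part of the diff): _get_column_aggregate is an internal helper, and the table and column names below are placeholders. A single column name still yields a scalar, while a list of column names now returns a dict keyed by column.

from sempy_labs._helper_functions import _get_column_aggregate

# Scalar result for a single column (previous behaviour).
max_run = _get_column_aggregate(table_name="run_log", column_name="RunId", function="max")

# Dict result, one aggregate per column, when a list is passed (new in 0.9.10).
maxes = _get_column_aggregate(
    table_name="run_log",
    column_name=["RunId", "BatchId"],   # placeholder column names
    function="max",
)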
@@ -1687,6 +1775,7 @@ def _convert_data_type(input_data_type: str) -> str:
         "double": "Double",
         "float": "Double",
         "binary": "Boolean",
+        "long": "Int64",
     }
 
     if "decimal" in input_data_type:
@@ -1842,6 +1931,18 @@ def _update_dataframe_datatypes(dataframe: pd.DataFrame, column_map: dict):
             dataframe[column] = dataframe[column].fillna(0).astype(int)
         elif data_type in ["str", "string"]:
             dataframe[column] = dataframe[column].astype(str)
+        # Avoid having empty lists or lists with a value of None.
+        elif data_type in ["list"]:
+            dataframe[column] = dataframe[column].apply(
+                lambda x: (
+                    None
+                    if (type(x) == list and len(x) == 1 and x[0] == None)
+                    or (type(x) == list and len(x) == 0)
+                    else x
+                )
+            )
+        elif data_type in ["dict"]:
+            dataframe[column] = dataframe[column]
         else:
             raise NotImplementedError
 
@@ -1878,18 +1979,58 @@ def _create_spark_session():
     return SparkSession.builder.getOrCreate()
 
 
-def
+def _get_delta_table(path: str) -> str:
+
+    from delta import DeltaTable
 
     spark = _create_spark_session()
 
-    return
+    return DeltaTable.forPath(spark, path)
 
 
-def
+def _read_delta_table(path: str, to_pandas: bool = True, to_df: bool = False):
 
-
+    if _pure_python_notebook():
+        from deltalake import DeltaTable
 
-
+        df = DeltaTable(table_uri=path)
+        if to_pandas:
+            df = df.to_pandas()
+    else:
+        spark = _create_spark_session()
+        df = spark.read.format("delta").load(path)
+        if to_df:
+            df = df.toDF()
+
+    return df
+
+
+def _read_delta_table_history(path) -> pd.DataFrame:
+
+    if _pure_python_notebook():
+        from deltalake import DeltaTable
+
+        df = pd.DataFrame(DeltaTable(table_uri=path).history())
+    else:
+        from delta import DeltaTable
+
+        spark = _create_spark_session()
+        delta_table = DeltaTable.forPath(spark, path)
+        df = delta_table.history().toPandas()
+
+    return df
+
+
+def _delta_table_row_count(path: str) -> int:
+
+    if _pure_python_notebook():
+        from deltalake import DeltaTable
+
+        dt = DeltaTable(path)
+        arrow_table = dt.to_pyarrow_table()
+        return arrow_table.num_rows
+    else:
+        return _read_delta_table(path).count()
 
 
 def _run_spark_sql_query(query):
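Sketch of how the new dual-engine helpers fit together (not part of the diff; the GUIDs and table name are placeholders): _read_delta_table returns a pandas DataFrame via deltalake in a pure Python notebook and a Spark DataFrame under PySpark, and _delta_table_row_count chooses the matching row-count path.

from sempy_labs._helper_functions import (
    create_abfss_path,
    _read_delta_table,
    _delta_table_row_count,
)

path = create_abfss_path(
    lakehouse_id="<lakehouse-guid>",
    lakehouse_workspace_id="<workspace-guid>",
    delta_table_name="run_log",
)

df = _read_delta_table(path)         # pandas DataFrame (pure Python) or Spark DataFrame (PySpark)
rows = _delta_table_row_count(path)  # deltalake/pyarrow row count or Spark .count()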
sempy_labs/_kql_databases.py
CHANGED
@@ -6,6 +6,8 @@ from sempy_labs._helper_functions import (
     _create_dataframe,
     delete_item,
     create_item,
+    resolve_item_id,
+    resolve_workspace_id,
 )
 from uuid import UUID
 import sempy_labs._icons as icons
@@ -121,3 +123,19 @@ def delete_kql_database(
     )
 
     delete_item(item=kql_database, type="KQLDatabase", workspace=workspace)
+
+
+def _resolve_cluster_uri(
+    kql_database: str | UUID, workspace: Optional[str | UUID] = None
+) -> str:
+
+    workspace_id = resolve_workspace_id(workspace=workspace)
+    item_id = resolve_item_id(
+        item=kql_database, type="KQLDatabase", workspace=workspace
+    )
+    response = _base_api(
+        request=f"/v1/workspaces/{workspace_id}/kqlDatabases/{item_id}",
+        client="fabric_sp",
+    )
+
+    return response.json().get("properties", {}).get("queryServiceUri")
sempy_labs/_kusto.py
ADDED
@@ -0,0 +1,135 @@
+import requests
+import pandas as pd
+from sempy.fabric.exceptions import FabricHTTPException
+from sempy._utils._log import log
+import sempy_labs._icons as icons
+from typing import Optional
+from uuid import UUID
+from sempy_labs._kql_databases import _resolve_cluster_uri
+from sempy_labs._helper_functions import resolve_item_id
+
+
+@log
+def query_kusto(
+    query: str,
+    kql_database: str | UUID,
+    workspace: Optional[str | UUID] = None,
+    language: str = "kql",
+) -> pd.DataFrame:
+    """
+    Runs a KQL query against a KQL database.
+
+    Parameters
+    ----------
+    query : str
+        The query (supports KQL or SQL - make sure to specify the language parameter accordingly).
+    kql_database : str | uuid.UUID
+        The KQL database name or ID.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    language : str, default="kql"
+        The language of the query. Currently "kql' and "sql" are supported.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A pandas dataframe showing the result of the KQL query.
+    """
+
+    import notebookutils
+
+    language = language.lower()
+    if language not in ["kql", "sql"]:
+        raise ValueError(
+            f"{icons._red_dot} Invalid language '{language}'. Only 'kql' and 'sql' are supported."
+        )
+
+    cluster_uri = _resolve_cluster_uri(kql_database=kql_database, workspace=workspace)
+    token = notebookutils.credentials.getToken(cluster_uri)
+
+    headers = {
+        "Authorization": f"Bearer {token}",
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+    }
+
+    kql_database_id = resolve_item_id(
+        item=kql_database, type="KQLDatabase", workspace=workspace
+    )
+    payload = {"db": kql_database_id, "csl": query}
+    if language == "sql":
+        payload["properties"] = {"Options": {"query_language": "sql"}}
+
+    response = requests.post(
+        f"{cluster_uri}/v1/rest/query",
+        headers=headers,
+        json=payload,
+    )
+
+    if response.status_code != 200:
+        raise FabricHTTPException(response)
+
+    results = response.json()
+    columns_info = results["Tables"][0]["Columns"]
+    rows = results["Tables"][0]["Rows"]
+
+    df = pd.DataFrame(rows, columns=[col["ColumnName"] for col in columns_info])
+
+    for col_info in columns_info:
+        col_name = col_info["ColumnName"]
+        data_type = col_info["DataType"]
+
+        try:
+            if data_type == "DateTime":
+                df[col_name] = pd.to_datetime(df[col_name])
+            elif data_type in ["Int64", "Int32", "Long"]:
+                df[col_name] = (
+                    pd.to_numeric(df[col_name], errors="coerce")
+                    .fillna(0)
+                    .astype("int64")
+                )
+            elif data_type == "Real" or data_type == "Double":
+                df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
+            else:
+                # Convert any other type to string, change as needed
+                df[col_name] = df[col_name].astype(str)
+        except Exception as e:
+            print(
+                f"{icons.yellow_dot} Could not convert column {col_name} to {data_type}, defaulting to string: {str(e)}"
+            )
+            df[col_name] = df[col_name].astype(str)
+
+    return df
+
+
+def query_workspace_monitoring(
+    query: str, workspace: Optional[str | UUID] = None, language: str = "kql"
+) -> pd.DataFrame:
+    """
+    Runs a query against the Fabric workspace monitoring database. Workspace monitoring must be enabled on the workspace to use this function.
+
+    Parameters
+    ----------
+    query : str
+        The query (supports KQL or SQL - make sure to specify the language parameter accordingly).
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or ID.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+    language : str, default="kql"
+        The language of the query. Currently "kql' and "sql" are supported.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A pandas dataframe showing the result of the query.
+    """
+
+    return query_kusto(
+        query=query,
+        kql_database="Monitoring KQL database",
+        workspace=workspace,
+        language=language,
+    )
sempy_labs/_list_functions.py
CHANGED
@@ -240,7 +240,11 @@ def list_tables(
                     "Columns": sum(
                         1 for c in t.Columns if str(c.Type) != "RowNumber"
                     ),
-                    "% DB":
+                    "% DB": (
+                        round((total_size / model_size) * 100, 2)
+                        if model_size not in (0, None, float("nan"))
+                        else 0.0
+                    ),
                 }
             )
 
sempy_labs/_vertipaq.py
CHANGED
@@ -8,7 +8,6 @@ import datetime
 import warnings
 from sempy_labs._helper_functions import (
     format_dax_object_name,
-    resolve_lakehouse_name,
     save_as_delta_table,
     resolve_workspace_capacity,
     _get_column_aggregate,
@@ -20,7 +19,6 @@ from sempy_labs._helper_functions import (
 )
 from sempy_labs._list_functions import list_relationships, list_tables
 from sempy_labs.lakehouse import lakehouse_attached, get_lakehouse_tables
-from sempy_labs.directlake import get_direct_lake_source
 from typing import Optional
 from sempy._utils._log import log
 import sempy_labs._icons as icons
@@ -176,10 +174,12 @@ def vertipaq_analyzer(
     )
 
     artifact_type = None
-
-
-
-
+    lakehouse_workspace_id = None
+    lakehouse_name = None
+    # if is_direct_lake:
+    #     artifact_type, lakehouse_name, lakehouse_id, lakehouse_workspace_id = (
+    #         get_direct_lake_source(dataset=dataset_id, workspace=workspace_id)
+    #     )
 
     dfR["Missing Rows"] = 0
     dfR["Missing Rows"] = dfR["Missing Rows"].astype(int)
sempy_labs/_warehouses.py
CHANGED
@@ -53,11 +53,11 @@ def create_warehouse(
             "defaultCollation"
         ] = "Latin1_General_100_CI_AS_KS_WS_SC_UTF8"
 
-
+    result = _base_api(
         request=f"/v1/workspaces/{workspace_id}/warehouses",
         payload=payload,
         method="post",
-
+        lro_return_json=True,
         status_codes=[201, 202],
     )
 
@@ -65,7 +65,7 @@ def create_warehouse(
         f"{icons.green_dot} The '{warehouse}' warehouse has been created within the '{workspace_name}' workspace."
     )
 
-    return
+    return result.get("id")
 
 
 def list_warehouses(workspace: Optional[str | UUID] = None) -> pd.DataFrame:
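With this change create_warehouse returns the id of the new warehouse, taken from the LRO response, instead of returning nothing. A minimal sketch, using the warehouse and workspace parameter names visible in the diff and placeholder item names:

import sempy_labs as labs

warehouse_id = labs.create_warehouse(
    warehouse="SalesWarehouse",   # placeholder warehouse name
    workspace="MyWorkspace",      # placeholder workspace name
)
print(warehouse_id)  # id of the newly created warehouse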
sempy_labs/admin/__init__.py
CHANGED
@@ -38,6 +38,7 @@ from sempy_labs.admin._capacities import (
     get_capacity_assignment_status,
     get_capacity_state,
     list_capacity_users,
+    get_refreshables,
 )
 from sempy_labs.admin._tenant import (
     list_tenant_settings,
@@ -80,6 +81,9 @@ from sempy_labs.admin._external_data_share import (
 from sempy_labs.admin._git import (
     list_git_connections,
 )
+from sempy_labs.admin._dataflows import (
+    export_dataflow,
+)
 
 __all__ = [
     "list_items",
@@ -133,4 +137,6 @@ __all__ = [
     "list_capacity_users",
     "list_user_subscriptions",
     "list_report_subscriptions",
+    "get_refreshables",
+    "export_dataflow",
 ]
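The diff only shows that get_refreshables and export_dataflow are now imported and listed in __all__; their parameters are not visible here, so the sketch below is limited to the new import surface:

from sempy_labs.admin import get_refreshables, export_dataflow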
sempy_labs/admin/_artifacts.py
CHANGED
@@ -31,7 +31,7 @@ def list_unused_artifacts(workspace: Optional[str | UUID] = None) -> pd.DataFrame:
         "Artifact Name": "string",
         "Artifact Id": "string",
         "Artifact Type": "string",
-        "Artifact Size in MB": "
+        "Artifact Size in MB": "string",
         "Created Date Time": "datetime",
         "Last Accessed Date Time": "datetime",
     }
@@ -47,8 +47,8 @@ def list_unused_artifacts(workspace: Optional[str | UUID] = None) -> pd.DataFrame:
     for r in responses:
         for i in r.get("unusedArtifactEntities", []):
             new_data = {
-                "Artifact Name": i.get("
-                "Artifact Id": i.get("
+                "Artifact Name": i.get("displayName"),
+                "Artifact Id": i.get("artifactId"),
                 "Artifact Type": i.get("artifactType"),
                 "Artifact Size in MB": i.get("artifactSizeInMB"),
                 "Created Date Time": i.get("createdDateTime"),