semantic-link-labs 0.9.9__py3-none-any.whl → 0.9.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of semantic-link-labs might be problematic.

@@ -5,6 +5,7 @@ from typing import Optional, Tuple
 from sempy._utils._log import log
 from sempy_labs._helper_functions import (
     _base_api,
+    _build_url,
     _create_dataframe,
     _update_dataframe_datatypes,
     _is_valid_uuid,
@@ -57,6 +58,24 @@ def _resolve_capacity_name_and_id(
     return capacity_name, capacity_id


+def _resolve_capacity_id(
+    capacity: str | UUID,
+) -> UUID:
+
+    if _is_valid_uuid(capacity):
+        capacity_id = capacity
+    else:
+        dfC = list_capacities(capacity=capacity)
+        if dfC.empty:
+            raise ValueError(
+                f"{icons.red_dot} The '{capacity}' capacity was not found."
+            )
+
+        capacity_id = dfC["Capacity Id"].iloc[0]
+
+    return capacity_id
+
+
 def _list_capacities_meta() -> pd.DataFrame:
     """
     Shows a list of capacities and their properties. This function is the admin version.
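
Not part of the diff: a minimal sketch of how the new name-or-UUID resolution helper is used. The module path below is an assumption (the diff viewer does not show file names), and the function is private, so this is illustration only rather than documented API.

```python
from sempy_labs.admin._capacities import _resolve_capacity_id  # module path assumed

# A UUID is returned unchanged; a name is resolved through list_capacities(capacity=...),
# and a ValueError is raised when no matching capacity is found.
capacity_id = _resolve_capacity_id(capacity="My Capacity")   # name lookup
same_id = _resolve_capacity_id(capacity=capacity_id)         # UUID passthrough
```
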
@@ -221,7 +240,7 @@ def list_capacities(
         "Sku": "string",
         "Region": "string",
         "State": "string",
-        "Admins": "string",
+        "Admins": "list",
     }
     df = _create_dataframe(columns=columns)

@@ -309,3 +328,144 @@ def list_capacity_users(capacity: str | UUID) -> pd.DataFrame:
     _update_dataframe_datatypes(dataframe=df, column_map=columns)

     return df
+
+
+@log
+def get_refreshables(
+    top: Optional[int] = None,
+    expand: Optional[str] = None,
+    filter: Optional[str] = None,
+    skip: Optional[int] = None,
+    capacity: Optional[str | UUID] = None,
+) -> pd.DataFrame | dict:
+    """
+    Returns a list of refreshables for the organization within a capacity.
+
+    Power BI retains a seven-day refresh history for each dataset, up to a maximum of sixty refreshes.
+
+    This is a wrapper function for the following API: `Admin - Get Refreshables <https://learn.microsoft.com/rest/api/power-bi/admin/get-refreshables>`_.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
+    Parameters
+    ----------
+    top : int, default=None
+        Returns only the first n results.
+    expand : str, default=None
+        Accepts a comma-separated list of data types, which will be expanded inline in the response. Supports capacities and groups.
+    filter : str, default=None
+        Returns a subset of results based on an OData filter query parameter condition.
+    skip : int, default=None
+        Skips the first n results. Use with top to fetch results beyond the first 1000.
+    capacity : str | uuid.UUID, default=None
+        The capacity name or ID to filter. If None, all capacities are returned.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Returns a list of refreshables for the organization within a capacity.
+    """
+
+    columns = {
+        "Workspace Id": "string",
+        "Workspace Name": "string",
+        "Item Id": "string",
+        "Item Name": "string",
+        "Item Kind": "string",
+        "Capacity Id": "string",
+        "Capacity Name": "string",
+        "Capacity SKU": "string",
+        "Refresh Count": "int",
+        "Refresh Failures": "int",
+        "Average Duration": "float",
+        "Median Duration": "float",
+        "Refreshes Per Day": "int",
+        "Refresh Type": "string",
+        "Start Time": "string",
+        "End Time": "string",
+        "Status": "string",
+        "Request Id": "string",
+        "Service Exception Json": "string",
+        "Extended Status": "dict",
+        "Refresh Attempts": "list",
+        "Refresh Schedule Days": "list",
+        "Refresh Schedule Times": "list",
+        "Refresh Schedule Enabled": "bool",
+        "Refresh Schedule Local Timezone Id": "string",
+        "Refresh Schedule Notify Option": "string",
+        "Configured By": "list",
+    }
+
+    df = _create_dataframe(columns=columns)
+
+    params = {}
+    url = (
+        "/v1.0/myorg/admin/capacities/refreshables"
+        if capacity is None
+        else f"/v1.0/myorg/admin/capacities/{_resolve_capacity_id(capacity=capacity)}/refreshables"
+    )
+
+    if top is not None:
+        params["$top"] = top
+
+    if expand is not None:
+        params["$expand"] = expand
+
+    if filter is not None:
+        params["$filter"] = filter
+
+    if skip is not None:
+        params["$skip"] = skip
+
+    url = _build_url(url, params)
+
+    responses = _base_api(request=url, client="fabric_sp")
+
+    refreshables = []
+
+    for i in responses.json().get("value", []):
+        last_refresh = i.get("lastRefresh", {})
+        refresh_schedule = i.get("refreshSchedule", {})
+        new_data = {
+            "Workspace Id": i.get("group", {}).get("id"),
+            "Workspace Name": i.get("group", {}).get("name"),
+            "Item Id": i.get("id"),
+            "Item Name": i.get("name"),
+            "Item Kind": i.get("kind"),
+            "Capacity Id": (
+                i.get("capacity", {}).get("id").lower()
+                if i.get("capacity", {}).get("id")
+                else None
+            ),
+            "Capacity Name": i.get("capacity", {}).get("displayName"),
+            "Capacity SKU": i.get("capacity", {}).get("sku"),
+            "Refresh Count": i.get("refreshCount", 0),
+            "Refresh Failures": i.get("refreshFailures", 0),
+            "Average Duration": i.get("averageDuration", 0),
+            "Median Duration": i.get("medianDuration", 0),
+            "Refreshes Per Day": i.get("refreshesPerDay", 0),
+            "Refresh Type": last_refresh.get("refreshType"),
+            "Start Time": last_refresh.get("startTime"),
+            "End Time": last_refresh.get("endTime"),
+            "Status": last_refresh.get("status"),
+            "Request Id": last_refresh.get("requestId"),
+            "Service Exception Json": last_refresh.get("serviceExceptionJson"),
+            "Extended Status": last_refresh.get("extendedStatus"),
+            "Refresh Attempts": last_refresh.get("refreshAttempts"),
+            "Refresh Schedule Days": refresh_schedule.get("days"),
+            "Refresh Schedule Times": refresh_schedule.get("times"),
+            "Refresh Schedule Enabled": refresh_schedule.get("enabled"),
+            "Refresh Schedule Local Timezone Id": refresh_schedule.get(
+                "localTimeZoneId"
+            ),
+            "Refresh Schedule Notify Option": refresh_schedule.get("notifyOption"),
+            "Configured By": i.get("configuredBy"),
+        }
+
+        refreshables.append(new_data)
+
+    if len(refreshables) > 0:
+        df = pd.DataFrame(refreshables)
+        _update_dataframe_datatypes(dataframe=df, column_map=columns)
+
+    return df
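
Not part of the diff: a hedged usage sketch for the new `get_refreshables` wrapper, assuming it is surfaced through the `sempy_labs.admin` namespace like the other admin functions in this module, and that the `$expand` literal follows the underlying API's wording.

```python
import sempy_labs.admin as admin  # exported name assumed

# All capacities, expanding the capacity and group objects so the
# "Capacity Name" / "Workspace Name" columns are populated.
df_all = admin.get_refreshables(expand="capacity,group")  # expand literal assumed

# Scope to a single capacity (name or UUID) and page with $top / $skip,
# mirroring the Power BI admin Get Refreshables API.
df_cap = admin.get_refreshables(capacity="My Capacity", top=100, skip=0)

print(df_cap[["Item Name", "Refresh Count", "Refresh Failures", "Average Duration"]])
```
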
@@ -0,0 +1,45 @@
+from typing import Optional
+from sempy_labs._helper_functions import (
+    _base_api,
+)
+from sempy_labs.admin._items import (
+    _resolve_item_id,
+)
+from uuid import UUID
+from sempy._utils._log import log
+
+
+@log
+def export_dataflow(
+    dataflow: str | UUID,
+    workspace: Optional[str | UUID] = None,
+) -> dict:
+    """
+    Exports the specified dataflow definition as JSON.
+
+    This is a wrapper function for the following API: `Admin - Dataflows ExportDataflowAsAdmin <https://learn.microsoft.com/rest/api/power-bi/admin/dataflows-export-dataflow-as-admin>`_.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
+    Parameters
+    ----------
+    dataflow : str | UUID, default=None
+        The dataflow Name or Id.
+    workspace : str | uuid.UUID, default=None
+        The Fabric workspace name or id.
+        Defaults to None which resolves to the workspace of the attached lakehouse
+        or if no lakehouse attached, resolves to the workspace of the notebook.
+        Only used if given a dataflow name and not an id.
+
+    Returns
+    -------
+    dict
+        Exported Json file.
+    """
+    dataflow_id = _resolve_item_id(item=dataflow, type="dataflow", workspace=workspace)
+
+    url = f"/v1.0/myorg/admin/dataflows/{dataflow_id}/export"
+
+    response = _base_api(request=url, client="fabric_sp")
+
+    return response.json()
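
Not part of the diff: a usage sketch for the new `export_dataflow` admin wrapper, assuming it is exposed via `sempy_labs.admin` and that persisting the returned dictionary is left to the caller (the path below assumes a lakehouse is attached to the notebook).

```python
import json
import sempy_labs.admin as admin  # exported name assumed

# By name (workspace disambiguates) or directly by dataflow Id.
definition = admin.export_dataflow(dataflow="My Dataflow", workspace="Sales")

# The wrapper returns the exported definition as a dict; write it out yourself.
with open("/lakehouse/default/Files/my_dataflow.json", "w") as f:
    json.dump(definition, f, indent=2)
```
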
@@ -17,20 +17,26 @@ from sempy_labs._helper_functions import (


 def _resolve_item_id(
-    item_name: str,
+    item: str,
     type: Optional[str] = None,
     workspace: Optional[str | UUID] = None,
 ) -> UUID:
+    if _is_valid_uuid(item):
+        item_id = item

-    dfI = list_items(workspace=workspace, type=type)
-    dfI_filt = dfI[dfI["Item Name"] == item_name]
+    else:
+        workspace_id = _resolve_workspace_name_and_id(workspace)[1]
+        dfI = list_items(workspace=workspace_id, type=type)
+        dfI_filt = dfI[dfI["Item Name"] == item]

-    if len(dfI_filt) == 0:
-        raise ValueError(
-            f"The '{item_name}' {type} does not exist within the '{workspace}' workspace or is not of type '{type}'."
-        )
+        if len(dfI_filt) == 0:
+            raise ValueError(
+                f"The '{item}' {type} does not exist within the '{workspace}' workspace or is not of type '{type}'."
+            )
+
+        item_id = dfI_filt["Item Id"].iloc[0]

-    return dfI_filt["Item Id"].iloc[0]
+    return item_id


 def _resolve_item_name_and_id(
@@ -84,9 +90,8 @@ def list_items(
     capacity : str | uuid.UUID, default=None
         The capacity name or id.
     workspace : str | uuid.UUID, default=None
-        The Fabric workspace name.
-        Defaults to None which resolves to the workspace of the attached lakehouse
-        or if no lakehouse attached, resolves to the workspace of the notebook.
+        The Fabric workspace name or id.
+        Defaults to None which looks into all the workspaces.
     state : str, default=None
         The item state.
     type : str, default=None
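
Not part of the diff: per the updated docstring, `list_items` now scans all workspaces when `workspace=None`. A short sketch, assuming the function is exposed via `sempy_labs.admin`.

```python
import sempy_labs.admin as admin  # exported name assumed

# Across every workspace the caller can see (workspace=None is the documented default).
all_lakehouses = admin.list_items(type="Lakehouse")

# Or narrowed to one workspace, by name or by UUID.
sales_reports = admin.list_items(workspace="Sales", type="Report")
```
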
@@ -32,7 +32,7 @@ def list_tenant_settings() -> pd.DataFrame:
         "Enabled": "bool",
         "Can Specify Security Groups": "bool",
         "Tenant Setting Group": "string",
-        "Enabled Security Groups": "string",
+        "Enabled Security Groups": "list",
     }
     df = _create_dataframe(columns=columns)

@@ -86,9 +86,9 @@ def list_capacity_tenant_settings_overrides(
         "Setting Title": "string",
         "Setting Enabled": "bool",
         "Can Specify Security Groups": "bool",
-        "Enabled Security Groups": "string",
+        "Enabled Security Groups": "list",
         "Tenant Setting Group": "string",
-        "Tenant Setting Properties": "string",
+        "Tenant Setting Properties": "list",
         "Delegate to Workspace": "bool",
         "Delegated From": "string",
     }
@@ -395,7 +395,7 @@ def list_workspaces_tenant_settings_overrides() -> pd.DataFrame:
         "Title": "string",
         "Enabled": "bool",
         "Can Specify Security Groups": "bool",
-        "Enabled Security Groups": "string",
+        "Enabled Security Groups": "list",
         "Tenant Setting Group": "string",
         "Delegated From": "string",
     }
@@ -454,7 +454,7 @@ def list_domain_tenant_settings_overrides() -> pd.DataFrame:
         "Title": "string",
         "Enabled": "bool",
         "Can Specify Security Groups": "bool",
-        "Enabled Security Groups": "string",
+        "Enabled Security Groups": "list",
         "Tenant Setting Group": "string",
         "Delegated To Workspace": "bool",
         "Delegated From": "string",
@@ -56,34 +56,33 @@ def generate_shared_expression(
         item=item_name, type=item_type, workspace=workspace_id
     )

-    item_type_rest = f"{item_type.lower()}s"
-    response = _base_api(
-        request=f"/v1/workspaces/{workspace_id}/{item_type_rest}/{item_id}"
-    )
+    if use_sql_endpoint:
+        item_type_rest = f"{item_type.lower()}s"
+        response = _base_api(
+            request=f"/v1/workspaces/{workspace_id}/{item_type_rest}/{item_id}"
+        )

-    prop = response.json().get("properties")
+        prop = response.json().get("properties")

-    if item_type == "Lakehouse":
-        sqlprop = prop.get("sqlEndpointProperties")
-        sqlEPCS = sqlprop.get("connectionString")
-        sqlepid = sqlprop.get("id")
-        provStatus = sqlprop.get("provisioningStatus")
-    elif item_type == "Warehouse":
-        sqlEPCS = prop.get("connectionString")
-        sqlepid = item_id
-        provStatus = None
+        if item_type == "Lakehouse":
+            sqlprop = prop.get("sqlEndpointProperties")
+            sqlEPCS = sqlprop.get("connectionString")
+            sqlepid = sqlprop.get("id")
+            provStatus = sqlprop.get("provisioningStatus")
+        elif item_type == "Warehouse":
+            sqlEPCS = prop.get("connectionString")
+            sqlepid = item_id
+            provStatus = None

-    if provStatus == "InProgress":
-        raise ValueError(
-            f"{icons.red_dot} The SQL Endpoint for the '{item_name}' lakehouse within the '{workspace_name}' workspace has not yet been provisioned. Please wait until it has been provisioned."
-        )
+        if provStatus == "InProgress":
+            raise ValueError(
+                f"{icons.red_dot} The SQL Endpoint for the '{item_name}' {item_type.lower()} within the '{workspace_name}' workspace has not yet been provisioned. Please wait until it has been provisioned."
+            )

-    start_expr = "let\n\tdatabase = "
-    end_expr = "\nin\n\tdatabase"
-    mid_expr = f'Sql.Database("{sqlEPCS}", "{sqlepid}")'
-
-    # Build DL/OL expression
-    if not use_sql_endpoint and item_type == "Lakehouse":
-        return f'AzureDataLakeStorage{{"server":"onelake.dfs.fabric.microsoft.com","path":"/{workspace_id}/{item_id}/"}}'
-    else:
+        start_expr = "let\n\tdatabase = "
+        end_expr = "\nin\n\tdatabase"
+        mid_expr = f'Sql.Database("{sqlEPCS}", "{sqlepid}")'
         return f"{start_expr}{mid_expr}{end_expr}"
+    else:
+        # Build DL/OL expression
+        return f"""let\n\tSource = AzureStorage.DataLake("onelake.dfs.fabric.microsoft.com/{workspace_id}/{item_id}")\nin\n\tSource"""
@@ -1,14 +1,17 @@
 import pandas as pd
+import re
 from sempy_labs._helper_functions import (
     format_dax_object_name,
     resolve_workspace_name_and_id,
     resolve_lakehouse_name_and_id,
     _create_dataframe,
-    _create_spark_session,
+    _get_delta_table,
+    _pure_python_notebook,
 )
 from typing import Optional
 from sempy._utils._log import log
 from uuid import UUID
+import sempy_labs._icons as icons


 @log
@@ -16,7 +19,9 @@ def get_lakehouse_columns(
     lakehouse: Optional[str | UUID] = None, workspace: Optional[str | UUID] = None
 ) -> pd.DataFrame:
     """
-    Shows the tables and columns of a lakehouse and their respective properties.
+    Shows the tables and columns of a lakehouse and their respective properties. This function can be executed in either a PySpark or pure Python notebook. Note that data types may show differently when using PySpark vs pure Python.
+
+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).

     Parameters
     ----------
@@ -34,7 +39,6 @@ def get_lakehouse_columns(
         Shows the tables/columns within a lakehouse and their properties.
     """
     from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables
-    from delta import DeltaTable

     columns = {
         "Workspace Name": "string",
@@ -51,29 +55,48 @@ def get_lakehouse_columns(
         lakehouse=lakehouse, workspace=workspace_id
     )

-    spark = _create_spark_session()
-
     tables = get_lakehouse_tables(
         lakehouse=lakehouse_id, workspace=workspace_id, extended=False, count_rows=False
     )
     tables_filt = tables[tables["Format"] == "delta"]

-    for _, r in tables_filt.iterrows():
-        table_name = r["Table Name"]
-        path = r["Location"]
-        delta_table = DeltaTable.forPath(spark, path)
-        sparkdf = delta_table.toDF()
-
-        for col_name, data_type in sparkdf.dtypes:
-            full_column_name = format_dax_object_name(table_name, col_name)
-            new_data = {
+    def add_column_metadata(table_name, col_name, data_type):
+        new_rows.append(
+            {
                 "Workspace Name": workspace_name,
-                "Lakehouse Name": lakehouse,
+                "Lakehouse Name": lakehouse_name,
                 "Table Name": table_name,
                 "Column Name": col_name,
-                "Full Column Name": full_column_name,
+                "Full Column Name": format_dax_object_name(table_name, col_name),
                 "Data Type": data_type,
             }
-            df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True)
+        )
+
+    new_rows = []
+
+    for _, r in tables_filt.iterrows():
+        table_name = r["Table Name"]
+        path = r["Location"]
+
+        if _pure_python_notebook():
+            from deltalake import DeltaTable
+
+            table_schema = DeltaTable(path).schema()
+
+            for field in table_schema.fields:
+                col_name = field.name
+                match = re.search(r'"(.*?)"', str(field.type))
+                if not match:
+                    raise ValueError(
+                        f"{icons.red_dot} Could not find data type for column {col_name}."
+                    )
+                data_type = match.group(1)
+                add_column_metadata(table_name, col_name, data_type)
+        else:
+            delta_table = _get_delta_table(path=path)
+            table_df = delta_table.toDF()
+
+            for col_name, data_type in table_df.dtypes:
+                add_column_metadata(table_name, col_name, data_type)

-    return df
+    return pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
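
Not part of the diff: a usage sketch for the reworked `get_lakehouse_columns`, which now uses `deltalake` in pure Python notebooks and Spark otherwise. The import path is assumed to be `sempy_labs.lakehouse`.

```python
from sempy_labs.lakehouse import get_lakehouse_columns  # import path assumed

# Works in both PySpark and pure Python notebooks; the "Data Type" strings may
# differ between the two engines, as noted in the updated docstring.
df = get_lakehouse_columns(lakehouse="MyLakehouse", workspace="Sales")
print(df[["Table Name", "Column Name", "Data Type"]].head())
```
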
@@ -1,7 +1,7 @@
-import sempy.fabric as fabric
+import os
 import pandas as pd
 import pyarrow.parquet as pq
-import datetime
+from datetime import datetime
 from sempy_labs._helper_functions import (
     _get_column_aggregate,
     resolve_workspace_name_and_id,
@@ -9,7 +9,13 @@ from sempy_labs._helper_functions import (
     save_as_delta_table,
     _base_api,
     _create_dataframe,
-    _create_spark_session,
+    resolve_workspace_id,
+    resolve_lakehouse_id,
+    _read_delta_table,
+    _get_delta_table,
+    _mount,
+    create_abfss_path,
+    _pure_python_notebook,
 )
 from sempy_labs.directlake._guardrails import (
     get_sku_size,
@@ -33,8 +39,12 @@ def get_lakehouse_tables(
     """
     Shows the tables of a lakehouse and their respective properties. Option to include additional properties relevant to Direct Lake guardrails.

+    This function can be executed in either a PySpark or pure Python notebook.
+
     This is a wrapper function for the following API: `Tables - List Tables <https://learn.microsoft.com/rest/api/fabric/lakehouse/tables/list-tables>`_ plus extended capabilities.

+    Service Principal Authentication is supported (see `here <https://github.com/microsoft/semantic-link-labs/blob/main/notebooks/Service%20Principal.ipynb>`_ for examples).
+
     Parameters
     ----------
     lakehouse : str | uuid.UUID, default=None
@@ -76,8 +86,8 @@ def get_lakehouse_tables(
         extended = True

     if (
-        workspace_id != fabric.get_workspace_id()
-        and lakehouse_id != fabric.get_lakehouse_id()
+        workspace_id != resolve_workspace_id()
+        and lakehouse_id != resolve_lakehouse_id()
         and count_rows
     ):
         raise ValueError(
@@ -88,6 +98,7 @@ def get_lakehouse_tables(
     responses = _base_api(
         request=f"v1/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/tables",
         uses_pagination=True,
+        client="fabric_sp",
     )

     if not responses[0].get("data"):
@@ -112,40 +123,59 @@ def get_lakehouse_tables(
     if extended:
         sku_value = get_sku_size(workspace_id)
         guardrail = get_directlake_guardrails_for_sku(sku_value)
-        spark = _create_spark_session()
-        df["Files"] = None
-        df["Row Groups"] = None
-        df["Table Size"] = None
+        local_path = _mount()
+
+        df["Files"], df["Row Groups"], df["Table Size"] = None, None, None
         if count_rows:
             df["Row Count"] = None
+
         for i, r in df.iterrows():
-            tName = r["Table Name"]
+            table_name = r["Table Name"]
            if r["Type"] == "Managed" and r["Format"] == "delta":
-                detail_df = spark.sql(f"DESCRIBE DETAIL `{tName}`").collect()[0]
-                num_files = detail_df.numFiles
-                size_in_bytes = detail_df.sizeInBytes
-
-                delta_table_path = f"Tables/{tName}"
-                latest_files = (
-                    spark.read.format("delta").load(delta_table_path).inputFiles()
+                delta_table_path = create_abfss_path(
+                    lakehouse_id, workspace_id, table_name
                 )
-                file_paths = [f.split("/")[-1] for f in latest_files]

-                # Handle FileNotFoundError
+                if _pure_python_notebook():
+                    from deltalake import DeltaTable
+
+                    delta_table = DeltaTable(delta_table_path)
+                    latest_files = [
+                        file["path"]
+                        for file in delta_table.get_add_actions().to_pylist()
+                    ]
+                    size_in_bytes = 0
+                    for f in latest_files:
+                        local_file_path = os.path.join(
+                            local_path, "Tables", table_name, os.path.basename(f)
+                        )
+                        if os.path.exists(local_file_path):
+                            size_in_bytes += os.path.getsize(local_file_path)
+                    num_latest_files = len(latest_files)
+                else:
+                    delta_table = _get_delta_table(delta_table_path)
+                    latest_files = _read_delta_table(delta_table_path).inputFiles()
+                    table_df = delta_table.toDF()
+                    table_details = delta_table.detail().collect()[0].asDict()
+                    num_latest_files = table_details.get("numFiles", 0)
+                    size_in_bytes = table_details.get("sizeInBytes", 0)
+
+                table_path = os.path.join(local_path, "Tables", table_name)
+                file_paths = [os.path.basename(f) for f in latest_files]
+
                 num_rowgroups = 0
                 for filename in file_paths:
-                    try:
-                        num_rowgroups += pq.ParquetFile(
-                            f"/lakehouse/default/{delta_table_path}/{filename}"
-                        ).num_row_groups
-                    except FileNotFoundError:
-                        continue
-                df.at[i, "Files"] = num_files
+                    parquet_file = pq.ParquetFile(f"{table_path}/{filename}")
+                    num_rowgroups += parquet_file.num_row_groups
+                df.at[i, "Files"] = num_latest_files
                 df.at[i, "Row Groups"] = num_rowgroups
                 df.at[i, "Table Size"] = size_in_bytes
             if count_rows:
-                num_rows = spark.table(tName).count()
-                df.at[i, "Row Count"] = num_rows
+                if _pure_python_notebook():
+                    row_count = delta_table.to_pyarrow_table().num_rows
+                else:
+                    row_count = table_df.count()
+                df.at[i, "Row Count"] = row_count

     if extended:
         intColumns = ["Files", "Row Groups", "Table Size"]
@@ -168,19 +198,16 @@ def get_lakehouse_tables(
     if export:
         if not lakehouse_attached():
             raise ValueError(
-                f"{icons.red_dot} In order to save the report.json file, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook."
+                f"{icons.red_dot} In order to save the dataframe, a lakehouse must be attached to the notebook. Please attach a lakehouse to this notebook."
             )

-        (current_lakehouse_name, current_lakehouse_id) = resolve_lakehouse_name_and_id()
-        lakeTName = "lakehouse_table_details"
-        lakeT_filt = df[df["Table Name"] == lakeTName]
+        lake_table_name = "lakehouse_table_details"
+        df_filt = df[df["Table Name"] == lake_table_name]

-        if len(lakeT_filt) == 0:
+        if df_filt.empty:
             run_id = 1
         else:
-            max_run_id = _get_column_aggregate(
-                lakehouse=current_lakehouse_name, table_name=lakeTName
-            )
+            max_run_id = _get_column_aggregate(table_name=lake_table_name)
             run_id = max_run_id + 1

         export_df = df.copy()
@@ -224,13 +251,13 @@ def get_lakehouse_tables(
             export_df[c] = export_df[c].astype(bool)

        print(
-            f"{icons.in_progress} Saving Lakehouse table properties to the '{lakeTName}' table in the lakehouse...\n"
+            f"{icons.in_progress} Saving Lakehouse table properties to the '{lake_table_name}' table in the lakehouse...\n"
        )
-        export_df["Timestamp"] = datetime.datetime.now()
+        export_df["Timestamp"] = datetime.now()
        export_df["RunId"] = run_id

        save_as_delta_table(
-            dataframe=export_df, delta_table_name=lakeTName, write_mode="append"
+            dataframe=export_df, delta_table_name=lake_table_name, write_mode="append"
        )

    return df
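
Not part of the diff: a usage sketch for the updated `get_lakehouse_tables`, assuming the `sempy_labs.lakehouse` import path. Per the diff, counting rows still requires running against the currently attached lakehouse and workspace, so the example below relies on the defaults.

```python
from sempy_labs.lakehouse import get_lakehouse_tables  # import path assumed

# Extended properties (file and row-group counts, table size) now work in both
# PySpark and pure Python notebooks; count_rows=True only works against the
# attached lakehouse/workspace, hence no explicit lakehouse/workspace arguments.
df = get_lakehouse_tables(extended=True, count_rows=True)
print(df[["Table Name", "Files", "Row Groups", "Table Size", "Row Count"]])
```
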