duckrun 0.2.18.dev1__tar.gz → 0.2.18.dev3__tar.gz

This diff shows the changes between two publicly released versions of the package, exactly as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of duckrun might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.18.dev1
+Version: 0.2.18.dev3
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
@@ -3,7 +3,7 @@
 from duckrun.core import Duckrun
 from duckrun.notebook import import_notebook_from_web, import_notebook
 
-__version__ = "0.2.18.dev1"
+__version__ = "0.2.18.dev2"
 
 # Expose unified connect method at module level
 connect = Duckrun.connect
@@ -1035,12 +1035,13 @@ class Duckrun(WorkspaceOperationsMixin):
         """Get underlying DuckDB connection"""
         return self.con
 
-    def get_stats(self, source: str):
+    def get_stats(self, source: str = None):
         """
         Get comprehensive statistics for Delta Lake tables.
 
         Args:
-            source: Can be one of:
+            source: Optional. Can be one of:
+                - None: Use all tables in the connection's schema (default)
                 - Table name: 'table_name' (uses current schema)
                 - Schema.table: 'schema.table_name' (specific table in schema)
                 - Schema only: 'schema' (all tables in schema)
@@ -1052,6 +1053,9 @@ class Duckrun(WorkspaceOperationsMixin):
         Examples:
             con = duckrun.connect("tmp/data.lakehouse/aemo")
 
+            # All tables in current schema (aemo)
+            stats = con.get_stats()
+
             # Single table in current schema
             stats = con.get_stats('price')
 
@@ -1184,7 +1188,7 @@ class Duckrun(WorkspaceOperationsMixin):
                 - URL: "https://raw.githubusercontent.com/.../model.bim"
                 - Local file: "model.bim"
                 - Workspace/Model: "workspace_name/model_name"
-            dataset_name: Name for the semantic model (default: source model name if workspace/model format, else lakehouse_schema)
+            dataset_name: Name for the semantic model (default: schema name)
             wait_seconds: Seconds to wait for permission propagation (default: 5)
 
         Returns:
@@ -1193,14 +1197,14 @@ class Duckrun(WorkspaceOperationsMixin):
         Examples:
             dr = Duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
 
+            # Deploy with schema name as dataset name (dbo)
+            dr.deploy("https://github.com/.../model.bim")
+
             # Deploy from workspace/model (uses same name by default)
             dr.deploy("Source Workspace/Source Model") # Creates "Source Model"
 
             # Deploy with custom name
-            dr.deploy("Source Workspace/Source Model", dataset_name="Sales Model Copy")
-
-            # Deploy from URL or local file
-            dr.deploy("https://raw.githubusercontent.com/.../model.bim", dataset_name="My Model")
+            dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")
         """
         from .semantic_model import deploy_semantic_model
 
@@ -1212,9 +1216,9 @@ class Duckrun(WorkspaceOperationsMixin):
                 if len(parts) == 2:
                     dataset_name = parts[1] # Use the model name
                 else:
-                    dataset_name = f"{self.lakehouse_name}_{self.schema}"
+                    dataset_name = self.schema # Use schema name
             else:
-                dataset_name = f"{self.lakehouse_name}_{self.schema}"
+                dataset_name = self.schema # Use schema name
 
         # Call the deployment function (DirectLake only)
         return deploy_semantic_model(
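Taken together with the docstring change above, this hunk replaces the old default dataset name (f"{lakehouse_name}_{schema}") with the bare schema name. Below is a minimal sketch of the resulting fallback order, written as a standalone helper purely for illustration; resolve_dataset_name is hypothetical, and the URL check is an assumption not visible in the hunk.

    def resolve_dataset_name(source, schema, dataset_name=None):
        """Illustrative only: mirrors the fallback order shown in the diff above."""
        if dataset_name:
            return dataset_name              # an explicit name always wins
        parts = source.split("/")
        if not source.startswith("http") and len(parts) == 2:
            return parts[1]                  # "workspace/model" source: reuse the model name
        return schema                        # otherwise: the connection's schema (was f"{lakehouse}_{schema}")

With a connection such as "My Workspace/My Lakehouse.lakehouse/dbo", deploying a .bim URL without dataset_name now produces a model named "dbo" rather than "My Lakehouse_dbo".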
@@ -130,13 +130,66 @@ def check_dataset_exists(dataset_name, workspace_id, client):
 
 
 def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
-    """Refresh a dataset and monitor progress using Power BI API"""
+    """Refresh a dataset and monitor progress using Power BI API
+
+    For DirectLake models, performs a two-step refresh:
+    1. clearValues - Purges data from memory
+    2. full - Reframes data from Delta tables
+
+    If a refresh is already in progress, waits for it to complete before starting a new one.
+    """
 
     # If dataset_id not provided, look it up by name
     if not dataset_id:
         dataset_id = get_dataset_id(dataset_name, workspace_id, client)
 
-    payload = {
+    # Use Power BI API for refresh (not Fabric API)
+    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
+    headers = client._get_headers()
+
+    # Check for in-progress refreshes
+    print(" Checking for in-progress refreshes...")
+    try:
+        status_response = requests.get(f"{powerbi_url}?$top=1", headers=headers)
+        if status_response.status_code == 200:
+            refreshes = status_response.json().get('value', [])
+            if refreshes:
+                latest_refresh = refreshes[0]
+                status = latest_refresh.get('status')
+                if status in ['InProgress', 'Unknown']:
+                    refresh_id = latest_refresh.get('requestId')
+                    print(f" ⚠️ Found in-progress refresh (ID: {refresh_id})")
+                    print(f" Waiting for current refresh to complete...")
+
+                    # Wait for the in-progress refresh to complete
+                    max_wait_attempts = 60
+                    for attempt in range(max_wait_attempts):
+                        time.sleep(5)
+                        check_response = requests.get(f"{powerbi_url}/{refresh_id}", headers=headers)
+                        if check_response.status_code == 200:
+                            current_status = check_response.json().get('status')
+
+                            if current_status == 'Completed':
+                                print(f" ✓ Previous refresh completed")
+                                break
+                            elif current_status == 'Failed':
+                                print(f" ⚠️ Previous refresh failed, continuing with new refresh")
+                                break
+                            elif current_status == 'Cancelled':
+                                print(f" ⚠️ Previous refresh was cancelled, continuing with new refresh")
+                                break
+
+                            if attempt % 6 == 0:
+                                print(f" Still waiting... (status: {current_status})")
+                    else:
+                        print(f" ⚠️ Timeout waiting for previous refresh, will attempt new refresh anyway")
+    except Exception as e:
+        print(f" ⚠️ Could not check refresh status: {e}")
+        print(f" Continuing with refresh attempt...")
+
+    # Step 1: clearValues - Purge data from memory
+    print(" Step 1: Clearing values from memory...")
+    clearvalues_payload = {
         "type": "clearValues",
         "commitMode": "transactional",
         "maxParallelism": 10,
@@ -144,14 +197,63 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
         "objects": []
     }
 
-    # Use Power BI API for refresh (not Fabric API)
-    powerbi_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
-    headers = client._get_headers()
+    response = requests.post(powerbi_url, headers=headers, json=clearvalues_payload)
+
+    if response.status_code in [200, 202]:
+        # For 202, monitor the clearValues operation
+        if response.status_code == 202:
+            location = response.headers.get('Location')
+            if location:
+                clear_refresh_id = location.split('/')[-1]
+                print(" ✓ Clear values initiated, monitoring progress...")
+
+                max_attempts = 60
+                for attempt in range(max_attempts):
+                    time.sleep(2)
+
+                    status_url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes/{clear_refresh_id}"
+                    status_response = requests.get(status_url, headers=headers)
+                    status_response.raise_for_status()
+                    status = status_response.json().get('status')
+
+                    if status == 'Completed':
+                        print(f" ✓ Clear values completed")
+                        break
+                    elif status == 'Failed':
+                        error = status_response.json().get('serviceExceptionJson', '')
+                        raise Exception(f"Clear values failed: {error}")
+                    elif status == 'Cancelled':
+                        raise Exception("Clear values was cancelled")
+
+                    if attempt % 10 == 0 and attempt > 0:
+                        print(f" Clear values status: {status}...")
+                else:
+                    raise Exception(f"Clear values timed out")
+        else:
+            print(" ✓ Clear values completed")
+    else:
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Clear values failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
+
+    # Step 2: full refresh - Reframe data from Delta tables
+    print(" Step 2: Full refresh to reframe data...")
+    full_payload = {
+        "type": "full",
+        "commitMode": "transactional",
+        "maxParallelism": 10,
+        "retryCount": 2,
+        "objects": []
+    }
 
-    response = requests.post(powerbi_url, headers=headers, json=payload)
+    response = requests.post(powerbi_url, headers=headers, json=full_payload)
 
     if response.status_code in [200, 202]:
-        print(f"✓ Refresh initiated")
+        print(f" ✓ Refresh initiated")
 
         # For 202, get the refresh_id from the Location header
         if response.status_code == 202:
@@ -183,7 +285,13 @@ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
 
             raise Exception(f"Refresh timed out")
     else:
-        response.raise_for_status()
+        # Provide detailed error message
+        try:
+            error_details = response.json()
+            error_message = error_details.get('error', {}).get('message', response.text)
+            raise Exception(f"Refresh request failed with status {response.status_code}: {error_message}")
+        except (json.JSONDecodeError, ValueError):
+            response.raise_for_status()
 
 
 def download_bim_from_github(url_or_path):
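The three refresh_dataset hunks above add an in-progress check and turn the single refresh call into a two-step sequence against the Power BI REST API: a clearValues request that purges the model's data from memory, followed by a full refresh that reframes it from the Delta tables. Below is a condensed, illustrative sketch of that sequence with the detailed logging and error reporting trimmed; two_step_directlake_refresh is hypothetical, and the headers argument is assumed to carry a valid bearer token (duckrun builds it through its own client).

    import time
    import requests

    def two_step_directlake_refresh(dataset_id, headers):
        """Condensed illustration of the clearValues + full sequence shown in the diff above."""
        url = f"https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/refreshes"
        for refresh_type in ("clearValues", "full"):
            payload = {
                "type": refresh_type,
                "commitMode": "transactional",
                "maxParallelism": 10,
                "retryCount": 2,
                "objects": [],
            }
            resp = requests.post(url, headers=headers, json=payload)
            resp.raise_for_status()
            if resp.status_code == 202:
                # Asynchronous refresh: poll the operation until it reaches a terminal state
                refresh_id = resp.headers["Location"].rstrip("/").split("/")[-1]
                status = "Unknown"
                for _ in range(60):
                    status = requests.get(f"{url}/{refresh_id}", headers=headers).json().get("status")
                    if status in ("Completed", "Failed", "Cancelled"):
                        break
                    time.sleep(5)
                if status != "Completed":
                    raise RuntimeError(f"{refresh_type} refresh ended with status {status}")

Running clearValues before full matches the docstring added above: the first request purges the in-memory data, the second reframes the model against the current Delta table versions.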
@@ -471,13 +579,13 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
     dataset_exists = check_dataset_exists(dataset_name, workspace_id, client)
 
     if dataset_exists:
-        print(f"\n✓ Dataset exists - refreshing...")
+        print(f"✓ Dataset '{dataset_name}' already exists - skipping deployment")
 
         if wait_seconds > 0:
             print(f" Waiting {wait_seconds} seconds...")
             time.sleep(wait_seconds)
 
-        print("\n[Step 6/6] Refreshing semantic model...")
+        print("\n[Step 3/3] Refreshing existing semantic model...")
         refresh_dataset(dataset_name, workspace_id, client)
 
         print("\n" + "=" * 70)
@@ -60,13 +60,14 @@ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
         return []
 
 
-def get_stats(duckrun_instance, source: str):
+def get_stats(duckrun_instance, source: str = None):
     """
     Get comprehensive statistics for Delta Lake tables.
 
     Args:
         duckrun_instance: The Duckrun connection instance
-        source: Can be one of:
+        source: Optional. Can be one of:
+            - None: Use all tables in the connection's schema (default)
             - Table name: 'table_name' (uses main schema in DuckDB)
            - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
            - Schema only: 'schema' (all tables in schema, if multi-schema)
@@ -78,6 +79,9 @@ def get_stats(duckrun_instance, source: str):
     Examples:
         con = duckrun.connect("tmp/data.lakehouse/test")
 
+        # All tables in the connection's schema
+        stats = con.get_stats()
+
         # Single table in main schema (DuckDB uses 'main', not 'test')
         stats = con.get_stats('price_today')
 
@@ -93,6 +97,10 @@ def get_stats(duckrun_instance, source: str):
     duckdb_schema = "main"
     url_schema = duckrun_instance.schema # This is from the connection URL path
 
+    # If source is not provided, default to all tables in the connection's schema
+    if source is None:
+        source = url_schema
+
     # Parse the source and validate existence
     if '.' in source:
         # Format: schema.table - only valid if multi-schema is enabled
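Both the Duckrun.get_stats method and the module-level get_stats helper above make source optional and default it to the schema the connection was opened with. A short usage sketch of the new default, based on the examples in the updated docstrings (the lakehouse path and table name are illustrative):

    import duckrun

    con = duckrun.connect("tmp/data.lakehouse/aemo")   # illustrative lakehouse/schema path
    all_stats = con.get_stats()          # new: statistics for every table in the 'aemo' schema
    one_table = con.get_stats("price")   # unchanged: statistics for a single table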
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.18.dev1
+Version: 0.2.18.dev3
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "duckrun"
-version = "0.2.18.dev1"
+version = "0.2.18.dev3"
 description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
 readme = "README.md"
 license = {text = "MIT"}