duckrun 0.2.18.dev3.tar.gz → 0.2.18.dev5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of duckrun might be problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.18.dev3
+Version: 0.2.18.dev5
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
@@ -1179,7 +1179,7 @@ class Duckrun(WorkspaceOperationsMixin):
         return False
 
     def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
-               wait_seconds: int = 5) -> int:
+               wait_seconds: int = 5, refresh: str = "full") -> int:
         """
         Deploy a semantic model from a BIM file using DirectLake mode.
 
@@ -1190,6 +1190,9 @@ class Duckrun(WorkspaceOperationsMixin):
                 - Workspace/Model: "workspace_name/model_name"
             dataset_name: Name for the semantic model (default: schema name)
             wait_seconds: Seconds to wait for permission propagation (default: 5)
+            refresh: Refresh strategy:
+                - "full": Clear values and process full refresh (default)
+                - "ignore": Skip refresh entirely
 
         Returns:
             1 for success, 0 for failure
@@ -1205,6 +1208,9 @@ class Duckrun(WorkspaceOperationsMixin):
 
             # Deploy with custom name
             dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")
+
+            # Deploy without refresh
+            dr.deploy("https://github.com/.../model.bim", refresh="ignore")
         """
         from .semantic_model import deploy_semantic_model
 
@@ -1227,7 +1233,8 @@ class Duckrun(WorkspaceOperationsMixin):
             schema_name=self.schema,
             dataset_name=dataset_name,
             bim_url_or_path=bim_url,
-            wait_seconds=wait_seconds
+            wait_seconds=wait_seconds,
+            refresh=refresh
         )
 
     def close(self):
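The refresh argument added above is the user-facing switch for this change. A minimal usage sketch, assuming dr is an already-connected Duckrun instance (the BIM URL is taken from the docstring example and left elided):

    # Default behaviour: clear values, then full reframe after deployment
    dr.deploy("https://github.com/.../model.bim")

    # Deploy or redeploy the model but skip the post-deployment refresh
    dr.deploy("https://github.com/.../model.bim", refresh="ignore")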
@@ -129,16 +129,21 @@ def check_dataset_exists(dataset_name, workspace_id, client):
     return False
 
 
-def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
+def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None, refresh="full"):
     """Refresh a dataset and monitor progress using Power BI API
 
-    For DirectLake models, performs a two-step refresh:
-    1. clearValues - Purges data from memory
-    2. full - Reframes data from Delta tables
+    For DirectLake models, performs refresh based on refresh parameter:
+    - refresh="full": Two-step refresh (clearValues + full reframe)
+    - refresh="ignore": Skip refresh entirely
 
     If a refresh is already in progress, waits for it to complete before starting a new one.
     """
 
+    # Skip refresh entirely if refresh is "ignore"
+    if refresh == "ignore":
+        print(" Ignoring refresh - skipping refresh")
+        return
+
     # If dataset_id not provided, look it up by name
     if not dataset_id:
         dataset_id = get_dataset_id(dataset_name, workspace_id, client)
@@ -539,7 +544,7 @@ def create_dataset_from_bim(dataset_name, bim_content, workspace_id, client):
 
 
 def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_name, dataset_name,
-                          bim_url_or_path, wait_seconds=5):
+                          bim_url_or_path, wait_seconds=5, refresh="full"):
     """
     Deploy a semantic model using DirectLake mode.
 
@@ -550,6 +555,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         dataset_name: Name for the semantic model
         bim_url_or_path: URL to the BIM file or local file path (e.g., 'model.bim' or 'https://...')
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely
 
     Returns:
         1 for success, 0 for failure
@@ -562,6 +570,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         # Using a local file
         dr.deploy("./my_model.bim")
         dr.deploy("C:/path/to/model.bim")
+
+        # Deploy without refresh
+        dr.deploy("./my_model.bim", refresh="ignore")
     """
     print("=" * 70)
     print("Semantic Model Deployment (DirectLake)")
@@ -586,7 +597,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         time.sleep(wait_seconds)
 
         print("\n[Step 3/3] Refreshing existing semantic model...")
-        refresh_dataset(dataset_name, workspace_id, client)
+        refresh_dataset(dataset_name, workspace_id, client, refresh=refresh)
 
         print("\n" + "=" * 70)
         print("🎉 Refresh Completed!")
@@ -618,7 +629,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
 
     # Step 6: Refresh using the dataset ID returned from creation
     print("\n[Step 6/6] Refreshing semantic model...")
-    refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id)
+    refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id, refresh=refresh)
 
     print("\n" + "=" * 70)
     print("🎉 Deployment Completed!")
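For completeness, the module-level entry point now takes the same keyword. A hedged sketch of calling deploy_semantic_model directly with the signature shown in this diff; the workspace, lakehouse, and model names are placeholders:

    from duckrun.semantic_model import deploy_semantic_model

    deploy_semantic_model("My Workspace", "My Lakehouse", "dbo", "Sales Model",
                          "./model.bim", wait_seconds=5, refresh="ignore")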
@@ -645,7 +656,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         return 0
 
 
-def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5):
+def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5, refresh="full"):
     """
     Copy a semantic model from one workspace to another.
 
@@ -658,6 +669,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         destination: Destination in format "workspace/lakehouse.lakehouse/schema"
         new_model_name: Name for the new semantic model (default: same as source)
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely
 
     Returns:
         1 for success, 0 for failure
@@ -670,6 +684,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         copy_model("Source WS", "Production Model", "Target WS/Data Lake.lakehouse/analytics",
                    new_model_name="Production Model - Copy")
 
+        # Copy without refresh
+        copy_model("Source WS", "Model", "Target WS/LH.lakehouse/dbo", refresh="ignore")
+
         # Using the connect pattern
         import duckrun
         duckrun.semantic_model.copy_model("Source", "Model", "Target/LH.lakehouse/dbo")
@@ -796,7 +813,8 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         schema_name=schema,
         dataset_name=new_model_name,
         bim_url_or_path=temp_bim_path,
-        wait_seconds=wait_seconds
+        wait_seconds=wait_seconds,
+        refresh=refresh
     )
 
     # Clean up temp file
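copy_model() threads the same flag through to deploy_semantic_model(), as the hunk above shows. A sketch of copying a model without refreshing the target, mirroring the docstring examples (workspace and lakehouse names are placeholders):

    import duckrun

    duckrun.semantic_model.copy_model("Source WS", "Production Model",
                                      "Target WS/Data Lake.lakehouse/analytics",
                                      new_model_name="Production Model - Copy",
                                      refresh="ignore")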
@@ -60,6 +60,49 @@ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
         return []
 
 
+def _match_tables_by_pattern(duckrun_instance, pattern: str) -> dict:
+    """Match tables across all schemas using a wildcard pattern.
+    Pattern can be:
+    - '*.summary' - matches 'summary' table in all schemas
+    - '*summary' - matches any table ending with 'summary'
+    - 'schema.*' - matches all tables in 'schema'
+    Returns a dict mapping schema names to lists of matching table names."""
+    import fnmatch
+
+    try:
+        # Query all schemas and tables in one go
+        query = """
+            SELECT table_schema, table_name
+            FROM information_schema.tables
+            WHERE table_schema NOT LIKE 'pg_%'
+              AND table_schema != 'information_schema'
+              AND table_name NOT LIKE 'tbl_%'
+        """
+        result = duckrun_instance.con.execute(query).fetchall()
+
+        matched = {}
+
+        # Check if pattern contains a dot (schema.table pattern)
+        if '.' in pattern:
+            schema_pattern, table_pattern = pattern.split('.', 1)
+            for schema, table in result:
+                if fnmatch.fnmatch(schema, schema_pattern) and fnmatch.fnmatch(table, table_pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+        else:
+            # Pattern matches only table names
+            for schema, table in result:
+                if fnmatch.fnmatch(table, pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+
+        return matched
+    except:
+        return {}
+
+
 def get_stats(duckrun_instance, source: str = None):
     """
     Get comprehensive statistics for Delta Lake tables.
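The matching logic in _match_tables_by_pattern is plain fnmatch applied to (schema, table) pairs read from information_schema.tables. The following self-contained sketch reproduces that logic against a hard-coded catalog so the documented pattern styles can be tried without a lakehouse connection; the schema and table names are invented for illustration:

    import fnmatch

    # Invented sample catalog, shaped like rows from information_schema.tables
    catalog = [("aemo", "summary"), ("aemo", "prices"),
               ("dbo", "summary"), ("dbo", "calendar")]

    def match(pattern):
        matched = {}
        if "." in pattern:
            # 'schema.table' style: split once, match both parts
            schema_pat, table_pat = pattern.split(".", 1)
            pairs = [(s, t) for s, t in catalog
                     if fnmatch.fnmatch(s, schema_pat) and fnmatch.fnmatch(t, table_pat)]
        else:
            # Table-name-only pattern
            pairs = [(s, t) for s, t in catalog if fnmatch.fnmatch(t, pattern)]
        for s, t in pairs:
            matched.setdefault(s, []).append(t)
        return matched

    print(match("*.summary"))   # {'aemo': ['summary'], 'dbo': ['summary']}
    print(match("aemo.*"))      # {'aemo': ['summary', 'prices']}
    print(match("*summary"))    # {'aemo': ['summary'], 'dbo': ['summary']}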
@@ -71,6 +114,7 @@ def get_stats(duckrun_instance, source: str = None):
             - Table name: 'table_name' (uses main schema in DuckDB)
             - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
             - Schema only: 'schema' (all tables in schema, if multi-schema)
+            - Wildcard pattern: '*.summary' (matches tables across all schemas)
 
     Returns:
         Arrow table with statistics including total rows, file count, row groups,
@@ -90,6 +134,9 @@ def get_stats(duckrun_instance, source: str = None):
 
         # All tables in a schema (only if multi-schema enabled)
         stats = con.get_stats('aemo')
+
+        # Wildcard pattern across all schemas (only if multi-schema enabled)
+        stats = con.get_stats('*.summary')
     """
     timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 
@@ -101,8 +148,27 @@ def get_stats(duckrun_instance, source: str = None):
     if source is None:
         source = url_schema
 
+    # Check if source contains wildcard characters
+    if '*' in source or '?' in source:
+        # Wildcard pattern mode - only valid if multi-schema is enabled
+        if not duckrun_instance.scan_all_schemas:
+            raise ValueError(f"Wildcard pattern '{source}' not supported. Connection was made to a specific schema '{url_schema}'. Enable multi-schema mode to use wildcards.")
+
+        matched_tables = _match_tables_by_pattern(duckrun_instance, source)
+
+        if not matched_tables:
+            raise ValueError(f"No tables found matching pattern '{source}'")
+
+        # Flatten the matched tables into a list with schema info
+        tables_with_schemas = []
+        for schema, tables in matched_tables.items():
+            for table in tables:
+                tables_with_schemas.append((schema, table))
+
+        print(f"Found {len(tables_with_schemas)} tables matching pattern '{source}'")
+
     # Parse the source and validate existence
-    if '.' in source:
+    elif '.' in source:
         # Format: schema.table - only valid if multi-schema is enabled
         schema_name, table_name = source.split('.', 1)
 
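With that branch in place, get_stats accepts fnmatch-style wildcards whenever the connection scans all schemas; a connection bound to a single schema raises ValueError instead. A brief usage sketch, assuming con is a multi-schema duckrun connection:

    # One stats row per matching schema/table pair
    stats = con.get_stats('*.summary')

    # All tables in every schema whose name starts with 'aemo'
    stats = con.get_stats('aemo*.*')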
@@ -113,46 +179,45 @@ def get_stats(duckrun_instance, source: str = None):
         if not _table_exists(duckrun_instance, schema_name, table_name):
             raise ValueError(f"Table '{table_name}' does not exist in schema '{schema_name}'")
 
-        list_tables = [table_name]
+        tables_with_schemas = [(schema_name, table_name)]
     else:
         # Could be just table name or schema name
         if duckrun_instance.scan_all_schemas:
             # Multi-schema mode: DuckDB has actual schemas
             # First check if it's a table in main schema
             if _table_exists(duckrun_instance, duckdb_schema, source):
-                list_tables = [source]
-                schema_name = duckdb_schema
+                tables_with_schemas = [(duckdb_schema, source)]
             # Otherwise, check if it's a schema name
             elif _schema_exists(duckrun_instance, source):
                 schema_name = source
                 list_tables = _get_existing_tables_in_schema(duckrun_instance, source)
                 if not list_tables:
                     raise ValueError(f"Schema '{source}' exists but contains no tables")
+                tables_with_schemas = [(schema_name, tbl) for tbl in list_tables]
             else:
                 raise ValueError(f"Neither table '{source}' in main schema nor schema '{source}' exists")
         else:
             # Single-schema mode: tables are in DuckDB's main schema, use URL schema for file paths
             if _table_exists(duckrun_instance, duckdb_schema, source):
                 # It's a table name
-                list_tables = [source]
-                schema_name = url_schema  # Use URL schema for file path construction
+                tables_with_schemas = [(url_schema, source)]
             elif source == url_schema:
                 # Special case: user asked for stats on the URL schema name - list all tables
                 list_tables = _get_existing_tables_in_schema(duckrun_instance, duckdb_schema)
-                schema_name = url_schema  # Use URL schema for file path construction
                 if not list_tables:
                     raise ValueError(f"No tables found in schema '{url_schema}'")
+                tables_with_schemas = [(url_schema, tbl) for tbl in list_tables]
             else:
                 raise ValueError(f"Table '{source}' does not exist in the current context (schema: {url_schema})")
 
     # Use the existing connection
     con = duckrun_instance.con
 
-    print(f"Processing {len(list_tables)} tables: {list_tables}")
+    print(f"Processing {len(tables_with_schemas)} tables from {len(set(s for s, t in tables_with_schemas))} schema(s)")
 
     successful_tables = []
-    for idx, tbl in enumerate(list_tables):
-        print(f"[{idx+1}/{len(list_tables)}] Processing table '{tbl}'...")
+    for idx, (schema_name, tbl) in enumerate(tables_with_schemas):
+        print(f"[{idx+1}/{len(tables_with_schemas)}] Processing table '{schema_name}.{tbl}'...")
         # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
         table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"
 
@@ -179,8 +244,18 @@ def get_stats(duckrun_instance, source: str = None):
             print(f"Warning: Could not convert RecordBatch for table '{tbl}': Unexpected type {type(add_actions)}")
             xx = {}
 
-        # Check if VORDER exists
-        vorder = 'tags.VORDER' in xx.keys()
+        # Check if VORDER exists - handle both formats:
+        # 1. Flattened format: 'tags.VORDER' or 'tags.vorder' in keys
+        # 2. Nested format: check in 'tags' dict for 'VORDER' or 'vorder'
+        vorder = False
+        if 'tags.VORDER' in xx.keys() or 'tags.vorder' in xx.keys():
+            vorder = True
+        elif 'tags' in xx.keys() and xx['tags']:
+            # Check nested tags dictionary (tags is a list of dicts, one per file)
+            for tag_dict in xx['tags']:
+                if tag_dict and ('VORDER' in tag_dict or 'vorder' in tag_dict):
+                    vorder = True
+                    break
 
         # Calculate total size
         total_size = sum(xx['size_bytes']) if xx['size_bytes'] else 0
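The widened check covers both shapes the add_actions metadata can take once converted to a dict: flattened 'tags.VORDER'/'tags.vorder' keys, or a 'tags' column holding one dict per file. A self-contained sketch of the same detection, run against hand-built dicts (illustrative only, not real delta-rs output):

    def has_vorder(xx: dict) -> bool:
        # Flattened format: the tag shows up as a top-level key
        if 'tags.VORDER' in xx or 'tags.vorder' in xx:
            return True
        # Nested format: 'tags' is a list of per-file dicts
        for tag_dict in (xx.get('tags') or []):
            if tag_dict and ('VORDER' in tag_dict or 'vorder' in tag_dict):
                return True
        return False

    print(has_vorder({'tags.VORDER': [True], 'size_bytes': [100]}))          # True
    print(has_vorder({'tags': [{'vorder': 'true'}], 'size_bytes': [100]}))   # True
    print(has_vorder({'size_bytes': [100]}))                                 # False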
@@ -195,6 +270,7 @@ def get_stats(duckrun_instance, source: str = None):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     'empty' as file_name,
                     0 as num_rows,
@@ -210,6 +286,7 @@ def get_stats(duckrun_instance, source: str = None):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     fm.file_name,
                     fm.num_rows,
@@ -245,6 +322,7 @@ def get_stats(duckrun_instance, source: str = None):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     'empty' as file_name,
                     0 as num_rows,
@@ -272,6 +350,7 @@ def get_stats(duckrun_instance, source: str = None):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     fm.file_name,
                     fm.num_rows,
@@ -299,7 +378,7 @@ def get_stats(duckrun_instance, source: str = None):
         # No tables were processed successfully - return empty dataframe
         print("⚠️ No tables could be processed successfully")
         import pandas as pd
-        return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
+        return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
                                      'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
 
     # Union all successfully processed temp tables
@@ -309,6 +388,7 @@ def get_stats(duckrun_instance, source: str = None):
     # Generate final summary
     final_result = con.execute(f'''
         SELECT
+            schema,
             tbl,
             SUM(num_rows) as total_rows,
             COUNT(*) as num_files,
@@ -320,7 +400,7 @@ def get_stats(duckrun_instance, source: str = None):
             ANY_VALUE(timestamp) as timestamp
         FROM ({union_query})
         WHERE tbl IS NOT NULL
-        GROUP BY tbl
+        GROUP BY schema, tbl
         ORDER BY total_rows DESC
     ''').df()
 
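Because the summary now groups by (schema, tbl), the result carries one row per table per schema, with the schema column first. A short sketch of slicing that result, assuming the pandas frame produced by .df() above is what the caller receives:

    stats = con.get_stats('*.summary')

    # Largest tables first, across every schema
    print(stats[['schema', 'tbl', 'total_rows', 'num_files', 'file_size_MB']])

    # Narrow to a single schema
    print(stats[stats['schema'] == 'aemo'])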
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.2.18.dev3
+Version: 0.2.18.dev5
 Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
 Author: mim
 License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "duckrun"
-version = "0.2.18.dev3"
+version = "0.2.18.dev5"
 description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
 readme = "README.md"
 license = {text = "MIT"}
File without changes