duckrun 0.2.18.dev3__py3-none-any.whl → 0.2.19.dev1__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.

duckrun/__init__.py CHANGED
@@ -3,7 +3,7 @@
  from duckrun.core import Duckrun
  from duckrun.notebook import import_notebook_from_web, import_notebook

- __version__ = "0.2.18.dev2"
+ __version__ = "0.2.18"

  # Expose unified connect method at module level
  connect = Duckrun.connect
duckrun/core.py CHANGED
@@ -1035,7 +1035,7 @@ class Duckrun(WorkspaceOperationsMixin):
  """Get underlying DuckDB connection"""
  return self.con

- def get_stats(self, source: str = None):
+ def get_stats(self, source: str = None, detailed = False):
  """
  Get comprehensive statistics for Delta Lake tables.

@@ -1045,27 +1045,34 @@ class Duckrun(WorkspaceOperationsMixin):
  - Table name: 'table_name' (uses current schema)
  - Schema.table: 'schema.table_name' (specific table in schema)
  - Schema only: 'schema' (all tables in schema)
+ detailed: Optional. Controls the level of detail in statistics:
+ - False (default): Aggregated table-level stats
+ - True: Row group level statistics with compression details

  Returns:
- Arrow table with statistics including total rows, file count, row groups,
- average row group size, file sizes, VORDER status, and timestamp
+ DataFrame with statistics based on detailed parameter:
+ - If detailed=False: Aggregated table-level summary
+ - If detailed=True: Granular file and row group level stats

  Examples:
  con = duckrun.connect("tmp/data.lakehouse/aemo")

- # All tables in current schema (aemo)
+ # All tables in current schema (aemo) - aggregated
  stats = con.get_stats()

- # Single table in current schema
+ # Single table in current schema - aggregated
  stats = con.get_stats('price')

+ # Single table with detailed row group statistics
+ stats_detailed = con.get_stats('price', detailed=True)
+
  # Specific table in different schema
  stats = con.get_stats('aemo.price')

  # All tables in a schema
  stats = con.get_stats('aemo')
  """
- return _get_stats(self, source)
+ return _get_stats(self, source, detailed)

  def list_lakehouses(self) -> List[str]:
  """
@@ -1179,7 +1186,7 @@ class Duckrun(WorkspaceOperationsMixin):
  return False

  def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
- wait_seconds: int = 5) -> int:
+ wait_seconds: int = 5, refresh: str = "full") -> int:
  """
  Deploy a semantic model from a BIM file using DirectLake mode.

@@ -1190,6 +1197,9 @@ class Duckrun(WorkspaceOperationsMixin):
  - Workspace/Model: "workspace_name/model_name"
  dataset_name: Name for the semantic model (default: schema name)
  wait_seconds: Seconds to wait for permission propagation (default: 5)
+ refresh: Refresh strategy:
+ - "full": Clear values and process full refresh (default)
+ - "ignore": Skip refresh entirely

  Returns:
  1 for success, 0 for failure
@@ -1205,6 +1215,9 @@ class Duckrun(WorkspaceOperationsMixin):

  # Deploy with custom name
  dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")
+
+ # Deploy without refresh
+ dr.deploy("https://github.com/.../model.bim", refresh="ignore")
  """
  from .semantic_model import deploy_semantic_model

@@ -1227,7 +1240,8 @@ class Duckrun(WorkspaceOperationsMixin):
  schema_name=self.schema,
  dataset_name=dataset_name,
  bim_url_or_path=bim_url,
- wait_seconds=wait_seconds
+ wait_seconds=wait_seconds,
+ refresh=refresh
  )

  def close(self):
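
For context, a minimal usage sketch of the two new keyword arguments surfaced on the Duckrun class in this release (get_stats(..., detailed=...) and deploy(..., refresh=...)), based only on the docstrings in the diff above; the connection path and BIM URL are placeholders taken from those docstrings.

    import duckrun

    # Placeholder lakehouse path, as in the docstring examples above
    con = duckrun.connect("tmp/data.lakehouse/aemo")

    # Aggregated table-level statistics (detailed defaults to False, matching the old behaviour)
    summary = con.get_stats("price")

    # New in this release: row-group level statistics with compression details
    row_groups = con.get_stats("price", detailed=True)

    # New in this release: deploy a semantic model and skip the post-deploy refresh
    con.deploy("https://github.com/.../model.bim", refresh="ignore")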
duckrun/notebook.py CHANGED
@@ -160,6 +160,7 @@ def import_notebook_from_web(
  update_url = f"{base_url}/workspaces/{workspace_id}/notebooks/{notebook_id}/updateDefinition"
  payload = {
  "definition": {
+ "format": "ipynb",
  "parts": [
  {
  "path": "notebook-content.py",
@@ -192,6 +193,7 @@ def import_notebook_from_web(
  payload = {
  "displayName": notebook_name,
  "definition": {
+ "format": "ipynb",
  "parts": [
  {
  "path": "notebook-content.py",
duckrun/semantic_model.py CHANGED
@@ -129,16 +129,21 @@ def check_dataset_exists(dataset_name, workspace_id, client):
  return False


- def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
+ def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None, refresh="full"):
  """Refresh a dataset and monitor progress using Power BI API

- For DirectLake models, performs a two-step refresh:
- 1. clearValues - Purges data from memory
- 2. full - Reframes data from Delta tables
+ For DirectLake models, performs refresh based on refresh parameter:
+ - refresh="full": Two-step refresh (clearValues + full reframe)
+ - refresh="ignore": Skip refresh entirely

  If a refresh is already in progress, waits for it to complete before starting a new one.
  """

+ # Skip refresh entirely if refresh is "ignore"
+ if refresh == "ignore":
+ print(" Ignoring refresh - skipping refresh")
+ return
+
  # If dataset_id not provided, look it up by name
  if not dataset_id:
  dataset_id = get_dataset_id(dataset_name, workspace_id, client)
@@ -539,7 +544,7 @@ def create_dataset_from_bim(dataset_name, bim_content, workspace_id, client):


  def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_name, dataset_name,
- bim_url_or_path, wait_seconds=5):
+ bim_url_or_path, wait_seconds=5, refresh="full"):
  """
  Deploy a semantic model using DirectLake mode.

@@ -550,6 +555,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
  dataset_name: Name for the semantic model
  bim_url_or_path: URL to the BIM file or local file path (e.g., 'model.bim' or 'https://...')
  wait_seconds: Seconds to wait before refresh (default: 5)
+ refresh: Refresh strategy (default: "full")
+ - "full": Clear values and process full refresh
+ - "ignore": Skip refresh entirely

  Returns:
  1 for success, 0 for failure
@@ -562,6 +570,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
  # Using a local file
  dr.deploy("./my_model.bim")
  dr.deploy("C:/path/to/model.bim")
+
+ # Deploy without refresh
+ dr.deploy("./my_model.bim", refresh="ignore")
  """
  print("=" * 70)
  print("Semantic Model Deployment (DirectLake)")
@@ -586,7 +597,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
  time.sleep(wait_seconds)

  print("\n[Step 3/3] Refreshing existing semantic model...")
- refresh_dataset(dataset_name, workspace_id, client)
+ refresh_dataset(dataset_name, workspace_id, client, refresh=refresh)

  print("\n" + "=" * 70)
  print("🎉 Refresh Completed!")
@@ -618,7 +629,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam

  # Step 6: Refresh using the dataset ID returned from creation
  print("\n[Step 6/6] Refreshing semantic model...")
- refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id)
+ refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id, refresh=refresh)

  print("\n" + "=" * 70)
  print("🎉 Deployment Completed!")
@@ -645,7 +656,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
  return 0


- def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5):
+ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5, refresh="full"):
  """
  Copy a semantic model from one workspace to another.

@@ -658,6 +669,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
  destination: Destination in format "workspace/lakehouse.lakehouse/schema"
  new_model_name: Name for the new semantic model (default: same as source)
  wait_seconds: Seconds to wait before refresh (default: 5)
+ refresh: Refresh strategy (default: "full")
+ - "full": Clear values and process full refresh
+ - "ignore": Skip refresh entirely

  Returns:
  1 for success, 0 for failure
@@ -670,6 +684,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
  copy_model("Source WS", "Production Model", "Target WS/Data Lake.lakehouse/analytics",
  new_model_name="Production Model - Copy")

+ # Copy without refresh
+ copy_model("Source WS", "Model", "Target WS/LH.lakehouse/dbo", refresh="ignore")
+
  # Using the connect pattern
  import duckrun
  duckrun.semantic_model.copy_model("Source", "Model", "Target/LH.lakehouse/dbo")
@@ -796,7 +813,8 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
  schema_name=schema,
  dataset_name=new_model_name,
  bim_url_or_path=temp_bim_path,
- wait_seconds=wait_seconds
+ wait_seconds=wait_seconds,
+ refresh=refresh
  )

  # Clean up temp file
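
A hedged sketch of how the new refresh argument threads through the two public entry points changed above; the workspace, lakehouse, and model names are the hypothetical ones reused from the docstring examples.

    from duckrun.semantic_model import copy_model, deploy_semantic_model

    # Copy a model between workspaces and skip the post-deployment refresh entirely
    copy_model("Source WS", "Production Model", "Target WS/Data Lake.lakehouse/analytics",
               new_model_name="Production Model - Copy", refresh="ignore")

    # Deploy a BIM file and keep the default DirectLake refresh (clearValues followed by full)
    deploy_semantic_model("Target WS", "Data Lake", "analytics", "Production Model",
                          "./my_model.bim", refresh="full")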
duckrun/stats.py CHANGED
@@ -60,7 +60,50 @@ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
  return []


- def get_stats(duckrun_instance, source: str = None):
+ def _match_tables_by_pattern(duckrun_instance, pattern: str) -> dict:
+ """Match tables across all schemas using a wildcard pattern.
+ Pattern can be:
+ - '*.summary' - matches 'summary' table in all schemas
+ - '*summary' - matches any table ending with 'summary'
+ - 'schema.*' - matches all tables in 'schema'
+ Returns a dict mapping schema names to lists of matching table names."""
+ import fnmatch
+
+ try:
+ # Query all schemas and tables in one go
+ query = """
+ SELECT table_schema, table_name
+ FROM information_schema.tables
+ WHERE table_schema NOT LIKE 'pg_%'
+ AND table_schema != 'information_schema'
+ AND table_name NOT LIKE 'tbl_%'
+ """
+ result = duckrun_instance.con.execute(query).fetchall()
+
+ matched = {}
+
+ # Check if pattern contains a dot (schema.table pattern)
+ if '.' in pattern:
+ schema_pattern, table_pattern = pattern.split('.', 1)
+ for schema, table in result:
+ if fnmatch.fnmatch(schema, schema_pattern) and fnmatch.fnmatch(table, table_pattern):
+ if schema not in matched:
+ matched[schema] = []
+ matched[schema].append(table)
+ else:
+ # Pattern matches only table names
+ for schema, table in result:
+ if fnmatch.fnmatch(table, pattern):
+ if schema not in matched:
+ matched[schema] = []
+ matched[schema].append(table)
+
+ return matched
+ except:
+ return {}
+
+
+ def get_stats(duckrun_instance, source: str = None, detailed = False):
  """
  Get comprehensive statistics for Delta Lake tables.

@@ -71,25 +114,35 @@ def get_stats(duckrun_instance, source: str = None):
  - Table name: 'table_name' (uses main schema in DuckDB)
  - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
  - Schema only: 'schema' (all tables in schema, if multi-schema)
+ - Wildcard pattern: '*.summary' (matches tables across all schemas)
+ detailed: Optional. Controls the level of detail in statistics:
+ - False (default): Aggregated table-level stats (total rows, file count,
+ row groups, average row group size, file sizes, VORDER status)
+ - True: Row group level statistics with compression details, row group sizes,
+ and parquet metadata

  Returns:
- Arrow table with statistics including total rows, file count, row groups,
- average row group size, file sizes, VORDER status, and timestamp
+ DataFrame with statistics based on detailed parameter:
+ - If detailed=False: Aggregated table-level summary
+ - If detailed=True: Granular file and row group level stats

  Examples:
  con = duckrun.connect("tmp/data.lakehouse/test")

- # All tables in the connection's schema
+ # All tables in the connection's schema (aggregated)
  stats = con.get_stats()

- # Single table in main schema (DuckDB uses 'main', not 'test')
- stats = con.get_stats('price_today')
+ # Single table with detailed row group statistics
+ stats_detailed = con.get_stats('price_today', detailed=True)

  # Specific table in different schema (only if multi-schema enabled)
  stats = con.get_stats('aemo.price')

  # All tables in a schema (only if multi-schema enabled)
  stats = con.get_stats('aemo')
+
+ # Wildcard pattern across all schemas (only if multi-schema enabled)
+ stats = con.get_stats('*.summary')
  """
  timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

@@ -101,8 +154,27 @@ def get_stats(duckrun_instance, source: str = None):
  if source is None:
  source = url_schema

+ # Check if source contains wildcard characters
+ if '*' in source or '?' in source:
+ # Wildcard pattern mode - only valid if multi-schema is enabled
+ if not duckrun_instance.scan_all_schemas:
+ raise ValueError(f"Wildcard pattern '{source}' not supported. Connection was made to a specific schema '{url_schema}'. Enable multi-schema mode to use wildcards.")
+
+ matched_tables = _match_tables_by_pattern(duckrun_instance, source)
+
+ if not matched_tables:
+ raise ValueError(f"No tables found matching pattern '{source}'")
+
+ # Flatten the matched tables into a list with schema info
+ tables_with_schemas = []
+ for schema, tables in matched_tables.items():
+ for table in tables:
+ tables_with_schemas.append((schema, table))
+
+ print(f"Found {len(tables_with_schemas)} tables matching pattern '{source}'")
+
  # Parse the source and validate existence
- if '.' in source:
+ elif '.' in source:
  # Format: schema.table - only valid if multi-schema is enabled
  schema_name, table_name = source.split('.', 1)

@@ -113,46 +185,45 @@ def get_stats(duckrun_instance, source: str = None):
  if not _table_exists(duckrun_instance, schema_name, table_name):
  raise ValueError(f"Table '{table_name}' does not exist in schema '{schema_name}'")

- list_tables = [table_name]
+ tables_with_schemas = [(schema_name, table_name)]
  else:
  # Could be just table name or schema name
  if duckrun_instance.scan_all_schemas:
  # Multi-schema mode: DuckDB has actual schemas
  # First check if it's a table in main schema
  if _table_exists(duckrun_instance, duckdb_schema, source):
- list_tables = [source]
- schema_name = duckdb_schema
+ tables_with_schemas = [(duckdb_schema, source)]
  # Otherwise, check if it's a schema name
  elif _schema_exists(duckrun_instance, source):
  schema_name = source
  list_tables = _get_existing_tables_in_schema(duckrun_instance, source)
  if not list_tables:
  raise ValueError(f"Schema '{source}' exists but contains no tables")
+ tables_with_schemas = [(schema_name, tbl) for tbl in list_tables]
  else:
  raise ValueError(f"Neither table '{source}' in main schema nor schema '{source}' exists")
  else:
  # Single-schema mode: tables are in DuckDB's main schema, use URL schema for file paths
  if _table_exists(duckrun_instance, duckdb_schema, source):
  # It's a table name
- list_tables = [source]
- schema_name = url_schema # Use URL schema for file path construction
+ tables_with_schemas = [(url_schema, source)]
  elif source == url_schema:
  # Special case: user asked for stats on the URL schema name - list all tables
  list_tables = _get_existing_tables_in_schema(duckrun_instance, duckdb_schema)
- schema_name = url_schema # Use URL schema for file path construction
  if not list_tables:
  raise ValueError(f"No tables found in schema '{url_schema}'")
+ tables_with_schemas = [(url_schema, tbl) for tbl in list_tables]
  else:
  raise ValueError(f"Table '{source}' does not exist in the current context (schema: {url_schema})")

  # Use the existing connection
  con = duckrun_instance.con

- print(f"Processing {len(list_tables)} tables: {list_tables}")
+ print(f"Processing {len(tables_with_schemas)} tables from {len(set(s for s, t in tables_with_schemas))} schema(s)")

  successful_tables = []
- for idx, tbl in enumerate(list_tables):
- print(f"[{idx+1}/{len(list_tables)}] Processing table '{tbl}'...")
+ for idx, (schema_name, tbl) in enumerate(tables_with_schemas):
+ print(f"[{idx+1}/{len(tables_with_schemas)}] Processing table '{schema_name}.{tbl}'...")
  # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
  table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"

@@ -179,8 +250,18 @@ def get_stats(duckrun_instance, source: str = None):
  print(f"Warning: Could not convert RecordBatch for table '{tbl}': Unexpected type {type(add_actions)}")
  xx = {}

- # Check if VORDER exists
- vorder = 'tags.VORDER' in xx.keys()
+ # Check if VORDER exists - handle both formats:
+ # 1. Flattened format: 'tags.VORDER' or 'tags.vorder' in keys
+ # 2. Nested format: check in 'tags' dict for 'VORDER' or 'vorder'
+ vorder = False
+ if 'tags.VORDER' in xx.keys() or 'tags.vorder' in xx.keys():
+ vorder = True
+ elif 'tags' in xx.keys() and xx['tags']:
+ # Check nested tags dictionary (tags is a list of dicts, one per file)
+ for tag_dict in xx['tags']:
+ if tag_dict and ('VORDER' in tag_dict or 'vorder' in tag_dict):
+ vorder = True
+ break

  # Calculate total size
  total_size = sum(xx['size_bytes']) if xx['size_bytes'] else 0
@@ -195,6 +276,7 @@ def get_stats(duckrun_instance, source: str = None):
  con.execute(f'''
  CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
  SELECT
+ '{schema_name}' as schema,
  '{tbl}' as tbl,
  'empty' as file_name,
  0 as num_rows,
@@ -207,21 +289,36 @@ def get_stats(duckrun_instance, source: str = None):
  ''')
  else:
  # Get parquet metadata and create temp table with compression info
- con.execute(f'''
- CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
- SELECT
- '{tbl}' as tbl,
- fm.file_name,
- fm.num_rows,
- fm.num_row_groups,
- CEIL({total_size}/(1024*1024)) as size,
- {vorder} as vorder,
- COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
- '{timestamp}' as timestamp
- FROM parquet_file_metadata({delta}) fm
- LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
- GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
- ''')
+ if detailed == True:
+ # Detailed mode: Include ALL parquet_metadata columns
+ con.execute(f'''
+ CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+ SELECT
+ '{schema_name}' as schema,
+ '{tbl}' as tbl,
+ {vorder} as vorder,
+ pm.*,
+ '{timestamp}' as timestamp
+ FROM parquet_metadata({delta}) pm
+ ''')
+ else:
+ # Aggregated mode: Original summary statistics
+ con.execute(f'''
+ CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+ SELECT
+ '{schema_name}' as schema,
+ '{tbl}' as tbl,
+ fm.file_name,
+ fm.num_rows,
+ fm.num_row_groups,
+ CEIL({total_size}/(1024*1024)) as size,
+ {vorder} as vorder,
+ COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
+ '{timestamp}' as timestamp
+ FROM parquet_file_metadata({delta}) fm
+ LEFT JOIN parquet_metadata({delta}) pm ON fm.file_name = pm.file_name
+ GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
+ ''')

  except Exception as e:
  error_msg = str(e)
@@ -245,6 +342,7 @@ def get_stats(duckrun_instance, source: str = None):
  con.execute(f'''
  CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
  SELECT
+ '{schema_name}' as schema,
  '{tbl}' as tbl,
  'empty' as file_name,
  0 as num_rows,
@@ -269,21 +367,36 @@ def get_stats(duckrun_instance, source: str = None):
  filenames.append(table_path + "/" + filename)

  # Use parquet_file_metadata to get actual parquet stats with compression
- con.execute(f'''
- CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
- SELECT
- '{tbl}' as tbl,
- fm.file_name,
- fm.num_rows,
- fm.num_row_groups,
- 0 as size,
- false as vorder,
- COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
- '{timestamp}' as timestamp
- FROM parquet_file_metadata({filenames}) fm
- LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
- GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
- ''')
+ if detailed == True:
+ # Detailed mode: Include ALL parquet_metadata columns
+ con.execute(f'''
+ CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+ SELECT
+ '{schema_name}' as schema,
+ '{tbl}' as tbl,
+ false as vorder,
+ pm.*,
+ '{timestamp}' as timestamp
+ FROM parquet_metadata({filenames}) pm
+ ''')
+ else:
+ # Aggregated mode: Original summary statistics
+ con.execute(f'''
+ CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
+ SELECT
+ '{schema_name}' as schema,
+ '{tbl}' as tbl,
+ fm.file_name,
+ fm.num_rows,
+ fm.num_row_groups,
+ 0 as size,
+ false as vorder,
+ COALESCE(STRING_AGG(DISTINCT pm.compression, ', ' ORDER BY pm.compression), 'UNCOMPRESSED') as compression,
+ '{timestamp}' as timestamp
+ FROM parquet_file_metadata({filenames}) fm
+ LEFT JOIN parquet_metadata({filenames}) pm ON fm.file_name = pm.file_name
+ GROUP BY fm.file_name, fm.num_rows, fm.num_row_groups
+ ''')

  print(f" ✓ Successfully processed '{tbl}' using DuckDB fallback with parquet metadata")
  except Exception as fallback_error:
@@ -299,30 +412,44 @@ def get_stats(duckrun_instance, source: str = None):
  # No tables were processed successfully - return empty dataframe
  print("⚠️ No tables could be processed successfully")
  import pandas as pd
- return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
- 'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
+ if detailed == True:
+ return pd.DataFrame(columns=['schema', 'tbl', 'vorder', 'timestamp'])
+ else:
+ return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
+ 'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])

  # Union all successfully processed temp tables
  union_parts = [f'SELECT * FROM tbl_{i}' for i in successful_tables]
  union_query = ' UNION ALL '.join(union_parts)

- # Generate final summary
- final_result = con.execute(f'''
- SELECT
- tbl,
- SUM(num_rows) as total_rows,
- COUNT(*) as num_files,
- SUM(num_row_groups) as num_row_group,
- CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
- MIN(size) as file_size_MB,
- ANY_VALUE(vorder) as vorder,
- STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
- ANY_VALUE(timestamp) as timestamp
- FROM ({union_query})
- WHERE tbl IS NOT NULL
- GROUP BY tbl
- ORDER BY total_rows DESC
- ''').df()
+ # Generate final summary based on detailed flag
+ if detailed == True:
+ # Detailed mode: Return ALL parquet_metadata columns
+ final_result = con.execute(f'''
+ SELECT *
+ FROM ({union_query})
+ WHERE tbl IS NOT NULL
+ ORDER BY schema, tbl, file_name, row_group_id, column_id
+ ''').df()
+ else:
+ # Aggregated mode: Original summary statistics
+ final_result = con.execute(f'''
+ SELECT
+ schema,
+ tbl,
+ SUM(num_rows) as total_rows,
+ COUNT(*) as num_files,
+ SUM(num_row_groups) as num_row_group,
+ CAST(CEIL(SUM(num_rows)::DOUBLE / NULLIF(SUM(num_row_groups), 0)) AS INTEGER) as average_row_group,
+ MIN(size) as file_size_MB,
+ ANY_VALUE(vorder) as vorder,
+ STRING_AGG(DISTINCT compression, ', ' ORDER BY compression) as compression,
+ ANY_VALUE(timestamp) as timestamp
+ FROM ({union_query})
+ WHERE tbl IS NOT NULL
+ GROUP BY schema, tbl
+ ORDER BY total_rows DESC
+ ''').df()

  return final_result
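To make the new wildcard behaviour concrete, here is a small self-contained sketch of the matching rule that _match_tables_by_pattern applies, run against a hard-coded table list instead of DuckDB's information_schema; the schema and table names are made up.

    import fnmatch

    # Hypothetical (schema, table) pairs standing in for the information_schema query
    tables = [("aemo", "price"), ("aemo", "summary"), ("sales", "summary"), ("sales", "orders")]

    def match(pattern: str) -> dict:
        matched = {}
        if "." in pattern:
            # 'schema.table' form: each half is an fnmatch pattern
            schema_pattern, table_pattern = pattern.split(".", 1)
            pairs = [(s, t) for s, t in tables
                     if fnmatch.fnmatch(s, schema_pattern) and fnmatch.fnmatch(t, table_pattern)]
        else:
            # bare pattern: matched against table names only
            pairs = [(s, t) for s, t in tables if fnmatch.fnmatch(t, pattern)]
        for s, t in pairs:
            matched.setdefault(s, []).append(t)
        return matched

    print(match("*.summary"))   # {'aemo': ['summary'], 'sales': ['summary']}
    print(match("*summary"))    # same tables: any table name ending in 'summary'
    print(match("aemo.*"))      # {'aemo': ['price', 'summary']}
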
duckrun-0.2.19.dev1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: duckrun
- Version: 0.2.18.dev3
+ Version: 0.2.19.dev1
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
  Author: mim
  License: MIT
duckrun-0.2.19.dev1.dist-info/RECORD ADDED
@@ -0,0 +1,15 @@
+ duckrun/__init__.py,sha256=-DPOb_ETaBC0M7YqXj482FE1aZ-SxJeSeY6KB6hPgWU,350
+ duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
+ duckrun/core.py,sha256=jpg1okp6-Y4HubTJmSjyT9uhUc5pFr4A0tcNxNujSig,69086
+ duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
+ duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
+ duckrun/notebook.py,sha256=lzDRBoWZ_lePF-_5BbA1_42BImLZC5yrq6nzlmlKglM,12183
+ duckrun/runner.py,sha256=NGVyerJA44UP2umRdndfL0fuFM_gdOZmuJUz-PLOFf0,13461
+ duckrun/semantic_model.py,sha256=shRPBN1II60K_PH8JOqke-_3hAwLspcx4Add0VJRwwU,35913
+ duckrun/stats.py,sha256=8Qc9Mimvv7ALbOHw5-UPWrSflFrGrtkCQkB0QYL8jCw,21923
+ duckrun/writer.py,sha256=wIsU77DSj4J7d9_bIhvk6AbC51uUrLW0e6pcSPQOY1c,9424
+ duckrun-0.2.19.dev1.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+ duckrun-0.2.19.dev1.dist-info/METADATA,sha256=RvjFSOTabsqOYCk2ApzQ5ichMistEScyLKnrn61ODRs,20807
+ duckrun-0.2.19.dev1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ duckrun-0.2.19.dev1.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+ duckrun-0.2.19.dev1.dist-info/RECORD,,
duckrun-0.2.18.dev3.dist-info/RECORD REMOVED
@@ -1,15 +0,0 @@
- duckrun/__init__.py,sha256=vqv_bJjHjrrXGs8Zyxuy-GKTCyJlZ5z3npPQgE9ipBY,355
- duckrun/auth.py,sha256=EMaf-L2zeNOjbHOT97xYxfZNfWo4WrwrU1h3vBQTgEc,9624
- duckrun/core.py,sha256=irIepj0d-4J7Er5YeQIaOZQuycBYQ1FSNmTEBgaGVm4,68270
- duckrun/files.py,sha256=Fvdjg3DyHJzIVzKo8M_j-eGz4zU61lOB38Y_onbQJkI,10137
- duckrun/lakehouse.py,sha256=j--Z3zo8AOWt1GF9VzRosmmTAy6ey2D0LVubti58twU,14109
- duckrun/notebook.py,sha256=SzdKTpvzHiWMrvg7mCd3DN6R4gU_6Gm7gfkuETzylaE,12103
- duckrun/runner.py,sha256=NGVyerJA44UP2umRdndfL0fuFM_gdOZmuJUz-PLOFf0,13461
- duckrun/semantic_model.py,sha256=kc-g97A-Lbsa1H89EtumZTUPmGYN2uXhspGbG6ZuG2M,35049
- duckrun/stats.py,sha256=qvWnPk2P8Ob_tzaiNfdQmUQqMVq2FWv3EgArE7hPl44,15482
- duckrun/writer.py,sha256=wIsU77DSj4J7d9_bIhvk6AbC51uUrLW0e6pcSPQOY1c,9424
- duckrun-0.2.18.dev3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
- duckrun-0.2.18.dev3.dist-info/METADATA,sha256=tGhjPWM7NxHEQQbBp_PkSWNamMXJwJ92Ns8Qfse8TXs,20807
- duckrun-0.2.18.dev3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- duckrun-0.2.18.dev3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
- duckrun-0.2.18.dev3.dist-info/RECORD,,