duckrun 0.2.18.dev3.tar.gz → 0.2.18.dev5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/PKG-INFO +1 -1
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun/core.py +9 -2
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun/semantic_model.py +27 -9
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun/stats.py +94 -14
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun.egg-info/PKG-INFO +1 -1
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/pyproject.toml +1 -1
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/LICENSE +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/README.md +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun/__init__.py +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun/auth.py +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun/files.py +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun/lakehouse.py +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun/notebook.py +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun/runner.py +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun/writer.py +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun.egg-info/SOURCES.txt +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun.egg-info/dependency_links.txt +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun.egg-info/requires.txt +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/duckrun.egg-info/top_level.txt +0 -0
- {duckrun-0.2.18.dev3 → duckrun-0.2.18.dev5}/setup.cfg +0 -0
--- duckrun-0.2.18.dev3/duckrun/core.py
+++ duckrun-0.2.18.dev5/duckrun/core.py
@@ -1179,7 +1179,7 @@ class Duckrun(WorkspaceOperationsMixin):
             return False
 
     def deploy(self, bim_url: str, dataset_name: Optional[str] = None,
-               wait_seconds: int = 5) -> int:
+               wait_seconds: int = 5, refresh: str = "full") -> int:
         """
         Deploy a semantic model from a BIM file using DirectLake mode.
 
@@ -1190,6 +1190,9 @@ class Duckrun(WorkspaceOperationsMixin):
                 - Workspace/Model: "workspace_name/model_name"
             dataset_name: Name for the semantic model (default: schema name)
             wait_seconds: Seconds to wait for permission propagation (default: 5)
+            refresh: Refresh strategy:
+                - "full": Clear values and process full refresh (default)
+                - "ignore": Skip refresh entirely
 
         Returns:
             1 for success, 0 for failure
@@ -1205,6 +1208,9 @@ class Duckrun(WorkspaceOperationsMixin):
 
             # Deploy with custom name
             dr.deploy("https://github.com/.../model.bim", dataset_name="Sales Model")
+
+            # Deploy without refresh
+            dr.deploy("https://github.com/.../model.bim", refresh="ignore")
         """
         from .semantic_model import deploy_semantic_model
 
@@ -1227,7 +1233,8 @@ class Duckrun(WorkspaceOperationsMixin):
             schema_name=self.schema,
             dataset_name=dataset_name,
             bim_url_or_path=bim_url,
-            wait_seconds=wait_seconds
+            wait_seconds=wait_seconds,
+            refresh=refresh
         )
 
     def close(self):
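The new refresh keyword on Duckrun.deploy is passed straight through to deploy_semantic_model. A minimal usage sketch, assuming the usual duckrun.connect entry point (the workspace/lakehouse path and BIM URL are placeholders):

    import duckrun

    dr = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")

    # Default: deploy, then clear values and run a full refresh
    dr.deploy("https://github.com/.../model.bim")

    # New in 0.2.18.dev5: deploy only, skip the refresh step
    dr.deploy("https://github.com/.../model.bim", refresh="ignore")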
--- duckrun-0.2.18.dev3/duckrun/semantic_model.py
+++ duckrun-0.2.18.dev5/duckrun/semantic_model.py
@@ -129,16 +129,21 @@ def check_dataset_exists(dataset_name, workspace_id, client):
         return False
 
 
-def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None):
+def refresh_dataset(dataset_name, workspace_id, client, dataset_id=None, refresh="full"):
     """Refresh a dataset and monitor progress using Power BI API
 
-    For DirectLake models, performs
-
-
+    For DirectLake models, performs refresh based on refresh parameter:
+    - refresh="full": Two-step refresh (clearValues + full reframe)
+    - refresh="ignore": Skip refresh entirely
 
     If a refresh is already in progress, waits for it to complete before starting a new one.
     """
 
+    # Skip refresh entirely if refresh is "ignore"
+    if refresh == "ignore":
+        print(" Ignoring refresh - skipping refresh")
+        return
+
     # If dataset_id not provided, look it up by name
     if not dataset_id:
         dataset_id = get_dataset_id(dataset_name, workspace_id, client)
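The "full" strategy is described above as a two-step refresh (clearValues followed by a full reframe) against the Power BI refresh API. A rough sketch of what those two Enhanced Refresh calls look like with plain requests; this is illustrative only, not the package's actual client code, and the token and IDs are placeholders:

    import requests

    def two_step_refresh(token, workspace_id, dataset_id):
        # Power BI Enhanced Refresh endpoint; duckrun wraps this in its own client and polling logic.
        url = (f"https://api.powerbi.com/v1.0/myorg/groups/{workspace_id}"
               f"/datasets/{dataset_id}/refreshes")
        headers = {"Authorization": f"Bearer {token}"}

        # Step 1: clearValues drops the in-memory data of the DirectLake model
        requests.post(url, headers=headers, json={"type": "clearValues"}).raise_for_status()

        # Step 2: full refresh reframes the model against the latest Delta data
        requests.post(url, headers=headers, json={"type": "full"}).raise_for_status()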
@@ -539,7 +544,7 @@ def create_dataset_from_bim(dataset_name, bim_content, workspace_id, client):
 
 
 def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_name, dataset_name,
-                          bim_url_or_path, wait_seconds=5):
+                          bim_url_or_path, wait_seconds=5, refresh="full"):
     """
     Deploy a semantic model using DirectLake mode.
 
@@ -550,6 +555,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         dataset_name: Name for the semantic model
         bim_url_or_path: URL to the BIM file or local file path (e.g., 'model.bim' or 'https://...')
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely
 
     Returns:
         1 for success, 0 for failure
@@ -562,6 +570,9 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         # Using a local file
         dr.deploy("./my_model.bim")
         dr.deploy("C:/path/to/model.bim")
+
+        # Deploy without refresh
+        dr.deploy("./my_model.bim", refresh="ignore")
     """
     print("=" * 70)
     print("Semantic Model Deployment (DirectLake)")
@@ -586,7 +597,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         time.sleep(wait_seconds)
 
         print("\n[Step 3/3] Refreshing existing semantic model...")
-        refresh_dataset(dataset_name, workspace_id, client)
+        refresh_dataset(dataset_name, workspace_id, client, refresh=refresh)
 
         print("\n" + "=" * 70)
         print("🎉 Refresh Completed!")
@@ -618,7 +629,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
 
     # Step 6: Refresh using the dataset ID returned from creation
     print("\n[Step 6/6] Refreshing semantic model...")
-    refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id)
+    refresh_dataset(dataset_name, workspace_id, client, dataset_id=dataset_id, refresh=refresh)
 
     print("\n" + "=" * 70)
     print("🎉 Deployment Completed!")
@@ -645,7 +656,7 @@ def deploy_semantic_model(workspace_name_or_id, lakehouse_name_or_id, schema_nam
         return 0
 
 
-def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5):
+def copy_model(ws_source, model_name, destination, new_model_name=None, wait_seconds=5, refresh="full"):
     """
     Copy a semantic model from one workspace to another.
 
@@ -658,6 +669,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         destination: Destination in format "workspace/lakehouse.lakehouse/schema"
         new_model_name: Name for the new semantic model (default: same as source)
         wait_seconds: Seconds to wait before refresh (default: 5)
+        refresh: Refresh strategy (default: "full")
+            - "full": Clear values and process full refresh
+            - "ignore": Skip refresh entirely
 
     Returns:
         1 for success, 0 for failure
@@ -670,6 +684,9 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         copy_model("Source WS", "Production Model", "Target WS/Data Lake.lakehouse/analytics",
                    new_model_name="Production Model - Copy")
 
+        # Copy without refresh
+        copy_model("Source WS", "Model", "Target WS/LH.lakehouse/dbo", refresh="ignore")
+
         # Using the connect pattern
         import duckrun
         duckrun.semantic_model.copy_model("Source", "Model", "Target/LH.lakehouse/dbo")
@@ -796,7 +813,8 @@ def copy_model(ws_source, model_name, destination, new_model_name=None, wait_sec
         schema_name=schema,
         dataset_name=new_model_name,
         bim_url_or_path=temp_bim_path,
-        wait_seconds=wait_seconds
+        wait_seconds=wait_seconds,
+        refresh=refresh
     )
 
     # Clean up temp file
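copy_model now takes the same refresh argument and forwards it, together with wait_seconds, to deploy_semantic_model. Usage follows the docstring examples above (workspace and lakehouse names are placeholders):

    from duckrun.semantic_model import copy_model

    # Copy and run the default clearValues + full refresh on the new model
    copy_model("Source WS", "Production Model", "Target WS/Data Lake.lakehouse/analytics")

    # Copy only; skip the refresh on the destination model
    copy_model("Source WS", "Model", "Target WS/LH.lakehouse/dbo", refresh="ignore")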
--- duckrun-0.2.18.dev3/duckrun/stats.py
+++ duckrun-0.2.18.dev5/duckrun/stats.py
@@ -60,6 +60,49 @@ def _get_existing_tables_in_schema(duckrun_instance, schema_name: str) -> list:
         return []
 
 
+def _match_tables_by_pattern(duckrun_instance, pattern: str) -> dict:
+    """Match tables across all schemas using a wildcard pattern.
+    Pattern can be:
+    - '*.summary' - matches 'summary' table in all schemas
+    - '*summary' - matches any table ending with 'summary'
+    - 'schema.*' - matches all tables in 'schema'
+    Returns a dict mapping schema names to lists of matching table names."""
+    import fnmatch
+
+    try:
+        # Query all schemas and tables in one go
+        query = """
+            SELECT table_schema, table_name
+            FROM information_schema.tables
+            WHERE table_schema NOT LIKE 'pg_%'
+              AND table_schema != 'information_schema'
+              AND table_name NOT LIKE 'tbl_%'
+        """
+        result = duckrun_instance.con.execute(query).fetchall()
+
+        matched = {}
+
+        # Check if pattern contains a dot (schema.table pattern)
+        if '.' in pattern:
+            schema_pattern, table_pattern = pattern.split('.', 1)
+            for schema, table in result:
+                if fnmatch.fnmatch(schema, schema_pattern) and fnmatch.fnmatch(table, table_pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+        else:
+            # Pattern matches only table names
+            for schema, table in result:
+                if fnmatch.fnmatch(table, pattern):
+                    if schema not in matched:
+                        matched[schema] = []
+                    matched[schema].append(table)
+
+        return matched
+    except:
+        return {}
+
+
 def get_stats(duckrun_instance, source: str = None):
     """
     Get comprehensive statistics for Delta Lake tables.
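The new helper relies entirely on fnmatch, so patterns use shell-style globbing: a dot splits the pattern into separate schema and table globs, otherwise the whole pattern is matched against table names in every schema. A small self-contained check of that behaviour (the table list is made up for illustration):

    import fnmatch

    tables = [("dbo", "summary"), ("aemo", "summary"),
              ("aemo", "prices"), ("staging", "daily_summary")]

    def match(pattern):
        # Same dispatch as _match_tables_by_pattern: schema.table glob vs table-only glob
        if '.' in pattern:
            schema_pat, table_pat = pattern.split('.', 1)
            return [(s, t) for s, t in tables
                    if fnmatch.fnmatch(s, schema_pat) and fnmatch.fnmatch(t, table_pat)]
        return [(s, t) for s, t in tables if fnmatch.fnmatch(t, pattern)]

    print(match('*.summary'))   # 'summary' in every schema
    print(match('*summary'))    # any table name ending in 'summary'
    print(match('aemo.*'))      # every table in the aemo schema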
@@ -71,6 +114,7 @@ def get_stats(duckrun_instance, source: str = None):
         - Table name: 'table_name' (uses main schema in DuckDB)
         - Schema.table: 'schema.table_name' (specific table in schema, if multi-schema)
         - Schema only: 'schema' (all tables in schema, if multi-schema)
+        - Wildcard pattern: '*.summary' (matches tables across all schemas)
 
     Returns:
         Arrow table with statistics including total rows, file count, row groups,
@@ -90,6 +134,9 @@ def get_stats(duckrun_instance, source: str = None):
 
         # All tables in a schema (only if multi-schema enabled)
         stats = con.get_stats('aemo')
+
+        # Wildcard pattern across all schemas (only if multi-schema enabled)
+        stats = con.get_stats('*.summary')
     """
     timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 
@@ -101,8 +148,27 @@ def get_stats(duckrun_instance, source: str = None):
     if source is None:
         source = url_schema
 
+    # Check if source contains wildcard characters
+    if '*' in source or '?' in source:
+        # Wildcard pattern mode - only valid if multi-schema is enabled
+        if not duckrun_instance.scan_all_schemas:
+            raise ValueError(f"Wildcard pattern '{source}' not supported. Connection was made to a specific schema '{url_schema}'. Enable multi-schema mode to use wildcards.")
+
+        matched_tables = _match_tables_by_pattern(duckrun_instance, source)
+
+        if not matched_tables:
+            raise ValueError(f"No tables found matching pattern '{source}'")
+
+        # Flatten the matched tables into a list with schema info
+        tables_with_schemas = []
+        for schema, tables in matched_tables.items():
+            for table in tables:
+                tables_with_schemas.append((schema, table))
+
+        print(f"Found {len(tables_with_schemas)} tables matching pattern '{source}'")
+
     # Parse the source and validate existence
-
+    elif '.' in source:
         # Format: schema.table - only valid if multi-schema is enabled
         schema_name, table_name = source.split('.', 1)
 
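In get_stats, any source containing * or ? now takes a wildcard branch: it requires multi-schema mode (scan_all_schemas), resolves the pattern with _match_tables_by_pattern, and raises ValueError otherwise; non-wildcard sources fall through to the existing schema.table handling. A hedged usage sketch (the connection string is a placeholder and the multi-schema assumption is spelled out in the comments; the wildcard call mirrors the updated docstring):

    import duckrun

    # Assumption: connecting without a schema segment enables multi-schema mode (scan_all_schemas)
    con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")

    stats = con.get_stats('*.summary')     # 'summary' table in every schema
    stats = con.get_stats('aemo.price_*')  # glob on table names within one schema

    # On a single-schema connection the same call raises:
    # ValueError: Wildcard pattern '*.summary' not supported. ...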
@@ -113,46 +179,45 @@ def get_stats(duckrun_instance, source: str = None):
         if not _table_exists(duckrun_instance, schema_name, table_name):
             raise ValueError(f"Table '{table_name}' does not exist in schema '{schema_name}'")
 
-
+        tables_with_schemas = [(schema_name, table_name)]
     else:
         # Could be just table name or schema name
         if duckrun_instance.scan_all_schemas:
             # Multi-schema mode: DuckDB has actual schemas
             # First check if it's a table in main schema
             if _table_exists(duckrun_instance, duckdb_schema, source):
-
-                schema_name = duckdb_schema
+                tables_with_schemas = [(duckdb_schema, source)]
             # Otherwise, check if it's a schema name
             elif _schema_exists(duckrun_instance, source):
                 schema_name = source
                 list_tables = _get_existing_tables_in_schema(duckrun_instance, source)
                 if not list_tables:
                     raise ValueError(f"Schema '{source}' exists but contains no tables")
+                tables_with_schemas = [(schema_name, tbl) for tbl in list_tables]
             else:
                 raise ValueError(f"Neither table '{source}' in main schema nor schema '{source}' exists")
         else:
             # Single-schema mode: tables are in DuckDB's main schema, use URL schema for file paths
             if _table_exists(duckrun_instance, duckdb_schema, source):
                 # It's a table name
-
-                schema_name = url_schema  # Use URL schema for file path construction
+                tables_with_schemas = [(url_schema, source)]
             elif source == url_schema:
                 # Special case: user asked for stats on the URL schema name - list all tables
                 list_tables = _get_existing_tables_in_schema(duckrun_instance, duckdb_schema)
-                schema_name = url_schema  # Use URL schema for file path construction
                 if not list_tables:
                     raise ValueError(f"No tables found in schema '{url_schema}'")
+                tables_with_schemas = [(url_schema, tbl) for tbl in list_tables]
             else:
                 raise ValueError(f"Table '{source}' does not exist in the current context (schema: {url_schema})")
 
     # Use the existing connection
     con = duckrun_instance.con
 
-    print(f"Processing {len(
+    print(f"Processing {len(tables_with_schemas)} tables from {len(set(s for s, t in tables_with_schemas))} schema(s)")
 
     successful_tables = []
-    for idx, tbl in enumerate(
-        print(f"[{idx+1}/{len(
+    for idx, (schema_name, tbl) in enumerate(tables_with_schemas):
+        print(f"[{idx+1}/{len(tables_with_schemas)}] Processing table '{schema_name}.{tbl}'...")
         # Construct lakehouse path using correct ABFSS URL format (no .Lakehouse suffix)
         table_path = f"{duckrun_instance.table_base_url}{schema_name}/{tbl}"
 
@@ -179,8 +244,18 @@ def get_stats(duckrun_instance, source: str = None):
                 print(f"Warning: Could not convert RecordBatch for table '{tbl}': Unexpected type {type(add_actions)}")
                 xx = {}
 
-        # Check if VORDER exists
-
+        # Check if VORDER exists - handle both formats:
+        # 1. Flattened format: 'tags.VORDER' or 'tags.vorder' in keys
+        # 2. Nested format: check in 'tags' dict for 'VORDER' or 'vorder'
+        vorder = False
+        if 'tags.VORDER' in xx.keys() or 'tags.vorder' in xx.keys():
+            vorder = True
+        elif 'tags' in xx.keys() and xx['tags']:
+            # Check nested tags dictionary (tags is a list of dicts, one per file)
+            for tag_dict in xx['tags']:
+                if tag_dict and ('VORDER' in tag_dict or 'vorder' in tag_dict):
+                    vorder = True
+                    break
 
         # Calculate total size
         total_size = sum(xx['size_bytes']) if xx['size_bytes'] else 0
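The VORDER detection now covers both shapes the delta-rs add-action metadata can come back in: a flattened 'tags.VORDER' / 'tags.vorder' key, or a 'tags' entry holding one dict per file. A tiny stand-alone illustration with made-up add-action dicts:

    def has_vorder(xx):
        # Mirrors the new check: flattened key first, then the nested per-file tag dicts
        if 'tags.VORDER' in xx or 'tags.vorder' in xx:
            return True
        for tag_dict in (xx.get('tags') or []):
            if tag_dict and ('VORDER' in tag_dict or 'vorder' in tag_dict):
                return True
        return False

    print(has_vorder({'tags.VORDER': ['true']}))             # flattened format -> True
    print(has_vorder({'tags': [{'VORDER': 'true'}, None]}))  # nested format -> True
    print(has_vorder({'size_bytes': [1234]}))                # no tags -> False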
@@ -195,6 +270,7 @@ def get_stats(duckrun_instance, source: str = None):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     'empty' as file_name,
                     0 as num_rows,
@@ -210,6 +286,7 @@ def get_stats(duckrun_instance, source: str = None):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     fm.file_name,
                     fm.num_rows,
@@ -245,6 +322,7 @@ def get_stats(duckrun_instance, source: str = None):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     'empty' as file_name,
                     0 as num_rows,
@@ -272,6 +350,7 @@ def get_stats(duckrun_instance, source: str = None):
             con.execute(f'''
                 CREATE OR REPLACE TEMP TABLE tbl_{idx} AS
                 SELECT
+                    '{schema_name}' as schema,
                     '{tbl}' as tbl,
                     fm.file_name,
                     fm.num_rows,
@@ -299,7 +378,7 @@ def get_stats(duckrun_instance, source: str = None):
         # No tables were processed successfully - return empty dataframe
         print("⚠️ No tables could be processed successfully")
         import pandas as pd
-        return pd.DataFrame(columns=['tbl', 'total_rows', 'num_files', 'num_row_group',
+        return pd.DataFrame(columns=['schema', 'tbl', 'total_rows', 'num_files', 'num_row_group',
                                      'average_row_group', 'file_size_MB', 'vorder', 'compression', 'timestamp'])
 
     # Union all successfully processed temp tables
@@ -309,6 +388,7 @@ def get_stats(duckrun_instance, source: str = None):
     # Generate final summary
     final_result = con.execute(f'''
         SELECT
+            schema,
             tbl,
             SUM(num_rows) as total_rows,
             COUNT(*) as num_files,
@@ -320,7 +400,7 @@ def get_stats(duckrun_instance, source: str = None):
             ANY_VALUE(timestamp) as timestamp
         FROM ({union_query})
         WHERE tbl IS NOT NULL
-        GROUP BY tbl
+        GROUP BY schema, tbl
         ORDER BY total_rows DESC
     ''').df()
 
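Because each per-table temp table now carries a literal schema column, the final summary groups on (schema, tbl) instead of tbl alone, so same-named tables in different schemas no longer collapse into one row. A minimal DuckDB sketch of that aggregation over made-up rows:

    import duckdb

    con = duckdb.connect()
    con.execute("""
        CREATE TEMP TABLE file_stats AS
        SELECT * FROM (VALUES
            ('dbo',  'summary', 100),
            ('aemo', 'summary', 250),
            ('aemo', 'summary', 250)
        ) AS t(schema, tbl, num_rows)
    """)
    print(con.execute("""
        SELECT schema, tbl, SUM(num_rows) AS total_rows, COUNT(*) AS num_files
        FROM file_stats
        GROUP BY schema, tbl
        ORDER BY total_rows DESC
    """).df())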
--- duckrun-0.2.18.dev3/pyproject.toml
+++ duckrun-0.2.18.dev5/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "duckrun"
-version = "0.2.18.dev3"
+version = "0.2.18.dev5"
 description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
 readme = "README.md"
 license = {text = "MIT"}