duckrun 0.2.20.dev5__tar.gz → 0.2.21.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/PKG-INFO +92 -65
  2. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/README.md +91 -64
  3. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/writer.py +18 -6
  4. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun.egg-info/PKG-INFO +92 -65
  5. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/pyproject.toml +1 -1
  6. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/LICENSE +0 -0
  7. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/__init__.py +0 -0
  8. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/auth.py +0 -0
  9. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/core.py +0 -0
  10. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/ducklake_metadata.py +0 -0
  11. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/files.py +0 -0
  12. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/lakehouse.py +0 -0
  13. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/notebook.py +0 -0
  14. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/rle.py +0 -0
  15. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/runner.py +0 -0
  16. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/semantic_model.py +0 -0
  17. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun/stats.py +0 -0
  18. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun.egg-info/SOURCES.txt +0 -0
  19. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun.egg-info/dependency_links.txt +0 -0
  20. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun.egg-info/requires.txt +0 -0
  21. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/duckrun.egg-info/top_level.txt +0 -0
  22. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/setup.cfg +0 -0
  23. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/tests/test_checkpoint_format.py +0 -0
  24. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/tests/test_ducklake_export.py +0 -0
  25. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/tests/test_register.py +0 -0
  26. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/tests/test_rle.py +0 -0
  27. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/tests/test_writer_dictionary.py +0 -0
  28. {duckrun-0.2.20.dev5 → duckrun-0.2.21.dev1}/tests/test_writer_integration.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.20.dev5
3
+ Version: 0.2.21.dev1
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -20,13 +20,15 @@ Dynamic: license-file
20
20
 
21
21
  <img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
22
22
 
23
- A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB.
23
+ [![PyPI version](https://badge.fury.io/py/duckrun.svg)](https://badge.fury.io/py/duckrun)
24
+ [![Downloads](https://pepy.tech/badge/duckrun)](https://pepy.tech/project/duckrun)
25
+
26
+ A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB/Delta_rs.
24
27
 
25
28
  ## Important Notes
26
29
 
27
30
  **Requirements:**
28
- - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
29
- - **Workspace names with spaces are fully supported!** ✅
31
+ - Lakehouses without a schema are not supported
30
32
 
31
33
  **Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
32
34
 
@@ -48,68 +50,110 @@ pip install duckrun[local]
48
50
 
49
51
  Note: When running locally, your internet speed will be the main bottleneck.
50
52
 
51
- ## Quick Start
52
-
53
- ### Simple Example for New Users
53
+ ## Getting Started
54
54
 
55
55
  ```python
56
56
  import duckrun
57
57
 
58
- # Connect to a workspace and manage lakehouses
59
- con = duckrun.connect('My Workspace')
60
- con.list_lakehouses() # See what lakehouses exist
61
- con.create_lakehouse_if_not_exists('data') # Create if needed
62
-
63
- # Connect to a specific lakehouse and query data
64
- con = duckrun.connect("My Workspace/data.lakehouse/dbo")
58
+ # Connect to a lakehouse and start querying
59
+ con = duckrun.connect("My Workspace/MyLakehouse.lakehouse/dbo")
65
60
  con.sql("SELECT * FROM my_table LIMIT 10").show()
61
+
62
+ # Write results to a new table
63
+ con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
66
64
  ```
67
65
 
68
- ### Full Feature Overview
66
+ That's it! Connect to your lakehouse and run SQL queries with DuckDB's speed.
69
67
 
68
+ ## Core Functionalities
69
+
70
+ ### 1. **Data Exploration & Querying**
71
+ Query Delta tables using SQL with DuckDB performance:
70
72
  ```python
71
- import duckrun
73
+ con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
74
+ con.sql("SELECT * FROM sales WHERE year = 2024").show()
75
+ df = con.sql("SELECT COUNT(*) FROM orders").df()
76
+ ```
72
77
 
73
- # 1. Workspace Management (list and create lakehouses)
74
- ws = duckrun.connect("My Workspace")
75
- lakehouses = ws.list_lakehouses() # Returns list of lakehouse names
76
- ws.create_lakehouse_if_not_exists("New_Lakehouse")
78
+ ### 2. **Write to Delta Tables**
79
+ Use Spark-style API to write query results:
80
+ ```python
81
+ con.sql("SELECT * FROM source") \
82
+ .write \
83
+ .mode("overwrite") \
84
+ .saveAsTable("target")
85
+ ```
77
86
 
78
- # 2. Connect to lakehouse with a specific schema
79
- con = duckrun.connect("My Workspace/MyLakehouse.lakehouse/dbo")
87
+ ### 3. **Workspace Management**
88
+ List and create lakehouses:
89
+ ```python
90
+ ws = duckrun.connect("My Workspace")
91
+ ws.list_lakehouses()
92
+ ws.create_lakehouse_if_not_exists("New Lakehouse")
93
+ ```
80
94
 
81
- # Workspace names with spaces are supported!
82
- con = duckrun.connect("Data Analytics/SalesData.lakehouse/analytics")
95
+ ### 4. **File Management**
96
+ Upload/download files to OneLake Files:
97
+ ```python
98
+ con.copy("./local_folder", "remote_folder")
99
+ con.download("remote_folder", "./local_folder")
100
+ ```
83
101
 
84
- # Schema defaults to 'dbo' if not specified (scans all schemas)
85
- # ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
86
- con = duckrun.connect("My Workspace/My_Lakehouse.lakehouse")
102
+ ### 5. **Pipeline Orchestration**
103
+ Run SQL and Python tasks in sequence:
104
+ ```python
105
+ con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
106
+ pipeline = [
107
+ ('clean_data', 'overwrite'),
108
+ ('aggregate', 'append')
109
+ ]
110
+ con.run(pipeline)
111
+ ```
87
112
 
88
- # 3. Explore data
89
- con.sql("SELECT * FROM my_table LIMIT 10").show()
113
+ ### 6. **Semantic Model Deployment**
114
+ Deploy Power BI models with DirectLake:
115
+ ```python
116
+ con.deploy("https://github.com/user/repo/model.bim")
117
+ con.deploy("./local_model.bim", dataset_name="Sales Model")
118
+ ```
90
119
 
91
- # 4. Write to Delta tables (Spark-style API)
92
- con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
120
+ ### 7. **Download Semantic Models**
121
+ Download BIM files from deployed models:
122
+ ```python
123
+ bim_content = con.download_bim("Sales Model")
124
+ con.download_bim("Sales Model", "sales_model.bim")
125
+ ```
93
126
 
94
- # 5. Upload/download files to/from OneLake Files
95
- con.copy("./local_folder", "target_folder") # Upload files
96
- con.download("target_folder", "./downloaded") # Download files
127
+ ### 8. **Schema Evolution & Partitioning**
128
+ Handle evolving schemas and optimize with partitioning:
129
+ ```python
130
+ con.sql("SELECT * FROM source") \
131
+ .write \
132
+ .mode("append") \
133
+ .option("mergeSchema", "true") \
134
+ .partitionBy("region", "year") \
135
+ .saveAsTable("target")
97
136
  ```
98
137
 
99
- That's it! No `sql_folder` needed for data exploration.
138
+ ### 9. **SQL Lookup Functions**
139
+ Resolve workspace/lakehouse names from GUIDs in SQL:
140
+ ```python
141
+ con.sql("""
142
+ SELECT
143
+ workspace_id,
144
+ get_workspace_name(workspace_id) as workspace_name,
145
+ get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name
146
+ FROM storage_logs
147
+ """).show()
148
+ ```
100
149
 
101
150
  ## Connection Format
102
151
 
103
152
  ```python
104
- # Workspace management (list and create lakehouses)
105
- ws = duckrun.connect("My Workspace")
106
- ws.list_lakehouses() # Returns: ['lakehouse1', 'lakehouse2', ...]
107
- ws.create_lakehouse_if_not_exists("New Lakehouse")
108
-
109
- # Lakehouse connection with schema (recommended for best performance)
153
+ # Lakehouse connection with schema (recommended)
110
154
  con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
111
155
 
112
- # Supports workspace names with spaces!
156
+ # Workspace names with spaces are supported!
113
157
  con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")
114
158
 
115
159
  # Without schema (defaults to 'dbo', scans all schemas)
@@ -120,30 +164,13 @@ con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")
120
164
  con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo", sql_folder="./sql")
121
165
  ```
122
166
 
123
- ### Multi-Schema Support
167
+ ## Detailed Usage
124
168
 
125
- When you don't specify a schema, Duckrun will:
126
- - **Default to `dbo`** for write operations
127
- - **Scan all schemas** to discover and attach all Delta tables
128
- - **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
129
-
130
- **Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
131
-
132
- ```python
133
- # Fast: scans only 'dbo' schema
134
- con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
135
-
136
- # Slower: scans all schemas
137
- con = duckrun.connect("workspace/lakehouse.lakehouse")
138
-
139
- # Query tables from different schemas (when scanning all)
140
- con.sql("SELECT * FROM dbo_customers").show()
141
- con.sql("SELECT * FROM bronze_raw_data").show()
142
- ```
169
+ ### Data Exploration
143
170
 
144
- ## Three Ways to Use Duckrun
171
+ ## Detailed Documentation
145
172
 
146
- ### 1. Data Exploration (Spark-Style API)
173
+ ### Data Exploration
147
174
 
148
175
  Perfect for ad-hoc analysis and interactive notebooks:
149
176
 
@@ -184,7 +211,7 @@ con.sql("""
184
211
 
185
212
  **Note:** `.format("delta")` is optional - Delta is the default format!
186
213
 
187
- ### 2. File Management (OneLake Files)
214
+ ### File Management (OneLake Files)
188
215
 
189
216
  Upload and download files to/from OneLake Files section (not Delta tables):
190
217
 
@@ -215,7 +242,7 @@ con.download("daily_reports", "./reports", ['.csv'])
215
242
  - ✅ **Preserves folder structure** during upload/download
216
243
  - ✅ **Progress reporting** with file sizes and upload status
217
244
 
218
- ### 3. Pipeline Orchestration
245
+ ### Pipeline Orchestration
219
246
 
220
247
  For production workflows with reusable SQL and Python tasks:
221
248
 
@@ -1,12 +1,14 @@
1
1
  <img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
2
2
 
3
- A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB.
3
+ [![PyPI version](https://badge.fury.io/py/duckrun.svg)](https://badge.fury.io/py/duckrun)
4
+ [![Downloads](https://pepy.tech/badge/duckrun)](https://pepy.tech/project/duckrun)
5
+
6
+ A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB/Delta_rs.
4
7
 
5
8
  ## Important Notes
6
9
 
7
10
  **Requirements:**
8
- - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
9
- - **Workspace names with spaces are fully supported!** ✅
11
+ - Lakehouses without a schema are not supported
10
12
 
11
13
  **Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
12
14
 
@@ -28,68 +30,110 @@ pip install duckrun[local]
28
30
 
29
31
  Note: When running locally, your internet speed will be the main bottleneck.
30
32
 
31
- ## Quick Start
32
-
33
- ### Simple Example for New Users
33
+ ## Getting Started
34
34
 
35
35
  ```python
36
36
  import duckrun
37
37
 
38
- # Connect to a workspace and manage lakehouses
39
- con = duckrun.connect('My Workspace')
40
- con.list_lakehouses() # See what lakehouses exist
41
- con.create_lakehouse_if_not_exists('data') # Create if needed
42
-
43
- # Connect to a specific lakehouse and query data
44
- con = duckrun.connect("My Workspace/data.lakehouse/dbo")
38
+ # Connect to a lakehouse and start querying
39
+ con = duckrun.connect("My Workspace/MyLakehouse.lakehouse/dbo")
45
40
  con.sql("SELECT * FROM my_table LIMIT 10").show()
41
+
42
+ # Write results to a new table
43
+ con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
46
44
  ```
47
45
 
48
- ### Full Feature Overview
46
+ That's it! Connect to your lakehouse and run SQL queries with DuckDB's speed.
49
47
 
48
+ ## Core Functionalities
49
+
50
+ ### 1. **Data Exploration & Querying**
51
+ Query Delta tables using SQL with DuckDB performance:
50
52
  ```python
51
- import duckrun
53
+ con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
54
+ con.sql("SELECT * FROM sales WHERE year = 2024").show()
55
+ df = con.sql("SELECT COUNT(*) FROM orders").df()
56
+ ```
52
57
 
53
- # 1. Workspace Management (list and create lakehouses)
54
- ws = duckrun.connect("My Workspace")
55
- lakehouses = ws.list_lakehouses() # Returns list of lakehouse names
56
- ws.create_lakehouse_if_not_exists("New_Lakehouse")
58
+ ### 2. **Write to Delta Tables**
59
+ Use Spark-style API to write query results:
60
+ ```python
61
+ con.sql("SELECT * FROM source") \
62
+ .write \
63
+ .mode("overwrite") \
64
+ .saveAsTable("target")
65
+ ```
57
66
 
58
- # 2. Connect to lakehouse with a specific schema
59
- con = duckrun.connect("My Workspace/MyLakehouse.lakehouse/dbo")
67
+ ### 3. **Workspace Management**
68
+ List and create lakehouses:
69
+ ```python
70
+ ws = duckrun.connect("My Workspace")
71
+ ws.list_lakehouses()
72
+ ws.create_lakehouse_if_not_exists("New Lakehouse")
73
+ ```
60
74
 
61
- # Workspace names with spaces are supported!
62
- con = duckrun.connect("Data Analytics/SalesData.lakehouse/analytics")
75
+ ### 4. **File Management**
76
+ Upload/download files to OneLake Files:
77
+ ```python
78
+ con.copy("./local_folder", "remote_folder")
79
+ con.download("remote_folder", "./local_folder")
80
+ ```
63
81
 
64
- # Schema defaults to 'dbo' if not specified (scans all schemas)
65
- # ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
66
- con = duckrun.connect("My Workspace/My_Lakehouse.lakehouse")
82
+ ### 5. **Pipeline Orchestration**
83
+ Run SQL and Python tasks in sequence:
84
+ ```python
85
+ con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
86
+ pipeline = [
87
+ ('clean_data', 'overwrite'),
88
+ ('aggregate', 'append')
89
+ ]
90
+ con.run(pipeline)
91
+ ```
67
92
 
68
- # 3. Explore data
69
- con.sql("SELECT * FROM my_table LIMIT 10").show()
93
+ ### 6. **Semantic Model Deployment**
94
+ Deploy Power BI models with DirectLake:
95
+ ```python
96
+ con.deploy("https://github.com/user/repo/model.bim")
97
+ con.deploy("./local_model.bim", dataset_name="Sales Model")
98
+ ```
70
99
 
71
- # 4. Write to Delta tables (Spark-style API)
72
- con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
100
+ ### 7. **Download Semantic Models**
101
+ Download BIM files from deployed models:
102
+ ```python
103
+ bim_content = con.download_bim("Sales Model")
104
+ con.download_bim("Sales Model", "sales_model.bim")
105
+ ```
73
106
 
74
- # 5. Upload/download files to/from OneLake Files
75
- con.copy("./local_folder", "target_folder") # Upload files
76
- con.download("target_folder", "./downloaded") # Download files
107
+ ### 8. **Schema Evolution & Partitioning**
108
+ Handle evolving schemas and optimize with partitioning:
109
+ ```python
110
+ con.sql("SELECT * FROM source") \
111
+ .write \
112
+ .mode("append") \
113
+ .option("mergeSchema", "true") \
114
+ .partitionBy("region", "year") \
115
+ .saveAsTable("target")
77
116
  ```
78
117
 
79
- That's it! No `sql_folder` needed for data exploration.
118
+ ### 9. **SQL Lookup Functions**
119
+ Resolve workspace/lakehouse names from GUIDs in SQL:
120
+ ```python
121
+ con.sql("""
122
+ SELECT
123
+ workspace_id,
124
+ get_workspace_name(workspace_id) as workspace_name,
125
+ get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name
126
+ FROM storage_logs
127
+ """).show()
128
+ ```
80
129
 
81
130
  ## Connection Format
82
131
 
83
132
  ```python
84
- # Workspace management (list and create lakehouses)
85
- ws = duckrun.connect("My Workspace")
86
- ws.list_lakehouses() # Returns: ['lakehouse1', 'lakehouse2', ...]
87
- ws.create_lakehouse_if_not_exists("New Lakehouse")
88
-
89
- # Lakehouse connection with schema (recommended for best performance)
133
+ # Lakehouse connection with schema (recommended)
90
134
  con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
91
135
 
92
- # Supports workspace names with spaces!
136
+ # Workspace names with spaces are supported!
93
137
  con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")
94
138
 
95
139
  # Without schema (defaults to 'dbo', scans all schemas)
@@ -100,30 +144,13 @@ con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")
100
144
  con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo", sql_folder="./sql")
101
145
  ```
102
146
 
103
- ### Multi-Schema Support
147
+ ## Detailed Usage
104
148
 
105
- When you don't specify a schema, Duckrun will:
106
- - **Default to `dbo`** for write operations
107
- - **Scan all schemas** to discover and attach all Delta tables
108
- - **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
109
-
110
- **Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
111
-
112
- ```python
113
- # Fast: scans only 'dbo' schema
114
- con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
115
-
116
- # Slower: scans all schemas
117
- con = duckrun.connect("workspace/lakehouse.lakehouse")
118
-
119
- # Query tables from different schemas (when scanning all)
120
- con.sql("SELECT * FROM dbo_customers").show()
121
- con.sql("SELECT * FROM bronze_raw_data").show()
122
- ```
149
+ ### Data Exploration
123
150
 
124
- ## Three Ways to Use Duckrun
151
+ ## Detailed Documentation
125
152
 
126
- ### 1. Data Exploration (Spark-Style API)
153
+ ### Data Exploration
127
154
 
128
155
  Perfect for ad-hoc analysis and interactive notebooks:
129
156
 
@@ -164,7 +191,7 @@ con.sql("""
164
191
 
165
192
  **Note:** `.format("delta")` is optional - Delta is the default format!
166
193
 
167
- ### 2. File Management (OneLake Files)
194
+ ### File Management (OneLake Files)
168
195
 
169
196
  Upload and download files to/from OneLake Files section (not Delta tables):
170
197
 
@@ -195,7 +222,7 @@ con.download("daily_reports", "./reports", ['.csv'])
195
222
  - ✅ **Preserves folder structure** during upload/download
196
223
  - ✅ **Progress reporting** with file sizes and upload status
197
224
 
198
- ### 3. Pipeline Orchestration
225
+ ### Pipeline Orchestration
199
226
 
200
227
  For production workflows with reusable SQL and Python tasks:
201
228
 
@@ -81,11 +81,10 @@ def _build_write_deltalake_args(path, df, mode, schema_mode=None, partition_by=N
81
81
  args['max_rows_per_file'] = RG
82
82
  args['max_rows_per_group'] = RG
83
83
  args['min_rows_per_group'] = RG
84
- # Set ZSTD compression and dictionary encoding for PyArrow engine
84
+ # Set ZSTD compression for PyArrow engine
85
85
  if _HAS_PYARROW_DATASET:
86
86
  args['file_options'] = ds.ParquetFileFormat().make_write_options(
87
- compression='ZSTD',
88
- use_dictionary=True
87
+ compression='ZSTD'
89
88
  )
90
89
  else:
91
90
  # Version 0.20+: no optimization available (rust by default, no row group params supported)
@@ -115,9 +114,9 @@ class DeltaWriter:
115
114
  return self
116
115
 
117
116
  def mode(self, write_mode: str):
118
- """Set write mode: 'overwrite' or 'append'"""
119
- if write_mode not in {"overwrite", "append"}:
120
- raise ValueError(f"Mode must be 'overwrite' or 'append', got '{write_mode}'")
117
+ """Set write mode: 'overwrite', 'append', or 'ignore'"""
118
+ if write_mode not in {"overwrite", "append", "ignore"}:
119
+ raise ValueError(f"Mode must be 'overwrite', 'append', or 'ignore', got '{write_mode}'")
121
120
  self._mode = write_mode
122
121
  return self
123
122
 
@@ -155,6 +154,19 @@ class DeltaWriter:
155
154
 
156
155
  self.duckrun._create_onelake_secret()
157
156
  path = f"{self.duckrun.table_base_url}{schema}/{table}"
157
+
158
+ # Handle 'ignore' mode - skip if table already exists
159
+ if self._mode == 'ignore':
160
+ try:
161
+ DeltaTable(path)
162
+ print(f"Table {schema}.{table} exists. Skipping (mode='ignore')")
163
+ return table
164
+ except Exception:
165
+ # Table doesn't exist, proceed with creation
166
+ print(f"Creating table {schema}.{table} (mode='ignore', table doesn't exist)")
167
+ # Change mode to 'overwrite' for actual write
168
+ self._mode = 'overwrite'
169
+
158
170
  df = self.relation.record_batch()
159
171
 
160
172
  # Build write arguments based on schema_mode and partition_by
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: duckrun
3
- Version: 0.2.20.dev5
3
+ Version: 0.2.21.dev1
4
4
  Summary: Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)
5
5
  Author: mim
6
6
  License: MIT
@@ -20,13 +20,15 @@ Dynamic: license-file
20
20
 
21
21
  <img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
22
22
 
23
- A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB.
23
+ [![PyPI version](https://badge.fury.io/py/duckrun.svg)](https://badge.fury.io/py/duckrun)
24
+ [![Downloads](https://pepy.tech/badge/duckrun)](https://pepy.tech/project/duckrun)
25
+
26
+ A helper package for working with Microsoft Fabric lakehouses - orchestration, SQL queries, and file management powered by DuckDB/Delta_rs.
24
27
 
25
28
  ## Important Notes
26
29
 
27
30
  **Requirements:**
28
- - Lakehouse must have a schema (e.g., `dbo`, `sales`, `analytics`)
29
- - **Workspace names with spaces are fully supported!** ✅
31
+ - Lakehouses without a schema are not supported
30
32
 
31
33
  **Delta Lake Version:** This package uses an older version of deltalake to maintain row size control capabilities, which is crucial for Power BI performance optimization. The newer Rust-based deltalake versions don't yet support the row group size parameters that are essential for optimal DirectLake performance.
32
34
 
@@ -48,68 +50,110 @@ pip install duckrun[local]
48
50
 
49
51
  Note: When running locally, your internet speed will be the main bottleneck.
50
52
 
51
- ## Quick Start
52
-
53
- ### Simple Example for New Users
53
+ ## Getting Started
54
54
 
55
55
  ```python
56
56
  import duckrun
57
57
 
58
- # Connect to a workspace and manage lakehouses
59
- con = duckrun.connect('My Workspace')
60
- con.list_lakehouses() # See what lakehouses exist
61
- con.create_lakehouse_if_not_exists('data') # Create if needed
62
-
63
- # Connect to a specific lakehouse and query data
64
- con = duckrun.connect("My Workspace/data.lakehouse/dbo")
58
+ # Connect to a lakehouse and start querying
59
+ con = duckrun.connect("My Workspace/MyLakehouse.lakehouse/dbo")
65
60
  con.sql("SELECT * FROM my_table LIMIT 10").show()
61
+
62
+ # Write results to a new table
63
+ con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
66
64
  ```
67
65
 
68
- ### Full Feature Overview
66
+ That's it! Connect to your lakehouse and run SQL queries with DuckDB's speed.
69
67
 
68
+ ## Core Functionalities
69
+
70
+ ### 1. **Data Exploration & Querying**
71
+ Query Delta tables using SQL with DuckDB performance:
70
72
  ```python
71
- import duckrun
73
+ con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
74
+ con.sql("SELECT * FROM sales WHERE year = 2024").show()
75
+ df = con.sql("SELECT COUNT(*) FROM orders").df()
76
+ ```
72
77
 
73
- # 1. Workspace Management (list and create lakehouses)
74
- ws = duckrun.connect("My Workspace")
75
- lakehouses = ws.list_lakehouses() # Returns list of lakehouse names
76
- ws.create_lakehouse_if_not_exists("New_Lakehouse")
78
+ ### 2. **Write to Delta Tables**
79
+ Use Spark-style API to write query results:
80
+ ```python
81
+ con.sql("SELECT * FROM source") \
82
+ .write \
83
+ .mode("overwrite") \
84
+ .saveAsTable("target")
85
+ ```
77
86
 
78
- # 2. Connect to lakehouse with a specific schema
79
- con = duckrun.connect("My Workspace/MyLakehouse.lakehouse/dbo")
87
+ ### 3. **Workspace Management**
88
+ List and create lakehouses:
89
+ ```python
90
+ ws = duckrun.connect("My Workspace")
91
+ ws.list_lakehouses()
92
+ ws.create_lakehouse_if_not_exists("New Lakehouse")
93
+ ```
80
94
 
81
- # Workspace names with spaces are supported!
82
- con = duckrun.connect("Data Analytics/SalesData.lakehouse/analytics")
95
+ ### 4. **File Management**
96
+ Upload/download files to OneLake Files:
97
+ ```python
98
+ con.copy("./local_folder", "remote_folder")
99
+ con.download("remote_folder", "./local_folder")
100
+ ```
83
101
 
84
- # Schema defaults to 'dbo' if not specified (scans all schemas)
85
- # ⚠️ WARNING: Scanning all schemas can be slow for large lakehouses!
86
- con = duckrun.connect("My Workspace/My_Lakehouse.lakehouse")
102
+ ### 5. **Pipeline Orchestration**
103
+ Run SQL and Python tasks in sequence:
104
+ ```python
105
+ con = duckrun.connect("workspace/lakehouse.lakehouse/dbo", sql_folder="./sql")
106
+ pipeline = [
107
+ ('clean_data', 'overwrite'),
108
+ ('aggregate', 'append')
109
+ ]
110
+ con.run(pipeline)
111
+ ```
87
112
 
88
- # 3. Explore data
89
- con.sql("SELECT * FROM my_table LIMIT 10").show()
113
+ ### 6. **Semantic Model Deployment**
114
+ Deploy Power BI models with DirectLake:
115
+ ```python
116
+ con.deploy("https://github.com/user/repo/model.bim")
117
+ con.deploy("./local_model.bim", dataset_name="Sales Model")
118
+ ```
90
119
 
91
- # 4. Write to Delta tables (Spark-style API)
92
- con.sql("SELECT * FROM source").write.mode("overwrite").saveAsTable("target")
120
+ ### 7. **Download Semantic Models**
121
+ Download BIM files from deployed models:
122
+ ```python
123
+ bim_content = con.download_bim("Sales Model")
124
+ con.download_bim("Sales Model", "sales_model.bim")
125
+ ```
93
126
 
94
- # 5. Upload/download files to/from OneLake Files
95
- con.copy("./local_folder", "target_folder") # Upload files
96
- con.download("target_folder", "./downloaded") # Download files
127
+ ### 8. **Schema Evolution & Partitioning**
128
+ Handle evolving schemas and optimize with partitioning:
129
+ ```python
130
+ con.sql("SELECT * FROM source") \
131
+ .write \
132
+ .mode("append") \
133
+ .option("mergeSchema", "true") \
134
+ .partitionBy("region", "year") \
135
+ .saveAsTable("target")
97
136
  ```
98
137
 
99
- That's it! No `sql_folder` needed for data exploration.
138
+ ### 9. **SQL Lookup Functions**
139
+ Resolve workspace/lakehouse names from GUIDs in SQL:
140
+ ```python
141
+ con.sql("""
142
+ SELECT
143
+ workspace_id,
144
+ get_workspace_name(workspace_id) as workspace_name,
145
+ get_lakehouse_name(workspace_id, lakehouse_id) as lakehouse_name
146
+ FROM storage_logs
147
+ """).show()
148
+ ```
100
149
 
101
150
  ## Connection Format
102
151
 
103
152
  ```python
104
- # Workspace management (list and create lakehouses)
105
- ws = duckrun.connect("My Workspace")
106
- ws.list_lakehouses() # Returns: ['lakehouse1', 'lakehouse2', ...]
107
- ws.create_lakehouse_if_not_exists("New Lakehouse")
108
-
109
- # Lakehouse connection with schema (recommended for best performance)
153
+ # Lakehouse connection with schema (recommended)
110
154
  con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo")
111
155
 
112
- # Supports workspace names with spaces!
156
+ # Workspace names with spaces are supported!
113
157
  con = duckrun.connect("Data Analytics/Sales Data.lakehouse/analytics")
114
158
 
115
159
  # Without schema (defaults to 'dbo', scans all schemas)
@@ -120,30 +164,13 @@ con = duckrun.connect("My Workspace/My Lakehouse.lakehouse")
120
164
  con = duckrun.connect("My Workspace/My Lakehouse.lakehouse/dbo", sql_folder="./sql")
121
165
  ```
122
166
 
123
- ### Multi-Schema Support
167
+ ## Detailed Usage
124
168
 
125
- When you don't specify a schema, Duckrun will:
126
- - **Default to `dbo`** for write operations
127
- - **Scan all schemas** to discover and attach all Delta tables
128
- - **Prefix table names** with schema to avoid conflicts (e.g., `dbo_customers`, `bronze_raw_data`)
129
-
130
- **Performance Note:** Scanning all schemas requires listing all files in the lakehouse, which can be slow for large lakehouses with many tables. For better performance, always specify a schema when possible.
131
-
132
- ```python
133
- # Fast: scans only 'dbo' schema
134
- con = duckrun.connect("workspace/lakehouse.lakehouse/dbo")
135
-
136
- # Slower: scans all schemas
137
- con = duckrun.connect("workspace/lakehouse.lakehouse")
138
-
139
- # Query tables from different schemas (when scanning all)
140
- con.sql("SELECT * FROM dbo_customers").show()
141
- con.sql("SELECT * FROM bronze_raw_data").show()
142
- ```
169
+ ### Data Exploration
143
170
 
144
- ## Three Ways to Use Duckrun
171
+ ## Detailed Documentation
145
172
 
146
- ### 1. Data Exploration (Spark-Style API)
173
+ ### Data Exploration
147
174
 
148
175
  Perfect for ad-hoc analysis and interactive notebooks:
149
176
 
@@ -184,7 +211,7 @@ con.sql("""
184
211
 
185
212
  **Note:** `.format("delta")` is optional - Delta is the default format!
186
213
 
187
- ### 2. File Management (OneLake Files)
214
+ ### File Management (OneLake Files)
188
215
 
189
216
  Upload and download files to/from OneLake Files section (not Delta tables):
190
217
 
@@ -215,7 +242,7 @@ con.download("daily_reports", "./reports", ['.csv'])
215
242
  - ✅ **Preserves folder structure** during upload/download
216
243
  - ✅ **Progress reporting** with file sizes and upload status
217
244
 
218
- ### 3. Pipeline Orchestration
245
+ ### Pipeline Orchestration
219
246
 
220
247
  For production workflows with reusable SQL and Python tasks:
221
248
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "duckrun"
7
- version = "0.2.20.dev5"
7
+ version = "0.2.21.dev1"
8
8
  description = "Helper library for Fabric Python using duckdb, arrow and delta_rs (orchestration, queries, etc.)"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
File without changes
File without changes