duckrun 0.1.6.3__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
duckrun/core.py CHANGED
@@ -8,6 +8,9 @@ from string import Template
 import obstore as obs
 from obstore.store import AzureStore
 
+# Row Group configuration for optimal Delta Lake performance
+RG = 8_000_000
+
 
 class DeltaWriter:
     """Spark-style write API for Delta Lake"""
@@ -48,7 +51,7 @@ class DeltaWriter:
         df = self.relation.record_batch()
 
         print(f"Writing to Delta table: {schema}.{table} (mode={self._mode})")
-        write_deltalake(path, df, mode=self._mode)
+        write_deltalake(path, df, mode=self._mode, max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
 
         self.duckrun.con.sql(f"DROP VIEW IF EXISTS {table}")
         self.duckrun.con.sql(f"""
@@ -127,77 +130,57 @@ class Duckrun:
         self._attach_lakehouse()
 
     @classmethod
-    def connect(cls, workspace: Union[str, None] = None, lakehouse_name: Optional[str] = None,
-                schema: str = "dbo", sql_folder: Optional[str] = None,
+    def connect(cls, connection_string: str, sql_folder: Optional[str] = None,
                 compaction_threshold: int = 100):
         """
         Create and connect to lakehouse.
 
-        Supports two formats:
-        1. Compact: connect("ws/lh.lakehouse/schema", sql_folder=...) or connect("ws/lh.lakehouse")
-        2. Traditional: connect("ws", "lh", "schema", sql_folder) or connect("ws", "lh")
+        Uses compact format: connect("ws/lh.lakehouse/schema") or connect("ws/lh.lakehouse")
 
         Args:
-            workspace: Workspace name or full path "ws/lh.lakehouse/schema"
-            lakehouse_name: Lakehouse name (optional if using compact format)
-            schema: Schema name (defaults to "dbo")
+            connection_string: OneLake path "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
            sql_folder: Optional path or URL to SQL files folder
            compaction_threshold: File count threshold for compaction
 
         Examples:
-            # Compact format (second param treated as sql_folder if it's a URL/path string)
-            dr = Duckrun.connect("temp/power.lakehouse/wa", "https://github.com/.../sql/")
-            dr = Duckrun.connect("ws/lh.lakehouse/schema", "./sql")
+            dr = Duckrun.connect("ws/lh.lakehouse/schema", sql_folder="./sql")
             dr = Duckrun.connect("ws/lh.lakehouse/schema")  # no SQL folder
-
-            # Traditional format
-            dr = Duckrun.connect("ws", "lh", "schema", "./sql")
-            dr = Duckrun.connect("ws", "lh", "schema")
+            dr = Duckrun.connect("ws/lh.lakehouse")  # defaults to dbo schema
         """
         print("Connecting to Lakehouse...")
 
         scan_all_schemas = False
 
-        # Check if using compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
-        # If second param looks like a path/URL and not a lakehouse name, treat it as sql_folder
-        if workspace and "/" in workspace and (lakehouse_name is None or
-            (isinstance(lakehouse_name, str) and ('/' in lakehouse_name or lakehouse_name.startswith('http') or lakehouse_name.startswith('.')))):
-
-            # If lakehouse_name looks like a sql_folder, shift it
-            if lakehouse_name and ('/' in lakehouse_name or lakehouse_name.startswith('http') or lakehouse_name.startswith('.')):
-                sql_folder = lakehouse_name
-                lakehouse_name = None
-
-            parts = workspace.split("/")
-            if len(parts) == 2:
-                workspace, lakehouse_name = parts
-                scan_all_schemas = True
-                print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
-                print(f"   Scanning all schemas for table discovery...\n")
-            elif len(parts) == 3:
-                workspace, lakehouse_name, schema = parts
-            else:
-                raise ValueError(
-                    f"Invalid connection string format: '{workspace}'. "
-                    "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
-                )
-
-            if lakehouse_name.endswith(".lakehouse"):
-                lakehouse_name = lakehouse_name[:-10]
-        elif lakehouse_name is not None:
-            # Traditional format - check if schema was explicitly provided
-            if schema == "dbo":
-                scan_all_schemas = True
-                print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
-                print(f"   Scanning all schemas for table discovery...\n")
+        # Only support compact format: "ws/lh.lakehouse/schema" or "ws/lh.lakehouse"
+        if not connection_string or "/" not in connection_string:
+            raise ValueError(
+                "Invalid connection string format. "
+                "Expected format: 'workspace/lakehouse.lakehouse/schema' or 'workspace/lakehouse.lakehouse'"
+            )
+
+        parts = connection_string.split("/")
+        if len(parts) == 2:
+            workspace, lakehouse_name = parts
+            scan_all_schemas = True
+            schema = "dbo"
+            print(f"ℹ️ No schema specified. Using default schema 'dbo' for operations.")
+            print(f"   Scanning all schemas for table discovery...\n")
+        elif len(parts) == 3:
+            workspace, lakehouse_name, schema = parts
+        else:
+            raise ValueError(
+                f"Invalid connection string format: '{connection_string}'. "
+                "Expected format: 'workspace/lakehouse.lakehouse' or 'workspace/lakehouse.lakehouse/schema'"
+            )
+
+        if lakehouse_name.endswith(".lakehouse"):
+            lakehouse_name = lakehouse_name[:-10]
 
         if not workspace or not lakehouse_name:
             raise ValueError(
-                "Missing required parameters. Use either:\n"
+                "Missing required parameters. Use compact format:\n"
                 "  connect('workspace/lakehouse.lakehouse/schema', 'sql_folder')\n"
-                "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo\n"
-                "  connect('workspace', 'lakehouse', 'schema', 'sql_folder')\n"
-                "  connect('workspace', 'lakehouse')  # defaults to dbo"
+                "  connect('workspace/lakehouse.lakehouse')  # defaults to dbo"
             )
 
         return cls(workspace, lakehouse_name, schema, sql_folder, compaction_threshold, scan_all_schemas)
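
The rewritten `connect` accepts only the compact connection string. A standalone sketch of the parsing rules it enforces, using a hypothetical `parse_connection` helper for illustration:

```python
def parse_connection(connection_string: str):
    """Split 'ws/lh.lakehouse[/schema]' into (workspace, lakehouse, schema)."""
    if not connection_string or "/" not in connection_string:
        raise ValueError("Expected 'ws/lh.lakehouse' or 'ws/lh.lakehouse/schema'")
    parts = connection_string.split("/")
    if len(parts) == 2:
        workspace, lakehouse = parts
        schema = "dbo"  # no schema given: default to dbo, scan all schemas
    elif len(parts) == 3:
        workspace, lakehouse, schema = parts
    else:
        raise ValueError(f"Invalid connection string: '{connection_string}'")
    if lakehouse.endswith(".lakehouse"):
        lakehouse = lakehouse[: -len(".lakehouse")]
    return workspace, lakehouse, schema

assert parse_connection("ws/lh.lakehouse") == ("ws", "lh", "dbo")
assert parse_connection("ws/lh.lakehouse/sales") == ("ws", "lh", "sales")
```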
@@ -210,7 +193,7 @@ class Duckrun:
         if token != "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
             self.con.sql(f"CREATE OR REPLACE SECRET onelake (TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{token}')")
         else:
-            print("Please login to Azure CLI")
+            print("Authenticating with Azure (trying CLI, will fallback to browser if needed)...")
             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
             token = credential.get_token("https://storage.azure.com/.default")
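
The reworded message reflects what `ChainedTokenCredential` actually does: each credential in the chain is tried in order, so an existing Azure CLI login is used when available and a browser prompt appears only as a fallback. A minimal sketch of that chain in isolation:

```python
from azure.identity import (
    AzureCliCredential,
    ChainedTokenCredential,
    InteractiveBrowserCredential,
)

# CLI first; the browser credential is only consulted if the CLI fails
credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
token = credential.get_token("https://storage.azure.com/.default")
print(token.expires_on)  # epoch seconds; token.token carries the bearer string
```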
@@ -227,7 +210,7 @@ class Duckrun:
         """
         token = self._get_storage_token()
         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("Getting Azure token for table discovery...")
+            print("Authenticating with Azure for table discovery (trying CLI, will fallback to browser if needed)...")
             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
             token_obj = credential.get_token("https://storage.azure.com/.default")
@@ -426,7 +409,7 @@ class Duckrun:
         if mode == 'overwrite':
             self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
             df = self.con.sql(sql).record_batch()
-            write_deltalake(path, df, mode='overwrite')
+            write_deltalake(path, df, mode='overwrite', max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
             self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
             dt = DeltaTable(path)
             dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)
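
The overwrite branch follows a rewrite-then-refresh pattern: write the Delta table, recreate the DuckDB view over it with `delta_scan`, then vacuum aggressively. A hedged sketch of that flow against a local path (duckrun itself targets OneLake URLs):

```python
import duckdb
from deltalake import DeltaTable, write_deltalake

con = duckdb.connect()
path = "/tmp/demo_delta_table"  # illustrative; not a real OneLake path

# Materialize a query result and overwrite the Delta table with it
df = con.sql("SELECT 42 AS answer").record_batch()
write_deltalake(path, df, mode="overwrite")

# Refresh the view; delta_scan comes from DuckDB's delta extension
con.sql(f"CREATE OR REPLACE VIEW demo AS SELECT * FROM delta_scan('{path}')")

# Aggressive vacuum: drop all unreferenced files immediately
DeltaTable(path).vacuum(retention_hours=0, dry_run=False,
                        enforce_retention_duration=False)
```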
@@ -434,7 +417,7 @@ class Duckrun:
 
         elif mode == 'append':
             df = self.con.sql(sql).record_batch()
-            write_deltalake(path, df, mode='append')
+            write_deltalake(path, df, mode='append', max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
             self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
             dt = DeltaTable(path)
             if len(dt.file_uris()) > self.compaction_threshold:
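
On append, the table is compacted once its live file count exceeds `compaction_threshold`. A sketch of that check as a standalone helper, assuming deltalake's `DeltaTable.optimize.compact()` API and an illustrative threshold:

```python
from deltalake import DeltaTable

def compact_if_needed(path: str, threshold: int = 100) -> None:
    """Compact a Delta table once it accumulates too many small files."""
    dt = DeltaTable(path)
    if len(dt.file_uris()) > threshold:  # one URI per live data file
        dt.optimize.compact()            # rewrite small files into larger ones
        dt.vacuum(dry_run=False)         # then drop superseded files
```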
@@ -451,7 +434,7 @@ class Duckrun:
             print(f"Table {normalized_table} doesn't exist. Creating...")
             self.con.sql(f"DROP VIEW IF EXISTS {normalized_table}")
             df = self.con.sql(sql).record_batch()
-            write_deltalake(path, df, mode='overwrite')
+            write_deltalake(path, df, mode='overwrite', max_rows_per_file=RG, max_rows_per_group=RG, min_rows_per_group=RG)
             self.con.sql(f"CREATE OR REPLACE VIEW {normalized_table} AS SELECT * FROM delta_scan('{path}')")
             dt = DeltaTable(path)
             dt.vacuum(dry_run=False)
@@ -542,7 +525,7 @@ class Duckrun:
         # Get Azure token
         token = self._get_storage_token()
         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("Getting Azure token for file upload...")
+            print("Authenticating with Azure for file upload (trying CLI, will fallback to browser if needed)...")
             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
             token_obj = credential.get_token("https://storage.azure.com/.default")
@@ -649,7 +632,7 @@ class Duckrun:
         # Get Azure token
         token = self._get_storage_token()
         if token == "PLACEHOLDER_TOKEN_TOKEN_NOT_AVAILABLE":
-            print("Getting Azure token for file download...")
+            print("Authenticating with Azure for file download (trying CLI, will fallback to browser if needed)...")
             from azure.identity import AzureCliCredential, InteractiveBrowserCredential, ChainedTokenCredential
             credential = ChainedTokenCredential(AzureCliCredential(), InteractiveBrowserCredential())
             token_obj = credential.get_token("https://storage.azure.com/.default")
duckrun-0.1.6.3.dist-info/METADATA → duckrun-0.1.8.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.6.3
+Version: 0.1.8
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 Author: mim
 License: MIT
@@ -11,7 +11,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: duckdb>=1.2.0
-Requires-Dist: deltalake>=0.18.2
+Requires-Dist: deltalake<=0.18.2
 Requires-Dist: requests>=2.28.0
 Requires-Dist: obstore>=0.2.0
 Provides-Extra: local
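
Note the pin direction flips from `deltalake>=0.18.2` to `deltalake<=0.18.2`, consistent with the row-group parameters now used in `core.py`: the pyarrow-based writer in deltalake 0.18.x accepts them, while later releases move toward a Rust-based writer with a different signature. A small sketch for checking the installed version against the new pin (assumes the `packaging` library is available):

```python
from importlib.metadata import version
from packaging.version import Version

installed = Version(version("deltalake"))
assert installed <= Version("0.18.2"), f"deltalake {installed} violates the pin"
```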
@@ -20,7 +20,7 @@ Dynamic: license-file
 
 <img src="https://raw.githubusercontent.com/djouallah/duckrun/main/duckrun.png" width="400" alt="Duckrun">
 
-Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta Lake.
+A helper package for stuff that made my life easier when working with Fabric Python notebooks. Just the things that actually made sense to me - nothing fancy
 
 ## Important Notes
 
@@ -30,6 +30,10 @@ Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and
 
 **Why no spaces?** Duckrun uses simple name-based paths instead of GUIDs. This keeps the code clean and readable, which is perfect for data engineering workspaces where naming conventions are already well-established. Just use underscores or hyphens instead: `my_workspace` or `my-lakehouse`.
 
+## What It Does
+
+It does orchestration, arbitrary SQL statements, and file manipulation. That's it - just stuff I encounter in my daily workflow when working with Fabric notebooks.
+
 ## Installation
 
 ```bash
@@ -101,7 +105,7 @@ con.sql("SELECT * FROM dbo_customers").show()
 con.sql("SELECT * FROM bronze_raw_data").show()
 ```
 
-## Two Ways to Use Duckrun
+## Three Ways to Use Duckrun
 
 ### 1. Data Exploration (Spark-Style API)
 
duckrun-0.1.8.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
+duckrun/core.py,sha256=NrGriuJO7Mh1e9NKplNKkNleUWBpIKG5CwJGj3qNxxw,33334
+duckrun-0.1.8.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
+duckrun-0.1.8.dist-info/METADATA,sha256=CsvDljoHqgKfoDDdxHmNoKiR1PJNkqf6ye3hbxWm118,13847
+duckrun-0.1.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+duckrun-0.1.8.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
+duckrun-0.1.8.dist-info/RECORD,,
duckrun-0.1.6.3.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
-duckrun/__init__.py,sha256=L0jRtD9Ld8Ti4e6GRvPDdHvkQCFAPHM43GSP7ARh6EM,241
-duckrun/core.py,sha256=CT2NH5hCLsv4uB5zH3VxTuCVQy0nWkPBG-cICLPhG_8,34245
-duckrun-0.1.6.3.dist-info/licenses/LICENSE,sha256=-DeQQwdbCbkB4507ZF3QbocysB-EIjDtaLexvqRkGZc,1083
-duckrun-0.1.6.3.dist-info/METADATA,sha256=ny5DcRSU1B4SdHdJqHCYk0-hNo9-zqFABqMY9ulAVNk,13595
-duckrun-0.1.6.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-duckrun-0.1.6.3.dist-info/top_level.txt,sha256=BknMEwebbUHrVAp3SC92ps8MPhK7XSYsaogTvi_DmEU,8
-duckrun-0.1.6.3.dist-info/RECORD,,