duckrun 0.1.1.tar.gz → 0.1.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: duckrun
-Version: 0.1.1
+Version: 0.1.2
 Summary: Lakehouse task runner powered by DuckDB for Microsoft Fabric
 License-Expression: MIT
 Project-URL: Homepage, https://github.com/djouallah/duckrun
@@ -14,10 +14,16 @@ Requires-Dist: deltalake>=0.18.2
 Requires-Dist: requests>=2.28.0
 Dynamic: license-file
 
-# 🦆 Duckrun
+
+<img src="duckrun.png" width="400" alt="Duckrun">
 
 Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta_rs.
 
+
+## Known Limitation
+
+Supports only lakehouses with schemas; workspace and lakehouse names must not contain spaces.
+
 ## Installation
 
 ```bash
@@ -50,6 +56,10 @@ pipeline = [
 lakehouse.run(pipeline)
 ```
 
+## Early Exit
+
+If a task fails, the pipeline stops without running the subsequent tasks.
+
 ## How It Works
 
 Duckrun runs two types of tasks:
@@ -117,8 +127,8 @@ Use `__` to create variants of the same table:
 
 ```python
 pipeline = [
-    ('sales__initial', 'overwrite', {}),      # writes to 'sales' table
-    ('sales__incremental', 'append', {}),     # appends to 'sales' table
+    ('sales__initial', 'overwrite'),          # writes to 'sales' table
+    ('sales__incremental', 'append'),         # appends to 'sales' table
 ]
 ```
 
@@ -134,32 +144,7 @@ lakehouse.sql("SELECT * FROM my_table LIMIT 10").show()
 df = lakehouse.sql("SELECT COUNT(*) FROM sales").df()
 ```
 
-## Real-World Example
 
-```python
-import duckrun as dr
-
-lakehouse = dr.connect(
-    workspace="Analytics",
-    lakehouse_name="Sales",
-    schema="dbo",
-    sql_folder="./etl"
-)
-
-# Daily pipeline
-daily = [
-    ('download_files', (api_url, local_path)),
-    ('staging_orders', 'overwrite', {'run_date': '2024-06-01'}),
-    ('staging_customers', 'overwrite', {'run_date': '2024-06-01'}),
-    ('fact_sales', 'append'),
-    ('dim_customer', 'overwrite')
-]
-
-lakehouse.run(daily)
-
-# Check results
-lakehouse.sql("SELECT COUNT(*) FROM fact_sales").show()
-```
 
 ## Remote SQL Files
 
@@ -1,7 +1,13 @@
-# 🦆 Duckrun
+
+<img src="duckrun.png" width="400" alt="Duckrun">
 
 Simple task runner for Microsoft Fabric Python notebooks, powered by DuckDB and Delta_rs.
 
+
+## Known Limitation
+
+Supports only lakehouses with schemas; workspace and lakehouse names must not contain spaces.
+
 ## Installation
 
 ```bash
@@ -34,6 +40,10 @@ pipeline = [
 lakehouse.run(pipeline)
 ```
 
+## Early Exit
+
+If a task fails, the pipeline stops without running the subsequent tasks.
+
 ## How It Works
 
 Duckrun runs two types of tasks:
@@ -101,8 +111,8 @@ Use `__` to create variants of the same table:
 
 ```python
 pipeline = [
-    ('sales__initial', 'overwrite', {}),      # writes to 'sales' table
-    ('sales__incremental', 'append', {}),     # appends to 'sales' table
+    ('sales__initial', 'overwrite'),          # writes to 'sales' table
+    ('sales__incremental', 'append'),         # appends to 'sales' table
 ]
 ```
 
@@ -118,32 +128,7 @@ lakehouse.sql("SELECT * FROM my_table LIMIT 10").show()
 df = lakehouse.sql("SELECT COUNT(*) FROM sales").df()
 ```
 
-## Real-World Example
 
-```python
-import duckrun as dr
-
-lakehouse = dr.connect(
-    workspace="Analytics",
-    lakehouse_name="Sales",
-    schema="dbo",
-    sql_folder="./etl"
-)
-
-# Daily pipeline
-daily = [
-    ('download_files', (api_url, local_path)),
-    ('staging_orders', 'overwrite', {'run_date': '2024-06-01'}),
-    ('staging_customers', 'overwrite', {'run_date': '2024-06-01'}),
-    ('fact_sales', 'append'),
-    ('dim_customer', 'overwrite')
-]
-
-lakehouse.run(daily)
-
-# Check results
-lakehouse.sql("SELECT COUNT(*) FROM fact_sales").show()
-```
 
 ## Remote SQL Files
 
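The "Early Exit" section added in both README hunks describes stop-on-first-failure semantics. A minimal sketch of that control flow, assuming a `run_task` callable that raises on failure (a hypothetical stand-in, not duckrun's internals):

```python
# Hedged sketch of early exit: stop at the first failing task so the
# remaining tasks never run. `run_task` is a hypothetical executor.
def run(pipeline, run_task):
    for position, task in enumerate(pipeline, start=1):
        try:
            run_task(task)
        except Exception as exc:
            print(f"Task {position} ({task[0]!r}) failed: {exc}")
            return False    # early exit: subsequent tasks are skipped
    return True
```

Whether the real runner returns a status or re-raises is not shown in this diff; the point is only that tasks downstream of a failure never execute.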
@@ -64,9 +64,14 @@ class Duckrun:
     def _attach_lakehouse(self):
         self._create_onelake_secret()
         try:
+            # Exclude Iceberg metadata folders when scanning for Delta tables
            list_tables_query = f"""
                SELECT DISTINCT(split_part(file, '_delta_log', 1)) as tables
                FROM glob ("abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{self.lakehouse_name}.Lakehouse/Tables/*/*/_delta_log/*.json")
+               WHERE file NOT LIKE '%/metadata/%'
+                 AND file NOT LIKE '%/iceberg/%'
+                 AND split_part(file, '_delta_log', 1) NOT LIKE '%/metadata'
+                 AND split_part(file, '_delta_log', 1) NOT LIKE '%/iceberg'
            """
            list_tables_df = self.con.sql(list_tables_query).df()
            list_tables = list_tables_df['tables'].tolist() if not list_tables_df.empty else []
@@ -82,18 +87,27 @@ class Duckrun:
                if len(parts) >= 2:
                    potential_schema = parts[-2]
                    table = parts[-1]
+
+                   # Skip Iceberg-related folders
+                   if table in ('metadata', 'iceberg') or potential_schema in ('metadata', 'iceberg'):
+                       continue
+
                    if potential_schema == self.schema:
                        try:
                            self.con.sql(f"""
                                CREATE OR REPLACE VIEW {table}
                                AS SELECT * FROM delta_scan('{self.table_base_url}{self.schema}/{table}');
                            """)
+                           print(f"  ✓ Attached: {table}")
                        except Exception as e:
-                           print(f"Error creating view for table {table}: {e}")
+                           print(f"  Skipped {table}: {str(e)[:100]}")
+                           continue
+
            print("\nAttached tables (views) in DuckDB:")
            self.con.sql("SELECT name FROM (SHOW ALL TABLES) WHERE database='memory'").show()
        except Exception as e:
            print(f"Error attaching lakehouse: {e}")
+           print("Continuing without pre-attached tables.")
 
     def _normalize_table_name(self, name: str) -> str:
         """Extract base table name before first '__'"""
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "duckrun"
-version = "0.1.1"
+version = "0.1.2"
 description = "Lakehouse task runner powered by DuckDB for Microsoft Fabric"
 readme = "README.md"
 license = "MIT"
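The pyproject bump mirrors the PKG-INFO change. To confirm the installed version after `pip install --upgrade duckrun` (standard-library metadata lookup, nothing duckrun-specific):

```python
# Verify which duckrun version is installed after upgrading.
from importlib.metadata import version
print(version("duckrun"))   # expected: 0.1.2
```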