laken 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- laken-0.1.0/.gitignore +17 -0
- laken-0.1.0/PKG-INFO +298 -0
- laken-0.1.0/README.md +281 -0
- laken-0.1.0/pyproject.toml +51 -0
- laken-0.1.0/src/laken/__init__.py +23 -0
- laken-0.1.0/src/laken/cli.py +127 -0
- laken-0.1.0/src/laken/deploy/__init__.py +3 -0
- laken-0.1.0/src/laken/deploy/build.py +6 -0
- laken-0.1.0/src/laken/deploy/config.py +47 -0
- laken-0.1.0/src/laken/deploy/fabric_client.py +80 -0
- laken-0.1.0/src/laken/deploy/project.py +30 -0
- laken-0.1.0/src/laken/deploy/wheel.py +60 -0
- laken-0.1.0/src/laken/fabric.py +227 -0
- laken-0.1.0/src/laken/frames.py +58 -0
- laken-0.1.0/src/laken/lakehouse.py +181 -0
- laken-0.1.0/src/laken/local.py +416 -0
- laken-0.1.0/src/laken/onelake_fetcher.py +222 -0
- laken-0.1.0/src/laken/paths.py +35 -0
- laken-0.1.0/src/laken/protocol.py +95 -0
- laken-0.1.0/src/laken/py.typed +0 -0
- laken-0.1.0/src/laken/types.py +11 -0
- laken-0.1.0/src/laken/workspace.py +81 -0
laken-0.1.0/.gitignore
ADDED
laken-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: laken
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local and Fabric lakehouse abstraction for modular, testable data code
|
|
5
|
+
Author-email: Cody VanZandt <cody.a.vanzandt@gmail.com>
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: deltalake>=1.6.0
|
|
8
|
+
Requires-Dist: packaging>=24.0
|
|
9
|
+
Requires-Dist: pandas>=3.0.3
|
|
10
|
+
Requires-Dist: polars>=1.40.1
|
|
11
|
+
Requires-Dist: pyarrow>=24.0.0
|
|
12
|
+
Requires-Dist: pyspark>=3.5
|
|
13
|
+
Requires-Dist: python-dotenv>=1.0
|
|
14
|
+
Requires-Dist: requests>=2.32
|
|
15
|
+
Requires-Dist: typer>=0.15
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
<div align="center">
|
|
19
|
+
|
|
20
|
+
# laken
|
|
21
|
+
|
|
22
|
+
**The missing local development workflow for Microsoft Fabric.**
|
|
23
|
+
|
|
24
|
+
[Python 3.11+](https://www.python.org/downloads/)
|
|
25
|
+
[PyPI](https://pypi.org/project/laken/)
|
|
26
|
+
[Microsoft Fabric](https://www.microsoft.com/microsoft-fabric)
|
|
27
|
+
|
|
28
|
+
</div>
|
|
29
|
+
|
|
30
|
+
<br>
|
|
31
|
+
|
|
32
|
+
**laken** lets you develop Python code for Fabric locally, using the tooling you already trust.
|
|
33
|
+
|
|
34
|
+
Write local code against real lakehouse data. The same code runs in Fabric without modification.
|
|
35
|
+
|
|
36
|
+
When you're ready, `laken deploy` packages your project, publishes it to Fabric, and makes it
|
|
37
|
+
available to your Fabric notebooks.
|
|
38
|
+
|
|
39
|
+
Keep your code modular, your notebooks thin, and your local workflow intact.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
Install [uv](https://docs.astral.sh/uv/getting-started/installation/) if needed, then add
|
|
46
|
+
`laken`:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
uv add laken
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install laken
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Deploy uses [uv](https://docs.astral.sh/uv/getting-started/installation/) to build your
|
|
57
|
+
wheel before publishing to a Fabric environment.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Develop against your Fabric lakehouse
|
|
62
|
+
|
|
63
|
+
Set your credentials, select your workspace and lakehouse:
|
|
64
|
+
|
|
65
|
+
```env
|
|
66
|
+
AZURE_TENANT_ID=...
|
|
67
|
+
AZURE_CLIENT_ID=...
|
|
68
|
+
AZURE_CLIENT_SECRET=...
|
|
69
|
+
FABRIC_WORKSPACE_NAME=MyWorkspace
|
|
70
|
+
FABRIC_LAKEHOUSE_NAME=MyLakehouse
|
|
71
|
+
FABRIC_WORKSPACE_ID=...
|
|
72
|
+
FABRIC_LAKEHOUSE_ID=...
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from laken import Lakehouse
|
|
77
|
+
|
|
78
|
+
lh = Lakehouse()
|
|
79
|
+
products = lh.read_table("marketing.products", as_="pandas")
|
|
80
|
+
|
|
81
|
+
lh.write_table(products, "staging.products_snapshot")
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
`Lakehouse` detects when it is running locally and when it is running inside Fabric.
|
|
85
|
+
|
|
86
|
+
Locally, the first `read_table` for a Fabric table pulls from OneLake and caches it under
|
|
87
|
+
`.laken/` as Delta; later reads use the cache. In a Fabric notebook, the same code reads
|
|
88
|
+
from your attached lakehouse.
|
|
89
|
+
|
|
90
|
+
Local writes stay under `.laken/` and do not sync to Fabric; in Fabric, writes persist to
|
|
91
|
+
tables on the attached lakehouse.
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## Deploy to Fabric
|
|
96
|
+
|
|
97
|
+
Structure your local code as a Python project using the standard
|
|
98
|
+
[src layout](https://packaging.python.org/en/latest/discussions/src-layout-vs-flat-layout/):
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
myapp/
|
|
102
|
+
├── pyproject.toml # [project] name = "myapp"
|
|
103
|
+
├── src/
|
|
104
|
+
│ └── myapp/
|
|
105
|
+
│ ├── __init__.py
|
|
106
|
+
│ └── pipeline.py
|
|
107
|
+
└── .env
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Add `laken` to your project dependencies.
|
|
111
|
+
|
|
112
|
+
See the
|
|
113
|
+
[Python packaging guide](https://packaging.python.org/en/latest/tutorials/packaging-projects/)
|
|
114
|
+
if you are setting this up for the first time.
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
# src/myapp/pipeline.py
|
|
118
|
+
import pandas as pd
|
|
119
|
+
|
|
120
|
+
from laken import Lakehouse
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def run_pipeline(lh: Lakehouse) -> None:
|
|
124
|
+
products = lh.read_table("marketing.products", as_="pandas")
|
|
125
|
+
summary = products.groupby("category", as_index=False)["amount"].sum()
|
|
126
|
+
lh.write_table(summary, "staging.product_summary")
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
When you are ready, `laken deploy` builds your package and loads it into your specified
|
|
130
|
+
Fabric Environment.
|
|
131
|
+
|
|
132
|
+
Deploy credentials (`.env` or shell):
|
|
133
|
+
|
|
134
|
+
```env
|
|
135
|
+
AZURE_TENANT_ID=...
|
|
136
|
+
AZURE_CLIENT_ID=...
|
|
137
|
+
AZURE_CLIENT_SECRET=...
|
|
138
|
+
FABRIC_WORKSPACE_ID=...
|
|
139
|
+
FABRIC_ENVIRONMENT_ID=...
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
From the repo root:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
laken deploy
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
In a Fabric notebook:
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from laken import Lakehouse
|
|
152
|
+
from myapp.pipeline import run_pipeline
|
|
153
|
+
|
|
154
|
+
lh = Lakehouse()
|
|
155
|
+
run_pipeline(lh)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Reference
|
|
161
|
+
|
|
162
|
+
### `Lakehouse`
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
from laken import Lakehouse
|
|
166
|
+
|
|
167
|
+
lh = Lakehouse()
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
For tests or scripts that must pin a backend:
|
|
171
|
+
|
|
172
|
+
```python
|
|
173
|
+
from laken import FabricLakehouse, LocalLakehouse
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
**Tables** — use `schema.table` to target a schema; a bare name is passed through to Spark
|
|
177
|
+
and Fabric resolves it (typically the default `dbo` schema on a schema-enabled lakehouse).
|
|
178
|
+
`mode` is `"overwrite"` or `"append"`.
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
lh.write_table(df, "products")
|
|
182
|
+
lh.write_table(df, "marketing.products", mode="append")
|
|
183
|
+
|
|
184
|
+
df = lh.read_table("products") # Spark
|
|
185
|
+
df = lh.read_table("products", as_="pandas")
|
|
186
|
+
df = lh.read_table("marketing.products", as_="polars")
|
|
187
|
+
|
|
188
|
+
lh.list_tables()
|
|
189
|
+
lh.table_exists("marketing.products")
|
|
190
|
+
lh.drop_table("marketing.products")
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
**Files** — local paths under `.laken/workspace/Files`; in Fabric, under the lakehouse
|
|
194
|
+
`Files/` area.
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
lh.write_file(df, "exports/summary.parquet")
|
|
198
|
+
lh.read_file("exports/summary.parquet", as_="pandas")
|
|
199
|
+
lh.list_files("exports")
|
|
200
|
+
lh.file_exists("exports/summary.parquet")
|
|
201
|
+
lh.delete_file("exports/summary.parquet")
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
**Warehouse tables** — Spark `synapsesql` in Fabric; local parquet stand-in for tests.
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
lh.load_table_from_warehouse("SalesOrderHeader", "SalesWarehouse", as_="pandas")
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
**Other lakehouses** — defaults come from notebook context in Fabric; override locally
|
|
211
|
+
or in notebooks:
|
|
212
|
+
|
|
213
|
+
```python
|
|
214
|
+
lh = Lakehouse(lakehouse="Sales_LH")
|
|
215
|
+
lh.read_table("marketing.products", as_="pandas")
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### CLI
|
|
219
|
+
|
|
220
|
+
```text
|
|
221
|
+
laken deploy [--workspace-id <id>] [--environment-id <id>]
|
|
222
|
+
laken status
|
|
223
|
+
laken refresh <table>
|
|
224
|
+
laken reset <table>
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
`laken deploy` builds the wheel from your repo's `pyproject.toml`, uploads it to a Fabric
|
|
228
|
+
Environment, and publishes it so notebooks can import your package.
|
|
229
|
+
|
|
230
|
+
`laken status`, `laken refresh`, and `laken reset` manage the local `.laken/` cache on your
|
|
231
|
+
laptop. They do not run inside Fabric notebooks.
|
|
232
|
+
|
|
233
|
+
`laken status` lists cached tables with state (`mirror`, `sample`, or `local`), the Fabric
|
|
234
|
+
source version when known, and notes such as staleness or sample size.
|
|
235
|
+
|
|
236
|
+
`laken refresh <table>` re-downloads a table from Fabric when it was originally cached
|
|
237
|
+
from Fabric. Local-only tables are left unchanged.
|
|
238
|
+
|
|
239
|
+
`laken reset <table>` discards local changes and re-fetches from Fabric. The table must
|
|
240
|
+
have been cached from Fabric first.
|
|
241
|
+
|
|
242
|
+
### Environment variables
|
|
243
|
+
|
|
244
|
+
| Variable | Purpose |
|
|
245
|
+
| :--- | :--- |
|
|
246
|
+
| `AZURE_TENANT_ID` | Auth (fetch + deploy) |
|
|
247
|
+
| `AZURE_CLIENT_ID` | Auth (fetch + deploy) |
|
|
248
|
+
| `AZURE_CLIENT_SECRET` | Auth (fetch + deploy) |
|
|
249
|
+
| `FABRIC_WORKSPACE_NAME` | Local table fetch |
|
|
250
|
+
| `FABRIC_LAKEHOUSE_NAME` | Local table fetch |
|
|
251
|
+
| `FABRIC_WORKSPACE_ID` | OneLake paths; required for deploy |
|
|
252
|
+
| `FABRIC_LAKEHOUSE_ID` | OneLake paths |
|
|
253
|
+
| `FABRIC_ENVIRONMENT_ID` | Deploy target |
|
|
254
|
+
|
|
255
|
+
`AZURE_TENANT_ID`, `AZURE_CLIENT_ID`, and `AZURE_CLIENT_SECRET` are credentials from an
|
|
256
|
+
Azure service principal.
|
|
257
|
+
|
|
258
|
+
`FABRIC_WORKSPACE_NAME`, `FABRIC_LAKEHOUSE_NAME`, `FABRIC_WORKSPACE_ID`,
|
|
259
|
+
`FABRIC_LAKEHOUSE_ID`, and `FABRIC_ENVIRONMENT_ID` can be read from a Fabric notebook with
|
|
260
|
+
`notebookutils`:
|
|
261
|
+
|
|
262
|
+
```python
|
|
263
|
+
import notebookutils
|
|
264
|
+
|
|
265
|
+
ctx = notebookutils.runtime.context
|
|
266
|
+
print(ctx.get("currentWorkspaceName"))
|
|
267
|
+
print(ctx.get("currentWorkspaceId"))
|
|
268
|
+
print(ctx.get("defaultLakehouseName"))
|
|
269
|
+
print(ctx.get("defaultLakehouseId"))
|
|
270
|
+
print(ctx.get("environmentId"))
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
Deploy expects `pyproject.toml` at the repo root, a buildable application wheel, and a
|
|
274
|
+
Fabric environment with a compatible Python/Spark runtime.
|
|
275
|
+
|
|
276
|
+
### Local vs Fabric
|
|
277
|
+
|
|
278
|
+
| Class | Where | Storage | Reads | Writes |
|
|
279
|
+
| :--- | :--- | :--- | :--- | :--- |
|
|
280
|
+
| `Lakehouse` | Auto-detects notebook context | Fabric if available, else `.laken/` Delta | Local: Fabric → cache; Fabric: attached lakehouse | Local: `.laken/` only; Fabric: attached lakehouse |
|
|
281
|
+
| `LocalLakehouse` | Laptop / CI | `.laken/workspace/` | Cached Delta and local tables | Local only; not pushed to Fabric |
|
|
282
|
+
| `FabricLakehouse` | Fabric notebook | Attached lakehouse | Spark/Delta on attached lakehouse | Delta tables on attached lakehouse |
|
|
283
|
+
|
|
284
|
+
First local read of a Fabric table fetches and caches Delta under `.laken/`. If Fabric
|
|
285
|
+
changes, `laken` warns and keeps the cache until you run `laken refresh <table>`. Large
|
|
286
|
+
tables may cache as a fixed-size sample.
|
|
287
|
+
|
|
288
|
+
---
|
|
289
|
+
|
|
290
|
+
## Development
|
|
291
|
+
|
|
292
|
+
Contributions are welcome. To work on this package:
|
|
293
|
+
|
|
294
|
+
```bash
|
|
295
|
+
uv sync
|
|
296
|
+
uv run pytest
|
|
297
|
+
uv run ruff check
|
|
298
|
+
```
|
laken-0.1.0/README.md
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# laken
|
|
4
|
+
|
|
5
|
+
**The missing local development workflow for Microsoft Fabric.**
|
|
6
|
+
|
|
7
|
+
[Python 3.11+](https://www.python.org/downloads/)
|
|
8
|
+
[PyPI](https://pypi.org/project/laken/)
|
|
9
|
+
[Microsoft Fabric](https://www.microsoft.com/microsoft-fabric)
|
|
10
|
+
|
|
11
|
+
</div>
|
|
12
|
+
|
|
13
|
+
<br>
|
|
14
|
+
|
|
15
|
+
**laken** lets you develop Python code for Fabric locally, using the tooling you already trust.
|
|
16
|
+
|
|
17
|
+
Write local code against real lakehouse data. The same code runs in Fabric without modification.
|
|
18
|
+
|
|
19
|
+
When you're ready, `laken deploy` packages your project, publishes it to Fabric, and makes it
|
|
20
|
+
available to your Fabric notebooks.
|
|
21
|
+
|
|
22
|
+
Keep your code modular, your notebooks thin, and your local workflow intact.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
Install [uv](https://docs.astral.sh/uv/getting-started/installation/) if needed, then add
|
|
29
|
+
`laken`:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
uv add laken
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install laken
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Deploy uses [uv](https://docs.astral.sh/uv/getting-started/installation/) to build your
|
|
40
|
+
wheel before publishing to a Fabric environment.
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## Develop against your Fabric lakehouse
|
|
45
|
+
|
|
46
|
+
Set your credentials, select your workspace and lakehouse:
|
|
47
|
+
|
|
48
|
+
```env
|
|
49
|
+
AZURE_TENANT_ID=...
|
|
50
|
+
AZURE_CLIENT_ID=...
|
|
51
|
+
AZURE_CLIENT_SECRET=...
|
|
52
|
+
FABRIC_WORKSPACE_NAME=MyWorkspace
|
|
53
|
+
FABRIC_LAKEHOUSE_NAME=MyLakehouse
|
|
54
|
+
FABRIC_WORKSPACE_ID=...
|
|
55
|
+
FABRIC_LAKEHOUSE_ID=...
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from laken import Lakehouse
|
|
60
|
+
|
|
61
|
+
lh = Lakehouse()
|
|
62
|
+
products = lh.read_table("marketing.products", as_="pandas")
|
|
63
|
+
|
|
64
|
+
lh.write_table(products, "staging.products_snapshot")
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
`Lakehouse` detects when it is running locally and when it is running inside Fabric.
|
|
68
|
+
|
|
69
|
+
Locally, the first `read_table` for a Fabric table pulls from OneLake and caches it under
|
|
70
|
+
`.laken/` as Delta; later reads use the cache. In a Fabric notebook, the same code reads
|
|
71
|
+
from your attached lakehouse.
|
|
72
|
+
|
|
73
|
+
Local writes stay under `.laken/` and do not sync to Fabric; in Fabric, writes persist to
|
|
74
|
+
tables on the attached lakehouse.
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Deploy to Fabric
|
|
79
|
+
|
|
80
|
+
Structure your local code as a Python project using the standard
|
|
81
|
+
[src layout](https://packaging.python.org/en/latest/discussions/src-layout-vs-flat-layout/):
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
myapp/
|
|
85
|
+
├── pyproject.toml # [project] name = "myapp"
|
|
86
|
+
├── src/
|
|
87
|
+
│ └── myapp/
|
|
88
|
+
│ ├── __init__.py
|
|
89
|
+
│ └── pipeline.py
|
|
90
|
+
└── .env
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Add `laken` to your project dependencies.
|
|
94
|
+
|
|
95
|
+
See the
|
|
96
|
+
[Python packaging guide](https://packaging.python.org/en/latest/tutorials/packaging-projects/)
|
|
97
|
+
if you are setting this up for the first time.
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
# src/myapp/pipeline.py
|
|
101
|
+
import pandas as pd
|
|
102
|
+
|
|
103
|
+
from laken import Lakehouse
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def run_pipeline(lh: Lakehouse) -> None:
|
|
107
|
+
products = lh.read_table("marketing.products", as_="pandas")
|
|
108
|
+
summary = products.groupby("category", as_index=False)["amount"].sum()
|
|
109
|
+
lh.write_table(summary, "staging.product_summary")
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
When you are ready, `laken deploy` builds your package and loads it into your specified
|
|
113
|
+
Fabric Environment.
|
|
114
|
+
|
|
115
|
+
Deploy credentials (`.env` or shell):
|
|
116
|
+
|
|
117
|
+
```env
|
|
118
|
+
AZURE_TENANT_ID=...
|
|
119
|
+
AZURE_CLIENT_ID=...
|
|
120
|
+
AZURE_CLIENT_SECRET=...
|
|
121
|
+
FABRIC_WORKSPACE_ID=...
|
|
122
|
+
FABRIC_ENVIRONMENT_ID=...
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
From the repo root:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
laken deploy
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
In a Fabric notebook:
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from laken import Lakehouse
|
|
135
|
+
from myapp.pipeline import run_pipeline
|
|
136
|
+
|
|
137
|
+
lh = Lakehouse()
|
|
138
|
+
run_pipeline(lh)
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Reference
|
|
144
|
+
|
|
145
|
+
### `Lakehouse`
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from laken import Lakehouse
|
|
149
|
+
|
|
150
|
+
lh = Lakehouse()
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
For tests or scripts that must pin a backend:
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from laken import FabricLakehouse, LocalLakehouse
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
**Tables** — use `schema.table` to target a schema; a bare name is passed through to Spark
|
|
160
|
+
and Fabric resolves it (typically the default `dbo` schema on a schema-enabled lakehouse).
|
|
161
|
+
`mode` is `"overwrite"` or `"append"`.
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
lh.write_table(df, "products")
|
|
165
|
+
lh.write_table(df, "marketing.products", mode="append")
|
|
166
|
+
|
|
167
|
+
df = lh.read_table("products") # Spark
|
|
168
|
+
df = lh.read_table("products", as_="pandas")
|
|
169
|
+
df = lh.read_table("marketing.products", as_="polars")
|
|
170
|
+
|
|
171
|
+
lh.list_tables()
|
|
172
|
+
lh.table_exists("marketing.products")
|
|
173
|
+
lh.drop_table("marketing.products")
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
**Files** — local paths under `.laken/workspace/Files`; in Fabric, under the lakehouse
|
|
177
|
+
`Files/` area.
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
lh.write_file(df, "exports/summary.parquet")
|
|
181
|
+
lh.read_file("exports/summary.parquet", as_="pandas")
|
|
182
|
+
lh.list_files("exports")
|
|
183
|
+
lh.file_exists("exports/summary.parquet")
|
|
184
|
+
lh.delete_file("exports/summary.parquet")
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
**Warehouse tables** — Spark `synapsesql` in Fabric; local parquet stand-in for tests.
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
lh.load_table_from_warehouse("SalesOrderHeader", "SalesWarehouse", as_="pandas")
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
**Other lakehouses** — defaults come from notebook context in Fabric; override locally
|
|
194
|
+
or in notebooks:
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
lh = Lakehouse(lakehouse="Sales_LH")
|
|
198
|
+
lh.read_table("marketing.products", as_="pandas")
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### CLI
|
|
202
|
+
|
|
203
|
+
```text
|
|
204
|
+
laken deploy [--workspace-id <id>] [--environment-id <id>]
|
|
205
|
+
laken status
|
|
206
|
+
laken refresh <table>
|
|
207
|
+
laken reset <table>
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
`laken deploy` builds the wheel from your repo's `pyproject.toml`, uploads it to a Fabric
|
|
211
|
+
Environment, and publishes it so notebooks can import your package.
|
|
212
|
+
|
|
213
|
+
`laken status`, `laken refresh`, and `laken reset` manage the local `.laken/` cache on your
|
|
214
|
+
laptop. They do not run inside Fabric notebooks.
|
|
215
|
+
|
|
216
|
+
`laken status` lists cached tables with state (`mirror`, `sample`, or `local`), the Fabric
|
|
217
|
+
source version when known, and notes such as staleness or sample size.
|
|
218
|
+
|
|
219
|
+
`laken refresh <table>` re-downloads a table from Fabric when it was originally cached
|
|
220
|
+
from Fabric. Local-only tables are left unchanged.
|
|
221
|
+
|
|
222
|
+
`laken reset <table>` discards local changes and re-fetches from Fabric. The table must
|
|
223
|
+
have been cached from Fabric first.
|
|
224
|
+
|
|
225
|
+
### Environment variables
|
|
226
|
+
|
|
227
|
+
| Variable | Purpose |
|
|
228
|
+
| :--- | :--- |
|
|
229
|
+
| `AZURE_TENANT_ID` | Auth (fetch + deploy) |
|
|
230
|
+
| `AZURE_CLIENT_ID` | Auth (fetch + deploy) |
|
|
231
|
+
| `AZURE_CLIENT_SECRET` | Auth (fetch + deploy) |
|
|
232
|
+
| `FABRIC_WORKSPACE_NAME` | Local table fetch |
|
|
233
|
+
| `FABRIC_LAKEHOUSE_NAME` | Local table fetch |
|
|
234
|
+
| `FABRIC_WORKSPACE_ID` | OneLake paths; required for deploy |
|
|
235
|
+
| `FABRIC_LAKEHOUSE_ID` | OneLake paths |
|
|
236
|
+
| `FABRIC_ENVIRONMENT_ID` | Deploy target |
|
|
237
|
+
|
|
238
|
+
`AZURE_TENANT_ID`, `AZURE_CLIENT_ID`, and `AZURE_CLIENT_SECRET` are credentials from an
|
|
239
|
+
Azure service principal.
|
|
240
|
+
|
|
241
|
+
`FABRIC_WORKSPACE_NAME`, `FABRIC_LAKEHOUSE_NAME`, `FABRIC_WORKSPACE_ID`,
|
|
242
|
+
`FABRIC_LAKEHOUSE_ID`, and `FABRIC_ENVIRONMENT_ID` can be read from a Fabric notebook with
|
|
243
|
+
`notebookutils`:
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
import notebookutils
|
|
247
|
+
|
|
248
|
+
ctx = notebookutils.runtime.context
|
|
249
|
+
print(ctx.get("currentWorkspaceName"))
|
|
250
|
+
print(ctx.get("currentWorkspaceId"))
|
|
251
|
+
print(ctx.get("defaultLakehouseName"))
|
|
252
|
+
print(ctx.get("defaultLakehouseId"))
|
|
253
|
+
print(ctx.get("environmentId"))
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
Deploy expects `pyproject.toml` at the repo root, a buildable application wheel, and a
|
|
257
|
+
Fabric environment with a compatible Python/Spark runtime.
|
|
258
|
+
|
|
259
|
+
### Local vs Fabric
|
|
260
|
+
|
|
261
|
+
| Class | Where | Storage | Reads | Writes |
|
|
262
|
+
| :--- | :--- | :--- | :--- | :--- |
|
|
263
|
+
| `Lakehouse` | Auto-detects notebook context | Fabric if available, else `.laken/` Delta | Local: Fabric → cache; Fabric: attached lakehouse | Local: `.laken/` only; Fabric: attached lakehouse |
|
|
264
|
+
| `LocalLakehouse` | Laptop / CI | `.laken/workspace/` | Cached Delta and local tables | Local only; not pushed to Fabric |
|
|
265
|
+
| `FabricLakehouse` | Fabric notebook | Attached lakehouse | Spark/Delta on attached lakehouse | Delta tables on attached lakehouse |
|
|
266
|
+
|
|
267
|
+
First local read of a Fabric table fetches and caches Delta under `.laken/`. If Fabric
|
|
268
|
+
changes, `laken` warns and keeps the cache until you run `laken refresh <table>`. Large
|
|
269
|
+
tables may cache as a fixed-size sample.
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## Development
|
|
274
|
+
|
|
275
|
+
Contributions are welcome. To work on this package:
|
|
276
|
+
|
|
277
|
+
```bash
|
|
278
|
+
uv sync
|
|
279
|
+
uv run pytest
|
|
280
|
+
uv run ruff check
|
|
281
|
+
```
|
laken-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "laken"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Local and Fabric lakehouse abstraction for modular, testable data code"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Cody VanZandt", email = "cody.a.vanzandt@gmail.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"pandas>=3.0.3",
|
|
12
|
+
"polars>=1.40.1",
|
|
13
|
+
"pyarrow>=24.0.0",
|
|
14
|
+
"pyspark>=3.5",
|
|
15
|
+
"packaging>=24.0",
|
|
16
|
+
"python-dotenv>=1.0",
|
|
17
|
+
"requests>=2.32",
|
|
18
|
+
"typer>=0.15",
|
|
19
|
+
"deltalake>=1.6.0",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.scripts]
|
|
23
|
+
laken = "laken.cli:app"
|
|
24
|
+
|
|
25
|
+
[build-system]
|
|
26
|
+
requires = ["hatchling"]
|
|
27
|
+
build-backend = "hatchling.build"
|
|
28
|
+
|
|
29
|
+
[tool.hatch.build.targets.sdist]
|
|
30
|
+
only-include = ["src/laken"]
|
|
31
|
+
|
|
32
|
+
[dependency-groups]
|
|
33
|
+
dev = [
|
|
34
|
+
"pytest>=9.0.3",
|
|
35
|
+
"ruff>=0.15.13",
|
|
36
|
+
"twine>=6.1.0",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[tool.ruff]
|
|
40
|
+
line-length = 100
|
|
41
|
+
target-version = "py311"
|
|
42
|
+
|
|
43
|
+
[tool.ruff.lint]
|
|
44
|
+
select = ["E", "F", "I", "UP", "B", "SIM"]
|
|
45
|
+
|
|
46
|
+
[tool.pytest.ini_options]
|
|
47
|
+
testpaths = ["tests"]
|
|
48
|
+
pythonpath = ["tests"]
|
|
49
|
+
markers = [
|
|
50
|
+
"integration: live Fabric tests (requires AZURE_* and FABRIC_* env vars)",
|
|
51
|
+
]
|
laken-0.1.0/src/laken/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from laken.fabric import FabricLakehouse
|
|
2
|
+
from laken.lakehouse import Lakehouse
|
|
3
|
+
from laken.local import LocalLakehouse
|
|
4
|
+
from laken.protocol import LakehouseProtocol
|
|
5
|
+
from laken.types import DfKind, InputFrame, OutputFrame, WriteMode
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def read_table(name: str, *, as_: DfKind = "spark") -> OutputFrame:
    """Read a table through a freshly constructed ``Lakehouse``.

    Module-level convenience wrapper: a new ``Lakehouse()`` is built on every
    call and the request is forwarded to its ``read_table`` method.

    Args:
        name: Table name, forwarded unchanged.
        as_: Requested frame kind, forwarded as the ``as_`` keyword.
            Defaults to ``"spark"``.

    Returns:
        Whatever ``Lakehouse.read_table`` returns for the given arguments.
    """
    lakehouse = Lakehouse()
    return lakehouse.read_table(name, as_=as_)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def write_table(name: str, df: InputFrame, *, mode: WriteMode = "overwrite") -> None:
    """Write a frame to a table through a freshly constructed ``Lakehouse``.

    Module-level convenience wrapper: a new ``Lakehouse()`` is built on every
    call and the request is forwarded to its ``write_table`` method.

    Args:
        name: Target table name.
        df: Frame to write.
        mode: Write mode, forwarded as the ``mode`` keyword.
            Defaults to ``"overwrite"``.
    """
    lakehouse = Lakehouse()
    # This wrapper takes (name, df); the method is called with (df, name).
    lakehouse.write_table(df, name, mode=mode)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Public interface of the ``laken`` package.
#
# The type aliases imported from ``laken.types`` appear in the signatures of
# ``read_table`` / ``write_table`` above, so they are listed here as well:
# for a package that ships ``py.typed``, type checkers treat a plain
# ``from x import Name`` in ``__init__`` as private unless the name is
# included in ``__all__``.
__all__ = [
    "DfKind",
    "FabricLakehouse",
    "InputFrame",
    "Lakehouse",
    "LakehouseProtocol",
    "LocalLakehouse",
    "OutputFrame",
    "WriteMode",
    "read_table",
    "write_table",
]
|