databricks-bundle-decorators 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks_bundle_decorators-0.1.2/PKG-INFO +381 -0
- databricks_bundle_decorators-0.1.2/README.md +367 -0
- databricks_bundle_decorators-0.1.2/pyproject.toml +43 -0
- databricks_bundle_decorators-0.1.2/src/databricks_bundle_decorators/__init__.py +55 -0
- databricks_bundle_decorators-0.1.2/src/databricks_bundle_decorators/cli.py +253 -0
- databricks_bundle_decorators-0.1.2/src/databricks_bundle_decorators/codegen.py +107 -0
- databricks_bundle_decorators-0.1.2/src/databricks_bundle_decorators/context.py +31 -0
- databricks_bundle_decorators-0.1.2/src/databricks_bundle_decorators/decorators.py +316 -0
- databricks_bundle_decorators-0.1.2/src/databricks_bundle_decorators/discovery.py +27 -0
- databricks_bundle_decorators-0.1.2/src/databricks_bundle_decorators/io_manager.py +71 -0
- databricks_bundle_decorators-0.1.2/src/databricks_bundle_decorators/registry.py +83 -0
- databricks_bundle_decorators-0.1.2/src/databricks_bundle_decorators/runtime.py +139 -0
- databricks_bundle_decorators-0.1.2/src/databricks_bundle_decorators/sdk_types.py +194 -0
- databricks_bundle_decorators-0.1.2/src/databricks_bundle_decorators/task_values.py +72 -0
@@ -0,0 +1,381 @@

Metadata-Version: 2.3
Name: databricks-bundle-decorators
Version: 0.1.2
Summary: Decorator-based framework for defining Databricks jobs and tasks as Python code.
Author: boccileonardo
Author-email: boccileonardo <leonardobocci99@hotmail.com>
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Requires-Dist: databricks-bundles>=0.275
Requires-Python: >=3.12
Description-Content-Type: text/markdown

# databricks-bundle-decorators

Decorator-based framework for defining Databricks jobs and tasks as Python code. Define pipelines using `@task`, `@job`, and `job_cluster()` — they compile into [Databricks Asset Bundle](https://docs.databricks.com/aws/en/dev-tools/bundles/python/) resources.

## Why databricks-bundle-decorators?

Writing Databricks jobs in raw YAML is tedious and disconnects task logic from orchestration configuration. databricks-bundle-decorators lets you express both in Python:

- **Airflow TaskFlow-inspired pattern** — define `@task` functions inside a `@job` body; dependencies are captured automatically from call arguments.
- **IoManager pattern** — large data (DataFrames, datasets) flows through permanent storage (Delta tables, Unity Catalog volumes), enabling a multi-hop architecture.
- **Explicit task values** — small scalars (`str`, `int`, `float`, `bool`) can be passed between tasks via `set_task_value` / `get_task_value`, like you would with Airflow XComs.
- **Deploy-time codegen** — when you run `databricks bundle deploy`, the framework imports your Python files, discovers all `@job`/`@task` definitions, and generates Databricks Job configurations. The result is a Databricks job with all tasks and dependencies set up.
- **Runtime dispatch** — when Databricks runs the job (on schedule or manually), each task executes on a cluster via the `dbxdec-run` entry point, which loads upstream data through IoManagers and calls your task function.

## Installation

```bash
uv add databricks-bundle-decorators
```

## Quickstart

### 1. Scaffold your pipeline project

```bash
uv init my-pipeline && cd my-pipeline
uv add databricks-bundle-decorators
uv run dbxdec init
```

`dbxdec init` creates:

| File | Purpose |
|------|---------|
| `resources/__init__.py` | `load_resources()` entry point for `databricks bundle deploy` |
| `src/<package>/pipelines/__init__.py` | Auto-discovery module that imports all pipeline files |
| `src/<package>/pipelines/example.py` | Starter pipeline with `@task`, `@job`, `job_cluster()` |
| `databricks.yaml` | Databricks Asset Bundle configuration (if not present) |
| `pyproject.toml` | Updated with the pipeline package entry point |
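
The generated `resources/__init__.py` is the hook the Databricks CLI calls at deploy time. Below is a hedged sketch of its general shape, assuming the `load_resources(bundle)` convention from databricks-bundles referenced in the table; the `build_resources()` helper is a hypothetical placeholder, since this README does not show the generated file's contents.

```python
# resources/__init__.py: a hedged sketch of the entry point's shape only.
# load_resources(bundle) follows the databricks-bundles convention; the
# codegen.build_resources() call is a hypothetical placeholder because the
# README does not show the generated body.
from databricks.bundles.core import Bundle, Resources


def load_resources(bundle: Bundle) -> Resources:
    # Importing the pipeline package runs the @job/@task decorators, which
    # register jobs, tasks, and clusters in the framework's registry.
    import my_pipeline.pipelines  # noqa: F401

    # Hypothetical helper: turn the registry into bundle resources.
    from databricks_bundle_decorators import codegen

    return codegen.build_resources()
```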

### 2. Define your pipeline

```python
# src/my_pipeline/pipelines/github_events.py

import polars as pl
import requests

from databricks_bundle_decorators import IoManager, InputContext, OutputContext, params
from databricks_bundle_decorators import job, job_cluster, task
from databricks_bundle_decorators.task_values import set_task_value


class DeltaIoManager(IoManager):
    def __init__(self, catalog: str, schema: str):
        self.catalog = catalog
        self.schema = schema

    def store(self, context: OutputContext, obj) -> None:
        table = f"{self.catalog}.{self.schema}.{context.task_key}"
        obj.write_delta(table, mode="overwrite")

    def load(self, context: InputContext):
        table = f"{self.catalog}.{self.schema}.{context.upstream_task_key}"
        return pl.read_delta(table)


staging_io = DeltaIoManager(catalog="main", schema="staging")

small_cluster = job_cluster(
    name="small_cluster",
    spark_version="16.4.x-scala2.12",
    node_type_id="Standard_E8ds_v4",
    num_workers=1,
)


@job(
    tags={"source": "github", "type": "api"},
    schedule="0 * * * *",
    params={"url": "https://api.github.com/events"},
    cluster="small_cluster",
)
def github_events():
    @task(io_manager=staging_io)
    def extract():
        r = requests.get(params["url"])
        df = pl.DataFrame(r.json())
        set_task_value("row_count", len(df))
        return df

    @task
    def transform(raw_df):
        print(raw_df.head(10))

    df = extract()
    transform(df)
```

### 3. Deploy

```bash
databricks bundle deploy --target dev
```

## How It Works

### Deploy time (`databricks bundle deploy`)

When you run `databricks bundle deploy`, the Databricks CLI imports your Python pipeline files. This triggers the `@job` and `@task` decorators, which **register** your tasks and their dependencies into an internal DAG — no task code actually runs yet. The framework then generates Databricks Job definitions from this DAG and uploads them to your workspace.

The result: a Databricks Job appears in the UI with all your tasks, their dependency edges, cluster configs, and parameters fully wired up.

```
your_pipeline.py
  @job / @task / job_cluster()
        ▼
Framework builds task DAG from decorator metadata
        ▼
codegen → Databricks Job definition
        ▼
databricks bundle deploy → Job created in workspace
```

### Runtime (when the job runs on Databricks)

When the job is triggered (on schedule or manually), Databricks launches each task as a separate `python_wheel_task` on a cluster. For each task:

1. The `dbxdec-run` entry point starts.
2. It looks up the upstream tasks and calls `IoManager.load()` to fetch their outputs.
3. It injects the loaded data as arguments to your task function and calls it.
4. If the task has an IoManager, it calls `IoManager.store()` to persist the return value for downstream tasks.

```
Databricks triggers job
  → launches each task as python_wheel_task
        ▼
dbxdec-run entry point
        ▼
IoManager.load() upstream data → call your task function
        ▼
IoManager.store() return value for downstream tasks
```
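
Conceptually, one task execution follows the four steps above. The sketch below is a hedged illustration only: `resolve_task` and `upstream_of` are hypothetical stand-ins for the framework's registry and DAG lookups, and constructing the context objects directly is likewise an assumption made for illustration, not the actual internals of `runtime.py`.

```python
# A conceptual sketch of one dbxdec-run task execution (steps 1-4 above).
# resolve_task / upstream_of and the task attributes used here are
# hypothetical stand-ins, shown only to make the dispatch flow concrete.
from databricks_bundle_decorators import InputContext, OutputContext


def run_one_task(job_name: str, task_key: str, run_id: str) -> None:
    task = resolve_task(job_name, task_key)   # hypothetical registry lookup
    inputs = []
    for upstream in upstream_of(task):        # hypothetical DAG traversal
        ctx = InputContext(
            job_name=job_name,
            task_key=task_key,
            upstream_task_key=upstream.key,
            run_id=run_id,
        )
        inputs.append(upstream.io_manager.load(ctx))  # step 2: fetch upstream output

    result = task.fn(*inputs)                 # step 3: call the user's function

    if task.io_manager is not None:           # step 4: persist for downstream tasks
        out_ctx = OutputContext(job_name=job_name, task_key=task_key, run_id=run_id)
        task.io_manager.store(out_ctx, result)
```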

## API Reference

### `@task`

Registers a function as a Databricks task.

```python
@task
def my_task():
    ...

@task(io_manager=my_io_manager)
def my_task_with_io():
    return some_dataframe
```

| Parameter | Type | Description |
|-----------|------|-------------|
| `io_manager` | `IoManager \| None` | Controls how the return value is persisted and loaded by downstream tasks. |
| `**kwargs` | `TaskConfig` | SDK-native `Task` fields (`max_retries`, `timeout_seconds`, `retry_on_timeout`, etc.). |
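
SDK-native fields pass straight through as keyword arguments. The example below is a sketch using the field names from the table above; the specific values are illustrative.

```python
# Hedged sketch: forwarding SDK-native Task fields named in the table above.
@task(max_retries=3, timeout_seconds=3600, retry_on_timeout=True)
def flaky_extract():
    ...
```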

### `@job`

Registers a function as a Databricks job.

The `@job` body runs **once when Python imports the file** (not on Databricks). Its purpose is to let the framework discover which tasks exist and how they depend on each other. Inside the body, `@task` functions don't execute your business logic — they return lightweight `TaskProxy` objects that record dependency edges. Think of the `@job` body as a *declaration*, not execution.

```python
@job(
    tags={"team": "data-eng"},
    schedule="0 * * * *",
    params={"url": "https://api.example.com"},
    cluster="small_cluster",
)
def my_job():
    @task
    def extract(): ...  # Not called yet — just registered

    @task
    def transform(data): ...  # Not called yet — just registered

    data = extract()    # Returns a TaskProxy (not real data)
    transform(data)     # Records: transform depends on extract
```

When this file is imported during `databricks bundle deploy`, the framework sees the DAG: `extract → transform`. Your actual `extract()` and `transform()` code only runs later when Databricks executes the job.

| Parameter | Type | Description |
|-----------|------|-------------|
| `params` | `dict[str, str] \| None` | Default job-level parameters, accessible via `from databricks_bundle_decorators import params`. |
| `cluster` | `str \| None` | Name of a `job_cluster()` to use as the shared job cluster. |
| `**kwargs` | `JobConfig` | SDK-native `Job` fields (`tags`, `schedule`, `max_concurrent_runs`, etc.). |

**Task namespacing:** Tasks inside a `@job` body are registered under qualified keys (`job_name.task_name`), preventing name collisions across jobs.
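
A hedged illustration of the effect: two jobs can each define a task named `extract` without colliding. Calling `@job()` with no arguments is assumed here only to keep the example short.

```python
# Hypothetical minimal jobs, shown only to illustrate key qualification.
@job()
def daily_ingest():
    @task
    def extract(): ...   # registered as "daily_ingest.extract"

    extract()


@job()
def weekly_backfill():
    @task
    def extract(): ...   # registered as "weekly_backfill.extract"

    extract()
```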

### `job_cluster()`

Registers a reusable ephemeral job-cluster configuration and returns its name.

```python
gpu_cluster = job_cluster(
    name="gpu_cluster",
    spark_version="14.0.x-gpu-ml-scala2.12",
    node_type_id="Standard_NC6s_v3",
    num_workers=4,
)
```

| Parameter | Type | Description |
|-----------|------|-------------|
| `name` | `str` | Cluster name, referenced from `@job(cluster=…)`. |
| `**kwargs` | `ClusterConfig` | SDK-native `ClusterSpec` fields (`spark_version`, `node_type_id`, `num_workers`, etc.). |
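
Because `job_cluster()` returns the cluster's name, the returned value can be handed to `@job(cluster=...)` directly. Continuing the snippet above, this is a hedged sketch; passing the returned string instead of repeating the literal name is an assumption based on the described return type.

```python
# Hedged sketch: reuse the name returned by job_cluster() above;
# this should be equivalent to writing cluster="gpu_cluster".
@job(cluster=gpu_cluster)
def training_job():
    ...
```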

### IoManager

Abstract base class for inter-task data persistence. The **producing task** declares its IoManager; downstream tasks receive data automatically.

```
IoManager (attached to producer)
┌──────────┐
│ store()  │ ← called after producer runs, persists return value
│ load()   │ ← called before consumer runs, injects data as argument
└──────────┘
```

```python
import polars as pl

from databricks_bundle_decorators import IoManager, OutputContext, InputContext


class DeltaIoManager(IoManager):
    def __init__(self, catalog: str, schema: str):
        self.catalog = catalog
        self.schema = schema

    def store(self, context: OutputContext, obj) -> None:
        table = f"{self.catalog}.{self.schema}.{context.task_key}"
        obj.write_delta(table, mode="overwrite")

    def load(self, context: InputContext):
        table = f"{self.catalog}.{self.schema}.{context.upstream_task_key}"
        return pl.read_delta(table)
```

Downstream tasks are **storage-agnostic** — they receive plain Python objects and don't need to know the storage backend.

#### IoManager vs Task Values

| Mechanism | Use case | Size limit |
|-----------|----------|------------|
| `IoManager` | DataFrames, datasets, large objects | Unlimited (external storage) |
| `set_task_value` / `get_task_value` | Row counts, status flags, small strings | < 48 KB |

#### Context Objects

**`OutputContext`** (passed to `store()`): `job_name`, `task_key`, `run_id`

**`InputContext`** (passed to `load()`): `job_name`, `task_key`, `upstream_task_key`, `run_id`
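
These fields make it easy to derive per-task storage locations. The sketch below shows a hedged, volume-backed IoManager built only from the context fields listed above; the Unity Catalog volume path and the choice of Parquet via polars are illustrative, not part of the package.

```python
import polars as pl

from databricks_bundle_decorators import InputContext, IoManager, OutputContext


class VolumeIoManager(IoManager):
    """Illustrative only: persists task outputs as Parquet files in a UC volume."""

    def __init__(self, volume_root: str):
        self.volume_root = volume_root  # e.g. "/Volumes/main/staging/artifacts"

    def store(self, context: OutputContext, obj) -> None:
        # Path is derived from the OutputContext fields listed above.
        path = f"{self.volume_root}/{context.job_name}/{context.task_key}.parquet"
        obj.write_parquet(path)

    def load(self, context: InputContext):
        # Path is derived from the InputContext fields listed above.
        path = f"{self.volume_root}/{context.job_name}/{context.upstream_task_key}.parquet"
        return pl.read_parquet(path)
```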

### Task Values

For passing small scalar values between tasks without going through permanent storage:

```python
from databricks_bundle_decorators.task_values import set_task_value, get_task_value

@task
def producer():
    set_task_value("row_count", 42)

@task
def consumer():
    count = get_task_value("producer", "row_count")
```

Maps to `dbutils.jobs.taskValues` at runtime, with a local dict fallback for testing.

### Parameters

Job-level parameters are accessible via the global `params` dict:

```python
from databricks_bundle_decorators import params

@task
def my_task():
    url = params["url"]
```

### Pipeline Discovery

Pipeline packages register via [entry points](https://packaging.python.org/en/latest/specifications/entry-points/):

```toml
[project.entry-points."databricks_bundle_decorators.pipelines"]
my_pipeline = "my_pipeline.pipelines"
```

The referenced module should import all modules containing `@task`/`@job` decorators:

```python
# my_pipeline/pipelines/__init__.py
import importlib
import pkgutil

for _loader, _name, _is_pkg in pkgutil.walk_packages(__path__):
    importlib.import_module(f"{__name__}.{_name}")
```

### CLI

```bash
uv run dbxdec init   # Scaffold a pipeline project in the current directory
```

## Packaging Model

```
┌──────────────────────────────┐     ┌────────────────────────┐
│ databricks-bundle-decorators │     │ my-pipeline (repo)     │
│ (library, PyPI)              │◄────│                        │
│                              │     │ pyproject.toml         │
│ @task, @job, job_cluster()   │     │ src/my_pipeline/       │
│ IoManager ABC                │     │   pipelines/           │
│ codegen, runtime, discovery  │     │ resources/__init__.py  │
│ dbxdec CLI                   │     │ databricks.yaml        │
└──────────────────────────────┘     └────────────────────────┘
```

The framework is a reusable library. Pipeline repos contain only business logic — upgrading is a single dependency bump.

## Development

```bash
git clone https://github.com/<org>/databricks-bundle-decorators.git
cd databricks-bundle-decorators
uv sync
uv run pytest tests/ -v
```

## Project Structure

```
├── pyproject.toml
├── examples/
│   ├── databricks.yaml
│   ├── resources/__init__.py
│   └── example_pipeline.py
├── src/databricks_bundle_decorators/
│   ├── __init__.py      # Public API exports
│   ├── cli.py           # dbxdec init command
│   ├── codegen.py       # Registry → Job objects
│   ├── context.py       # Global params dict
│   ├── decorators.py    # @task, @job, job_cluster(), TaskProxy
│   ├── discovery.py     # Entry-point pipeline discovery
│   ├── io_manager.py    # IoManager ABC, OutputContext, InputContext
│   ├── registry.py      # Global registries, DuplicateResourceError
│   ├── runtime.py       # dbxdec-run entry point
│   ├── sdk_types.py     # JobConfig, TaskConfig, ClusterConfig TypedDicts
│   └── task_values.py   # set_task_value / get_task_value
└── tests/
```

## Release

See [RELEASING.md](RELEASING.md) for the PyPI release process.