databricks-bundle-decorators 0.1.2__py3-none-any.whl
- databricks_bundle_decorators/__init__.py +55 -0
- databricks_bundle_decorators/cli.py +253 -0
- databricks_bundle_decorators/codegen.py +107 -0
- databricks_bundle_decorators/context.py +31 -0
- databricks_bundle_decorators/decorators.py +316 -0
- databricks_bundle_decorators/discovery.py +27 -0
- databricks_bundle_decorators/io_manager.py +71 -0
- databricks_bundle_decorators/registry.py +83 -0
- databricks_bundle_decorators/runtime.py +139 -0
- databricks_bundle_decorators/sdk_types.py +194 -0
- databricks_bundle_decorators/task_values.py +72 -0
- databricks_bundle_decorators-0.1.2.dist-info/METADATA +381 -0
- databricks_bundle_decorators-0.1.2.dist-info/RECORD +15 -0
- databricks_bundle_decorators-0.1.2.dist-info/WHEEL +4 -0
- databricks_bundle_decorators-0.1.2.dist-info/entry_points.txt +4 -0
databricks_bundle_decorators/__init__.py
@@ -0,0 +1,55 @@
"""databricks-bundle-decorators – decorator-based Databricks job/task framework.

Public API
----------
Decorators:
    ``@task``, ``@job``

Cluster configuration:
    ``job_cluster()``

Data management:
    ``IoManager``, ``OutputContext``, ``InputContext``

Task values (small scalars):
    ``set_task_value``, ``get_task_value``

Job parameters:
    ``params``
"""

from databricks_bundle_decorators.context import params as params
from databricks_bundle_decorators.decorators import job as job
from databricks_bundle_decorators.decorators import job_cluster as job_cluster
from databricks_bundle_decorators.decorators import task as task
from databricks_bundle_decorators.discovery import (
    discover_pipelines as discover_pipelines,
)
from databricks_bundle_decorators.io_manager import InputContext as InputContext
from databricks_bundle_decorators.io_manager import IoManager as IoManager
from databricks_bundle_decorators.io_manager import OutputContext as OutputContext
from databricks_bundle_decorators.registry import (
    DuplicateResourceError as DuplicateResourceError,
)
from databricks_bundle_decorators.sdk_types import ClusterConfig as ClusterConfig
from databricks_bundle_decorators.sdk_types import JobConfig as JobConfig
from databricks_bundle_decorators.sdk_types import TaskConfig as TaskConfig
from databricks_bundle_decorators.task_values import get_task_value as get_task_value
from databricks_bundle_decorators.task_values import set_task_value as set_task_value

__all__ = [
    "task",
    "job",
    "job_cluster",
    "discover_pipelines",
    "IoManager",
    "OutputContext",
    "InputContext",
    "DuplicateResourceError",
    "ClusterConfig",
    "JobConfig",
    "TaskConfig",
    "set_task_value",
    "get_task_value",
    "params",
]
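The re-exports above are the entire user-facing surface. A minimal sketch of how they are meant to compose, inferred from the ``__upstream__`` parameters emitted in codegen.py further down — the exact argument-passing contract between tasks is an assumption here, and all names are illustrative:

from databricks_bundle_decorators import job, job_cluster, task

small = job_cluster(
    name="small",
    spark_version="14.3.x-scala2.12",
    node_type_id="Standard_DS3_v2",
    num_workers=1,
)


@task
def extract():
    return [1, 2, 3]  # return values travel between tasks via the IoManager


@task
def transform(rows):
    # `rows` would be re-loaded from extract's persisted output at runtime
    return [r * 2 for r in rows]


@job(cluster="small")
def etl():
    transform(extract())  # assumed to record a DAG edge, not execute locally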
databricks_bundle_decorators/cli.py
@@ -0,0 +1,253 @@
"""CLI for databricks-bundle-decorators.

Provides scaffolding commands for pipeline repositories.

Usage::

    uv run dbxdec init
"""

import argparse
import sys
import textwrap
from pathlib import Path

try:
    import tomllib
except ModuleNotFoundError:  # Python < 3.11
    import tomli as tomllib  # type: ignore[no-redef]


def _read_pyproject(cwd: Path) -> dict:
    """Read and parse pyproject.toml from *cwd*."""
    path = cwd / "pyproject.toml"
    if not path.exists():
        print(
            "Error: No pyproject.toml found in the current directory.", file=sys.stderr
        )
        print(
            "Run this command from the root of your pipeline project.", file=sys.stderr
        )
        sys.exit(1)
    return tomllib.loads(path.read_text())


def _detect_package_name(pyproject: dict) -> str:
    """Derive the Python import name from the project name in pyproject.toml."""
    name = pyproject.get("project", {}).get("name")
    if not name:
        print("Error: No [project].name found in pyproject.toml.", file=sys.stderr)
        sys.exit(1)
    return name.replace("-", "_")


def _detect_src_layout(cwd: Path, package_name: str) -> Path:
    """Return the package directory, detecting flat or src layout."""
    src_path = cwd / "src" / package_name
    flat_path = cwd / package_name
    if src_path.exists():
        return src_path
    if flat_path.exists():
        return flat_path
    # Default to src layout (will be created)
    return src_path


# --- File templates -------------------------------------------------------

_RESOURCES_INIT = '''\
"""Resource loader for ``databricks bundle deploy``.

Referenced from ``python.resources`` in ``databricks.yaml``::

    python:
      venv_path: .venv
      resources:
        - 'resources:load_resources'
"""

from databricks.bundles.core import Bundle, Resources


def load_resources(bundle: Bundle) -> Resources:
    """Entry-point called by ``databricks bundle deploy``."""
    from databricks_bundle_decorators.discovery import discover_pipelines
    from databricks_bundle_decorators.codegen import generate_resources

    discover_pipelines()

    resources = Resources()
    for key, job_resource in generate_resources().items():
        resources.add_resource(key, job_resource)
    return resources
'''

_PIPELINES_INIT = '''\
"""Pipeline auto-discovery.

Every .py module in this package is imported automatically, triggering
@task / @job / @job_cluster decorator registration.
"""


import importlib
import pkgutil

for _loader, _module_name, _is_pkg in pkgutil.walk_packages(__path__):
    importlib.import_module(f"{__name__}.{_module_name}")
'''

_DATABRICKS_YAML = """\
bundle:
  name: {project_name}

artifacts:
  {package_name}:
    type: whl
    build: uv build --wheel
    path: .

python:
  venv_path: .venv
  resources:
    - 'resources:load_resources'

targets:
  dev:
    mode: development
    workspace:
      host: https://<your-workspace>.azuredatabricks.net/
"""

_EXAMPLE_PIPELINE = '''\
"""Example pipeline – replace with your own tasks and jobs."""

from databricks_bundle_decorators import job, job_cluster, task, params


default_cluster = job_cluster(
    name="default_cluster",
    spark_version="14.3.x-scala2.12",
    node_type_id="Standard_DS3_v2",
    num_workers=2,
)


@task
def hello():
    print(f"Hello from databricks-bundle-decorators! url={params.get('url', 'N/A')}")


@job(
    params={"url": "https://example.com"},
    cluster="default_cluster",
)
def example_job():
    hello()
'''


# --- Init command ----------------------------------------------------------


def _cmd_init(args: argparse.Namespace) -> None:
    """Scaffold a new databricks-bundle-decorators pipeline project."""
    cwd = Path.cwd()
    pyproject = _read_pyproject(cwd)
    package_name = _detect_package_name(pyproject)
    project_name = pyproject["project"]["name"]
    pkg_dir = _detect_src_layout(cwd, package_name)

    created: list[str] = []
    skipped: list[str] = []

    def _write(path: Path, content: str) -> None:
        if path.exists():
            skipped.append(str(path.relative_to(cwd)))
            return
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(content)
        created.append(str(path.relative_to(cwd)))

    # 1. resources/__init__.py
    _write(cwd / "resources" / "__init__.py", _RESOURCES_INIT)

    # 2. pipelines/__init__.py (auto-discovery)
    _write(pkg_dir / "pipelines" / "__init__.py", _PIPELINES_INIT)

    # 3. Example pipeline
    _write(
        pkg_dir / "pipelines" / "example.py",
        _EXAMPLE_PIPELINE,
    )

    # 4. databricks.yaml
    _write(
        cwd / "databricks.yaml",
        _DATABRICKS_YAML.format(
            project_name=project_name,
            package_name=package_name,
        ),
    )

    # 5. Ensure package __init__.py exists
    _write(pkg_dir / "__init__.py", "")

    # --- Summary -----------------------------------------------------------
    print()
    print("databricks-bundle-decorators project initialized!")
    print()

    if created:
        print("Created:")
        for f in created:
            print(f"  {f}")

    if skipped:
        print("Skipped (already exist):")
        for f in skipped:
            print(f"  {f}")

    # --- Check for entry point in pyproject.toml ---------------------------
    entry_points = (
        pyproject.get("project", {})
        .get("entry-points", {})
        .get("databricks_bundle_decorators.pipelines", {})
    )
    if not entry_points:
        print()
        print("Next step: add the pipeline entry point to your pyproject.toml:")
        print()
        print(
            textwrap.dedent(f"""\
                [project.entry-points."databricks_bundle_decorators.pipelines"]
                {package_name} = "{package_name}.pipelines"
            """)
        )

    print("Done! Define your @task and @job functions in the pipelines/ directory.")


# --- Main ------------------------------------------------------------------


def main() -> None:
    parser = argparse.ArgumentParser(
        prog="dbxdec",
        description="databricks-bundle-decorators CLI",
    )
    subparsers = parser.add_subparsers(dest="command")

    subparsers.add_parser(
        "init",
        help="Scaffold a new databricks-bundle-decorators pipeline project",
    )

    args = parser.parse_args()

    if args.command == "init":
        _cmd_init(args)
    else:
        parser.print_help()
        sys.exit(1)
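For a project named ``my-pipelines`` with a src layout and no entry point configured yet, the init command would print roughly the following — a sketch assembled from the ``print`` calls above; the project name is hypothetical:

$ uv run dbxdec init

databricks-bundle-decorators project initialized!

Created:
  resources/__init__.py
  src/my_pipelines/pipelines/__init__.py
  src/my_pipelines/pipelines/example.py
  databricks.yaml
  src/my_pipelines/__init__.py

Next step: add the pipeline entry point to your pyproject.toml:

[project.entry-points."databricks_bundle_decorators.pipelines"]
my_pipelines = "my_pipelines.pipelines"

Done! Define your @task and @job functions in the pipelines/ directory.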
databricks_bundle_decorators/codegen.py
@@ -0,0 +1,107 @@
"""Convert registries into ``databricks.bundles.jobs`` resource objects.

Called at deploy time by the resource loader. Reads the global
registries populated by ``@task``, ``@job_cluster``, and ``@job`` decorators
and produces ``Job`` dataclass instances that the Databricks CLI
serialises into the bundle configuration.
"""

from databricks_bundle_decorators.registry import (
    _CLUSTER_REGISTRY,
    _JOB_REGISTRY,
    _TASK_REGISTRY,
)


def generate_resources(package_name: str = "databricks_bundle_decorators") -> dict:
    """Build ``{resource_key: Job}`` from the global registries.

    Parameters
    ----------
    package_name:
        The Python package name used in ``PythonWheelTask``. Must match
        the ``[project].name`` in *pyproject.toml*.
    """
    from databricks.bundles.jobs import (
        ClusterSpec,
        Job,
        JobCluster,
        JobParameterDefinition,
        Library,
        PythonWheelTask,
        Task,
        TaskDependency,
    )

    jobs: dict[str, Job] = {}

    for job_name, job_meta in _JOB_REGISTRY.items():
        tasks: list[Task] = []

        for task_key, upstream_keys in job_meta.dag.items():
            depends_on = [TaskDependency(task_key=uk) for uk in upstream_keys]

            # ----- named_parameters sent to the wheel entry-point ----------
            named_params: dict[str, str] = {
                "__job_name__": job_name,
                "__task_key__": task_key,
                "__run_id__": "{{job.run_id}}",
            }

            # Upstream edge info so the runtime can invoke IoManager.load()
            edges = job_meta.dag_edges.get(task_key, {})
            for param_name, upstream_task in edges.items():
                named_params[f"__upstream__{param_name}"] = upstream_task

            # Forward every job-level parameter to the task CLI
            for param_name in job_meta.params:
                named_params[param_name] = (
                    "{{" + f'job.parameters["{param_name}"]' + "}}"
                )

            # ----- per-task SDK config (max_retries, timeout, etc.) -----
            qualified_key = f"{job_name}.{task_key}"
            task_meta = _TASK_REGISTRY.get(qualified_key)
            task_sdk_config = task_meta.sdk_config if task_meta else {}

            task_obj = Task(
                task_key=task_key,
                depends_on=depends_on,
                job_cluster_key=job_meta.cluster,
                python_wheel_task=PythonWheelTask(
                    package_name=package_name,
                    entry_point="dbxdec-run",
                    named_parameters=named_params,  # type: ignore[arg-type]  # SDK Variable wrappers
                ),
                libraries=[Library(whl="dist/*.whl")],
                **task_sdk_config,
            )
            tasks.append(task_obj)

        # ----- job clusters -----------------------------------------------
        job_clusters: list[JobCluster] = []
        if job_meta.cluster and job_meta.cluster in _CLUSTER_REGISTRY:
            cluster_meta = _CLUSTER_REGISTRY[job_meta.cluster]
            job_clusters.append(
                JobCluster(
                    job_cluster_key=cluster_meta.name,
                    new_cluster=ClusterSpec.from_dict(cluster_meta.spec),  # type: ignore[arg-type]  # typed as ClusterSpecDict
                )
            )

        # ----- parameters -------------------------------------------------
        parameters = [
            JobParameterDefinition(name=k, default=v)
            for k, v in job_meta.params.items()
        ]

        job_obj = Job(
            name=job_name,
            tasks=tasks,  # type: ignore[arg-type]  # SDK Variable wrappers
            parameters=parameters,
            job_clusters=job_clusters,  # type: ignore[arg-type]  # SDK Variable wrappers
            **job_meta.sdk_config,
        )
        jobs[job_name] = job_obj

    return jobs
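Against the scaffolded example pipeline, deploy-time generation would yield roughly the following — a sketch, with ``my_pipelines`` standing in for the real wheel package name and the commented values traced by hand from the loop above:

from databricks_bundle_decorators import discover_pipelines
from databricks_bundle_decorators.codegen import generate_resources

discover_pipelines()  # import pipeline modules so the decorators register
jobs = generate_resources(package_name="my_pipelines")

job = jobs["example_job"]
# job.parameters       -> [JobParameterDefinition(name="url", default="https://example.com")]
# job.job_clusters[0]  -> JobCluster(job_cluster_key="default_cluster", new_cluster=...)
# hello = job.tasks[0]
# hello.python_wheel_task.entry_point == "dbxdec-run"
# hello.python_wheel_task.named_parameters includes:
#   __job_name__="example_job", __task_key__="hello", __run_id__="{{job.run_id}}",
#   url='{{job.parameters["url"]}}'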
databricks_bundle_decorators/context.py
@@ -0,0 +1,31 @@
"""Runtime context – provides job parameters to task functions.

At runtime the entry point populates the global ``params`` dict from CLI
arguments (parsed via ``argparse``). Task code imports and reads it::

    from databricks_bundle_decorators import params

    @task
    def my_task():
        url = params["url"]
"""

from typing import Any


class _Params(dict[str, Any]):
    """Dict subclass that holds job parameters.

    An instance lives at module level and is populated by the runtime
    runner before the task function is invoked.
    """


# Global params instance – importable by user code.
params: _Params = _Params()


def _populate_params(values: dict[str, Any]) -> None:
    """Replace the contents of the global *params* dict."""
    params.clear()
    params.update(values)
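The populate step is driven by the ``dbxdec-run`` wheel entry point (runtime.py, not shown in this section). A rough sketch of the handoff, assuming Databricks passes ``named_parameters`` to the wheel as ``--key=value`` argv pairs — the parsing below is illustrative, not the package's actual runner:

import sys

from databricks_bundle_decorators.context import _populate_params, params


def _job_params(argv: list[str]) -> dict[str, str]:
    pairs = (arg.lstrip("-").split("=", 1) for arg in argv if "=" in arg)
    # drop the framework's own __job_name__/__task_key__/__upstream__* keys
    return {k: v for k, v in pairs if not k.startswith("__")}


_populate_params(_job_params(sys.argv[1:]))
print(params.get("url"))  # task code now sees the job parameters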