zenml-nightly 0.61.0.dev20240713__py3-none-any.whl → 0.62.0.dev20240717__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. README.md +1 -1
  2. RELEASE_NOTES.md +40 -0
  3. zenml/VERSION +1 -1
  4. zenml/__init__.py +2 -0
  5. zenml/cli/stack.py +87 -228
  6. zenml/cli/stack_components.py +5 -3
  7. zenml/constants.py +2 -0
  8. zenml/entrypoints/entrypoint.py +3 -1
  9. zenml/integrations/__init__.py +1 -0
  10. zenml/integrations/constants.py +1 -0
  11. zenml/integrations/databricks/__init__.py +52 -0
  12. zenml/integrations/databricks/flavors/__init__.py +30 -0
  13. zenml/integrations/databricks/flavors/databricks_model_deployer_flavor.py +118 -0
  14. zenml/integrations/databricks/flavors/databricks_orchestrator_flavor.py +147 -0
  15. zenml/integrations/databricks/model_deployers/__init__.py +20 -0
  16. zenml/integrations/databricks/model_deployers/databricks_model_deployer.py +249 -0
  17. zenml/integrations/databricks/orchestrators/__init__.py +20 -0
  18. zenml/integrations/databricks/orchestrators/databricks_orchestrator.py +498 -0
  19. zenml/integrations/databricks/orchestrators/databricks_orchestrator_entrypoint_config.py +97 -0
  20. zenml/integrations/databricks/services/__init__.py +19 -0
  21. zenml/integrations/databricks/services/databricks_deployment.py +407 -0
  22. zenml/integrations/databricks/utils/__init__.py +14 -0
  23. zenml/integrations/databricks/utils/databricks_utils.py +87 -0
  24. zenml/integrations/great_expectations/data_validators/ge_data_validator.py +12 -8
  25. zenml/integrations/huggingface/materializers/huggingface_datasets_materializer.py +88 -3
  26. zenml/integrations/huggingface/steps/accelerate_runner.py +1 -7
  27. zenml/integrations/kubernetes/orchestrators/manifest_utils.py +7 -0
  28. zenml/integrations/kubernetes/pod_settings.py +2 -0
  29. zenml/integrations/lightgbm/__init__.py +1 -0
  30. zenml/integrations/mlflow/__init__.py +1 -1
  31. zenml/integrations/mlflow/model_registries/mlflow_model_registry.py +6 -2
  32. zenml/integrations/mlflow/services/mlflow_deployment.py +1 -1
  33. zenml/integrations/skypilot_lambda/__init__.py +1 -1
  34. zenml/materializers/built_in_materializer.py +1 -1
  35. zenml/materializers/cloudpickle_materializer.py +1 -1
  36. zenml/model/model.py +1 -1
  37. zenml/models/v2/misc/full_stack.py +32 -0
  38. zenml/orchestrators/__init__.py +4 -0
  39. zenml/orchestrators/wheeled_orchestrator.py +147 -0
  40. zenml/service_connectors/service_connector_utils.py +349 -0
  41. zenml/stack_deployments/gcp_stack_deployment.py +2 -4
  42. zenml/steps/base_step.py +7 -5
  43. zenml/utils/function_utils.py +1 -1
  44. zenml/utils/pipeline_docker_image_builder.py +8 -0
  45. zenml/zen_server/dashboard/assets/{404-DpJaNHKF.js → 404-B_YdvmwS.js} +1 -1
  46. zenml/zen_server/dashboard/assets/{@reactflow-DJfzkHO1.js → @reactflow-l_1hUr1S.js} +1 -1
  47. zenml/zen_server/dashboard/assets/{AwarenessChannel-BYDLT2xC.js → AwarenessChannel-CFg5iX4Z.js} +1 -1
  48. zenml/zen_server/dashboard/assets/{CodeSnippet-BkOuRmyq.js → CodeSnippet-Dvkx_82E.js} +1 -1
  49. zenml/zen_server/dashboard/assets/CollapsibleCard-opiuBHHc.js +1 -0
  50. zenml/zen_server/dashboard/assets/{Commands-ZvWR1BRs.js → Commands-DoN1xrEq.js} +1 -1
  51. zenml/zen_server/dashboard/assets/{CopyButton-DVwLkafa.js → CopyButton-Cr7xYEPb.js} +1 -1
  52. zenml/zen_server/dashboard/assets/{CsvVizualization-C2IiqX4I.js → CsvVizualization-Ck-nZ43m.js} +3 -3
  53. zenml/zen_server/dashboard/assets/{Error-CqX0VqW_.js → Error-kLtljEOM.js} +1 -1
  54. zenml/zen_server/dashboard/assets/{ExecutionStatus-BoLUXR9t.js → ExecutionStatus-DguLLgTK.js} +1 -1
  55. zenml/zen_server/dashboard/assets/{Helpbox-LFydyVwh.js → Helpbox-BXUMP21n.js} +1 -1
  56. zenml/zen_server/dashboard/assets/{Infobox-DnENC0sh.js → Infobox-DSt0O-dm.js} +1 -1
  57. zenml/zen_server/dashboard/assets/{InlineAvatar-CbJtYr0t.js → InlineAvatar-xsrsIGE-.js} +1 -1
  58. zenml/zen_server/dashboard/assets/Pagination-C6X-mifw.js +1 -0
  59. zenml/zen_server/dashboard/assets/{SetPassword-BYBdbQDo.js → SetPassword-BXGTWiwj.js} +1 -1
  60. zenml/zen_server/dashboard/assets/{SuccessStep-Nx743hll.js → SuccessStep-DZC60t0x.js} +1 -1
  61. zenml/zen_server/dashboard/assets/{UpdatePasswordSchemas-DF9gSzE0.js → UpdatePasswordSchemas-DGvwFWO1.js} +1 -1
  62. zenml/zen_server/dashboard/assets/{chevron-right-double-BiEMg7rd.js → chevron-right-double-CZBOf6JM.js} +1 -1
  63. zenml/zen_server/dashboard/assets/cloud-only-C_yFCAkP.js +1 -0
  64. zenml/zen_server/dashboard/assets/index-BczVOqUf.js +55 -0
  65. zenml/zen_server/dashboard/assets/index-EpMIKgrI.css +1 -0
  66. zenml/zen_server/dashboard/assets/{login-mutation-BUnVASxp.js → login-mutation-CrHrndTI.js} +1 -1
  67. zenml/zen_server/dashboard/assets/logs-D8k8BVFf.js +1 -0
  68. zenml/zen_server/dashboard/assets/{not-found-B4VnX8gK.js → not-found-DYa4pC-C.js} +1 -1
  69. zenml/zen_server/dashboard/assets/{package-CsUhPmou.js → package-B3fWP-Dh.js} +1 -1
  70. zenml/zen_server/dashboard/assets/page-1h_sD1jz.js +1 -0
  71. zenml/zen_server/dashboard/assets/{page-Sxn82W-5.js → page-1iL8aMqs.js} +1 -1
  72. zenml/zen_server/dashboard/assets/{page-DMOYZppS.js → page-2grKx_MY.js} +1 -1
  73. zenml/zen_server/dashboard/assets/page-5NCOHOsy.js +1 -0
  74. zenml/zen_server/dashboard/assets/{page-JyfeDUfu.js → page-8a4UMKXZ.js} +1 -1
  75. zenml/zen_server/dashboard/assets/{page-Bx6o0ARS.js → page-B6h3iaHJ.js} +1 -1
  76. zenml/zen_server/dashboard/assets/page-BDns21Iz.js +1 -0
  77. zenml/zen_server/dashboard/assets/{page-3efNCDeb.js → page-BhgCDInH.js} +2 -2
  78. zenml/zen_server/dashboard/assets/{page-DKlIdAe5.js → page-Bi-wtWiO.js} +2 -2
  79. zenml/zen_server/dashboard/assets/{page-7zTHbhhI.js → page-BkeAAYwp.js} +1 -1
  80. zenml/zen_server/dashboard/assets/{page-CRTJ0UuR.js → page-BkuQDIf-.js} +1 -1
  81. zenml/zen_server/dashboard/assets/page-BnaevhnB.js +1 -0
  82. zenml/zen_server/dashboard/assets/{page-BEs6jK71.js → page-Bq0YxkLV.js} +1 -1
  83. zenml/zen_server/dashboard/assets/page-Bs2F4eoD.js +2 -0
  84. zenml/zen_server/dashboard/assets/{page-CUZIGO-3.js → page-C6-UGEbH.js} +1 -1
  85. zenml/zen_server/dashboard/assets/{page-Xu8JEjSU.js → page-CCNRIt_f.js} +1 -1
  86. zenml/zen_server/dashboard/assets/{page-DvCvroOM.js → page-CHNxpz3n.js} +1 -1
  87. zenml/zen_server/dashboard/assets/{page-BpSqIf4B.js → page-DgorQFqi.js} +1 -1
  88. zenml/zen_server/dashboard/assets/page-K8ebxVIs.js +1 -0
  89. zenml/zen_server/dashboard/assets/{page-Cx67M0QT.js → page-MFQyIJd3.js} +1 -1
  90. zenml/zen_server/dashboard/assets/page-TgCF0P_U.js +1 -0
  91. zenml/zen_server/dashboard/assets/page-ZnCEe-eK.js +9 -0
  92. zenml/zen_server/dashboard/assets/{page-Dc_7KMQE.js → page-uA5prJGY.js} +1 -1
  93. zenml/zen_server/dashboard/assets/persist-D7HJNBWx.js +1 -0
  94. zenml/zen_server/dashboard/assets/plus-C8WOyCzt.js +1 -0
  95. zenml/zen_server/dashboard/assets/stack-detail-query-Cficsl6d.js +1 -0
  96. zenml/zen_server/dashboard/assets/update-server-settings-mutation-7d8xi1tS.js +1 -0
  97. zenml/zen_server/dashboard/assets/{url-DuQMeqYA.js → url-D7mAQGUM.js} +1 -1
  98. zenml/zen_server/dashboard/index.html +4 -4
  99. zenml/zen_server/dashboard_legacy/asset-manifest.json +4 -4
  100. zenml/zen_server/dashboard_legacy/index.html +1 -1
  101. zenml/zen_server/dashboard_legacy/{precache-manifest.c8c57fb0d2132b1d3c2119e776b7dfb3.js → precache-manifest.12246c7548e71e2c4438e496360de80c.js} +4 -4
  102. zenml/zen_server/dashboard_legacy/service-worker.js +1 -1
  103. zenml/zen_server/dashboard_legacy/static/js/main.3b27024b.chunk.js +2 -0
  104. zenml/zen_server/dashboard_legacy/static/js/{main.382439a7.chunk.js.map → main.3b27024b.chunk.js.map} +1 -1
  105. zenml/zen_server/deploy/helm/Chart.yaml +1 -1
  106. zenml/zen_server/deploy/helm/README.md +2 -2
  107. zenml/zen_server/routers/service_connectors_endpoints.py +57 -0
  108. zenml/zen_stores/migrations/versions/0.62.0_release.py +23 -0
  109. zenml/zen_stores/rest_zen_store.py +4 -0
  110. {zenml_nightly-0.61.0.dev20240713.dist-info → zenml_nightly-0.62.0.dev20240717.dist-info}/METADATA +2 -2
  111. {zenml_nightly-0.61.0.dev20240713.dist-info → zenml_nightly-0.62.0.dev20240717.dist-info}/RECORD +114 -96
  112. zenml/zen_server/dashboard/assets/Pagination-DEbVUupy.js +0 -1
  113. zenml/zen_server/dashboard/assets/chevron-down-D_ZlKMqH.js +0 -1
  114. zenml/zen_server/dashboard/assets/cloud-only-DVbIeckv.js +0 -1
  115. zenml/zen_server/dashboard/assets/index-C_CrU4vI.js +0 -1
  116. zenml/zen_server/dashboard/assets/index-DK1ynKjA.js +0 -55
  117. zenml/zen_server/dashboard/assets/index-inApY3KQ.css +0 -1
  118. zenml/zen_server/dashboard/assets/page-C43QGHTt.js +0 -9
  119. zenml/zen_server/dashboard/assets/page-CR0OG7ss.js +0 -1
  120. zenml/zen_server/dashboard/assets/page-CaopxiU1.js +0 -1
  121. zenml/zen_server/dashboard/assets/page-D7Z399xy.js +0 -1
  122. zenml/zen_server/dashboard/assets/page-D93kd7Xj.js +0 -1
  123. zenml/zen_server/dashboard/assets/page-DMsSn3dv.js +0 -2
  124. zenml/zen_server/dashboard/assets/page-Hus2pr9T.js +0 -1
  125. zenml/zen_server/dashboard/assets/page-TKXERe16.js +0 -1
  126. zenml/zen_server/dashboard/assets/plus-DOeLmm7C.js +0 -1
  127. zenml/zen_server/dashboard/assets/update-server-settings-mutation-CR8e3Sir.js +0 -1
  128. zenml/zen_server/dashboard_legacy/static/js/main.382439a7.chunk.js +0 -2
  129. {zenml_nightly-0.61.0.dev20240713.dist-info → zenml_nightly-0.62.0.dev20240717.dist-info}/LICENSE +0 -0
  130. {zenml_nightly-0.61.0.dev20240713.dist-info → zenml_nightly-0.62.0.dev20240717.dist-info}/WHEEL +0 -0
  131. {zenml_nightly-0.61.0.dev20240713.dist-info → zenml_nightly-0.62.0.dev20240717.dist-info}/entry_points.txt +0 -0
zenml/integrations/databricks/orchestrators/databricks_orchestrator.py (new file)
@@ -0,0 +1,498 @@
+ # Copyright (c) ZenML GmbH 2023. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+ #
+ #       https://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ # or implied. See the License for the specific language governing
+ # permissions and limitations under the License.
+ """Implementation of the Databricks orchestrator."""
+
+ import itertools
+ import os
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast
+ from uuid import UUID
+
+ from databricks.sdk import WorkspaceClient as DatabricksClient
+ from databricks.sdk.service.compute import (
+     AutoScale,
+     ClientsTypes,
+     ClusterSpec,
+     WorkloadType,
+ )
+ from databricks.sdk.service.jobs import CronSchedule, JobCluster
+ from databricks.sdk.service.jobs import Task as DatabricksTask
+
+ from zenml.client import Client
+ from zenml.constants import (
+     ENV_ZENML_CUSTOM_SOURCE_ROOT,
+     METADATA_ORCHESTRATOR_URL,
+ )
+ from zenml.integrations.databricks.flavors.databricks_orchestrator_flavor import (
+     DatabricksOrchestratorConfig,
+     DatabricksOrchestratorSettings,
+ )
+ from zenml.integrations.databricks.orchestrators.databricks_orchestrator_entrypoint_config import (
+     ENV_ZENML_DATABRICKS_ORCHESTRATOR_RUN_ID,
+     DatabricksEntrypointConfiguration,
+ )
+ from zenml.integrations.databricks.utils.databricks_utils import (
+     convert_step_to_task,
+ )
+ from zenml.io import fileio
+ from zenml.logger import get_logger
+ from zenml.metadata.metadata_types import MetadataType, Uri
+ from zenml.models.v2.core.schedule import ScheduleResponse
+ from zenml.orchestrators.utils import get_orchestrator_run_name
+ from zenml.orchestrators.wheeled_orchestrator import WheeledOrchestrator
+ from zenml.stack import StackValidator
+ from zenml.utils import io_utils
+ from zenml.utils.pipeline_docker_image_builder import (
+     PipelineDockerImageBuilder,
+ )
+
+ if TYPE_CHECKING:
+     from zenml.models import PipelineDeploymentResponse
+     from zenml.stack import Stack
+
+
+ logger = get_logger(__name__)
+
+ ZENML_STEP_DEFAULT_ENTRYPOINT_COMMAND = "entrypoint.main"
+ DATABRICKS_WHEELS_DIRECTORY_PREFIX = "dbfs:/FileStore/zenml"
+ DATABRICKS_LOCAL_FILESYSTEM_PREFIX = "file:/"
+ DATABRICKS_CLUSTER_DEFAULT_NAME = "zenml-databricks-cluster"
+ DATABRICKS_SPARK_DEFAULT_VERSION = "15.3.x-scala2.12"
+ DATABRICKS_JOB_ID_PARAMETER_REFERENCE = "{{job.id}}"
+ DATABRICKS_ZENML_DEFAULT_CUSTOM_REPOSITORY_PATH = "."
+
+
+ class DatabricksOrchestrator(WheeledOrchestrator):
+     """Orchestrator responsible for running pipelines remotely on Databricks.
+
+     This orchestrator supports scheduled runs only via a cron expression.
+     """
+
+     # The default instance type to use if none is specified in settings
+     DEFAULT_INSTANCE_TYPE: Optional[str] = None
+
+     @property
+     def validator(self) -> Optional[StackValidator]:
+         """Validates the stack.
+
+         In the remote case, checks that the stack contains a container
+         registry, an image builder and only remote components.
+
+         Returns:
+             A `StackValidator` instance.
+         """
+
+         def _validate_remote_components(
+             stack: "Stack",
+         ) -> Tuple[bool, str]:
+             for component in stack.components.values():
+                 if not component.config.is_local:
+                     continue
+
+                 return False, (
+                     f"The Databricks orchestrator runs pipelines remotely, "
+                     f"but the '{component.name}' {component.type.value} is "
+                     "a local stack component and will not be available in "
+                     "the Databricks step.\nPlease ensure that you always "
+                     "use non-local stack components with the Databricks "
+                     "orchestrator."
+                 )
+
+             return True, ""
+
+         return StackValidator(
+             custom_validation_function=_validate_remote_components,
+         )
+
+     def _get_databricks_client(
+         self,
+     ) -> DatabricksClient:
+         """Creates a Databricks client.
+
+         Returns:
+             The Databricks client.
+         """
+         return DatabricksClient(
+             host=self.config.host,
+             client_id=self.config.client_id,
+             client_secret=self.config.client_secret,
+         )
+
+     @property
+     def config(self) -> DatabricksOrchestratorConfig:
+         """Returns the `DatabricksOrchestratorConfig` config.
+
+         Returns:
+             The configuration.
+         """
+         return cast(DatabricksOrchestratorConfig, self._config)
+
+     @property
+     def settings_class(self) -> Type[DatabricksOrchestratorSettings]:
+         """Settings class for the Databricks orchestrator.
+
+         Returns:
+             The settings class.
+         """
+         return DatabricksOrchestratorSettings
+
+     def get_orchestrator_run_id(self) -> str:
+         """Returns the active orchestrator run id.
+
+         Raises:
+             RuntimeError: If the run id cannot be read from the environment.
+                 This happens when this method gets called while the
+                 orchestrator is not running a pipeline.
+
+         Returns:
+             The orchestrator run id.
+         """
+         try:
+             return os.environ[ENV_ZENML_DATABRICKS_ORCHESTRATOR_RUN_ID]
+         except KeyError:
+             raise RuntimeError(
+                 "Unable to read run id from environment variable "
+                 f"{ENV_ZENML_DATABRICKS_ORCHESTRATOR_RUN_ID}."
+             )
+
+     @property
+     def root_directory(self) -> str:
+         """Path to the root directory for all files concerning this orchestrator.
+
+         Returns:
+             Path to the root directory.
+         """
+         return os.path.join(
+             io_utils.get_global_config_directory(),
+             "databricks",
+             str(self.id),
+         )
+
+     @property
+     def pipeline_directory(self) -> str:
+         """Returns path to a directory in which the pipeline files are stored.
+
+         Returns:
+             Path to the pipeline directory.
+         """
+         return os.path.join(self.root_directory, "pipelines")
+
+     def setup_credentials(self) -> None:
+         """Set up credentials for the orchestrator."""
+         connector = self.get_connector()
+         assert connector is not None
+         connector.configure_local_client()
+
+     def prepare_or_run_pipeline(
+         self,
+         deployment: "PipelineDeploymentResponse",
+         stack: "Stack",
+         environment: Dict[str, str],
+     ) -> Any:
+         """Creates a wheel and uploads the pipeline to Databricks.
+
+         This functions as an intermediary representation of the pipeline
+         which is then deployed to Databricks.
+
+         How it works:
+         -------------
+         Before this method is called the `prepare_pipeline_deployment()`
+         method builds a docker image that contains the code for the
+         pipeline, all steps and the context around these files.
+
+         Based on this docker image a callable is created which builds a
+         task for each step (`_construct_databricks_pipeline`).
+         To do this the entrypoint of the docker image is configured to
+         run the correct step within the docker image. The dependencies
+         between these tasks are then also configured onto each
+         task by pointing at the downstream steps.
+
+         Args:
+             deployment: The pipeline deployment to prepare or run.
+             stack: The stack the pipeline will run on.
+             environment: Environment variables to set in the orchestration
+                 environment.
+
+         Raises:
+             ValueError: If the schedule has no cron expression, or if no
+                 schedule timezone is set when a cron expression is given.
+         """
+         if deployment.schedule:
+             if (
+                 deployment.schedule.catchup
+                 or deployment.schedule.interval_second
+             ):
+                 logger.warning(
+                     "Databricks orchestrator only uses schedules with the "
+                     "`cron_expression` property, with optional `start_time` "
+                     "and/or `end_time`. All other properties are ignored."
+                 )
+             if deployment.schedule.cron_expression is None:
+                 raise ValueError(
+                     "Property `cron_expression` must be set when passing a "
+                     "schedule to a Databricks orchestrator."
+                 )
+             if (
+                 deployment.schedule.cron_expression
+                 and self.settings_class().schedule_timezone is None
+             ):
+                 raise ValueError(
+                     "Property `schedule_timezone` must be set when passing "
+                     "`cron_expression` to a Databricks orchestrator. "
+                     "The Databricks orchestrator requires a Java Timezone ID "
+                     "to run the pipeline on a schedule. Please refer to "
+                     "https://docs.oracle.com/middleware/1221/wcs/tag-ref/MISC/TimeZones.html "
+                     "for more information."
+                 )
+
+         # Get deployment id
+         deployment_id = deployment.id
+
+         # Create a callable that builds one Databricks task per step.
+         def _construct_databricks_pipeline(
+             zenml_project_wheel: str, job_cluster_key: str
+         ) -> List[DatabricksTask]:
+             """Create a Databricks task for each step.
+
+             This should contain the name of the step or task and configures
+             the entrypoint of the task to run the step.
+
+             Additionally, this gives each task information about its
+             direct downstream steps.
+
+             Args:
+                 zenml_project_wheel: The wheel package containing the ZenML
+                     project.
+                 job_cluster_key: The ID of the Databricks job cluster.
+
+             Returns:
+                 A list of Databricks tasks.
+             """
+             tasks = []
+             for step_name, step in deployment.step_configurations.items():
+                 # The arguments are passed to configure the entrypoint of the
+                 # docker container when the step is called.
+                 arguments = DatabricksEntrypointConfiguration.get_entrypoint_arguments(
+                     step_name=step_name,
+                     deployment_id=deployment_id,
+                     wheel_package=self.package_name,
+                     databricks_job_id=DATABRICKS_JOB_ID_PARAMETER_REFERENCE,
+                 )
+
+                 # Find the upstream tasks of the current step and configure
+                 # the current task to run after them.
+                 upstream_steps = [
+                     f"{deployment_id}_{upstream_step_name}"
+                     for upstream_step_name in step.spec.upstream_steps
+                 ]
+
+                 docker_settings = step.config.docker_settings
+                 docker_image_builder = PipelineDockerImageBuilder()
+                 # Gather the requirements files
+                 requirements_files = (
+                     docker_image_builder.gather_requirements_files(
+                         docker_settings=docker_settings,
+                         stack=Client().active_stack,
+                         log=False,
+                     )
+                 )
+
+                 # Extract and clean the requirements
+                 requirements = list(
+                     itertools.chain.from_iterable(
+                         r[1].strip().split("\n") for r in requirements_files
+                     )
+                 )
+
+                 # Remove empty items and duplicates
+                 requirements = sorted(set(filter(None, requirements)))
+
+                 task = convert_step_to_task(
+                     f"{deployment_id}_{step_name}",
+                     ZENML_STEP_DEFAULT_ENTRYPOINT_COMMAND,
+                     arguments,
+                     requirements,
+                     depends_on=upstream_steps,
+                     zenml_project_wheel=zenml_project_wheel,
+                     job_cluster_key=job_cluster_key,
+                 )
+                 tasks.append(task)
+             return tasks
+
+
+         # Get the orchestrator run name
+         orchestrator_run_name = get_orchestrator_run_name(
+             pipeline_name=deployment.pipeline_configuration.name
+         )
+         # Get a filepath to use to save the finished yaml to
+         fileio.makedirs(self.pipeline_directory)
+         pipeline_file_path = os.path.join(
+             self.pipeline_directory, f"{orchestrator_run_name}.yaml"
+         )
+
+         # Copy the repository to a temporary directory and add a setup.py file
+         repository_temp_dir = (
+             self.copy_repository_to_temp_dir_and_add_setup_py()
+         )
+
+         # Create a wheel for the package in the temporary directory
+         wheel_path = self.create_wheel(temp_dir=repository_temp_dir)
+
+         databricks_client = self._get_databricks_client()
+
+         # Create a directory on DBFS and copy the wheel into it.
+         deployment_name = (
+             deployment.pipeline.name if deployment.pipeline else "default"
+         )
+         databricks_directory = f"{DATABRICKS_WHEELS_DIRECTORY_PREFIX}/{deployment_name}/{orchestrator_run_name}"
+         databricks_wheel_path = (
+             f"{databricks_directory}/{wheel_path.rsplit('/', 1)[-1]}"
+         )
+
+         databricks_client.dbutils.fs.mkdirs(databricks_directory)
+         databricks_client.dbutils.fs.cp(
+             f"{DATABRICKS_LOCAL_FILESYSTEM_PREFIX}/{wheel_path}",
+             databricks_wheel_path,
+         )
+
+         # Construct the env variables for the pipeline
+         env_vars = environment.copy()
+         spark_env_vars = self.settings_class().spark_env_vars
+         if spark_env_vars:
+             for key, value in spark_env_vars.items():
+                 env_vars[key] = value
+         env_vars[ENV_ZENML_CUSTOM_SOURCE_ROOT] = (
+             DATABRICKS_ZENML_DEFAULT_CUSTOM_REPOSITORY_PATH
+         )
+
+         fileio.rmtree(repository_temp_dir)
+
+         logger.info(
+             "Writing Databricks workflow definition to `%s`.",
+             pipeline_file_path,
+         )
+
+         # Use the Databricks client to upload and run the pipeline.
+         job_cluster_key = self.sanitize_name(f"{deployment_id}")
+         self._upload_and_run_pipeline(
+             pipeline_name=orchestrator_run_name,
+             tasks=_construct_databricks_pipeline(
+                 databricks_wheel_path, job_cluster_key
+             ),
+             env_vars=env_vars,
+             job_cluster_key=job_cluster_key,
+             schedule=deployment.schedule,
+         )
+
+     def _upload_and_run_pipeline(
+         self,
+         pipeline_name: str,
+         tasks: List[DatabricksTask],
+         env_vars: Dict[str, str],
+         job_cluster_key: str,
+         schedule: Optional["ScheduleResponse"] = None,
+     ) -> None:
+         """Uploads and runs the pipeline on Databricks Jobs.
+
+         Args:
+             pipeline_name: The name of the pipeline.
+             tasks: The list of tasks to run.
+             env_vars: The environment variables.
+             job_cluster_key: The ID of the Databricks job cluster.
+             schedule: The schedule to run the pipeline on.
+
+         Raises:
+             ValueError: If the `Job Compute` policy is not found.
+             ValueError: If `schedule_timezone` is not set when a
+                 `cron_expression` schedule is passed.
+         """
+         databricks_client = self._get_databricks_client()
+         spark_conf = self.settings_class().spark_conf or {}
+         spark_conf[
+             "spark.databricks.driver.dbfsLibraryInstallationAllowed"
+         ] = "true"
+
+         policy_id = self.settings_class().policy_id or None
+         for policy in databricks_client.cluster_policies.list():
+             if policy.name == "Job Compute":
+                 policy_id = policy.policy_id
+         if policy_id is None:
+             raise ValueError(
+                 "Could not find the `Job Compute` policy in Databricks."
+             )
+         job_cluster = JobCluster(
+             job_cluster_key=job_cluster_key,
+             new_cluster=ClusterSpec(
+                 spark_version=self.settings_class().spark_version
+                 or DATABRICKS_SPARK_DEFAULT_VERSION,
+                 num_workers=self.settings_class().num_workers,
+                 node_type_id=self.settings_class().node_type_id
+                 or "Standard_D4s_v5",
+                 policy_id=policy_id,
+                 autoscale=AutoScale(
+                     min_workers=self.settings_class().autoscale[0],
+                     max_workers=self.settings_class().autoscale[1],
+                 ),
+                 single_user_name=self.settings_class().single_user_name,
+                 spark_env_vars=env_vars,
+                 spark_conf=spark_conf,
+                 workload_type=WorkloadType(
+                     clients=ClientsTypes(jobs=True, notebooks=False)
+                 ),
+             ),
+         )
+         if schedule and schedule.cron_expression:
+             schedule_timezone = self.settings_class().schedule_timezone
+             if schedule_timezone:
+                 databricks_schedule = CronSchedule(
+                     quartz_cron_expression=schedule.cron_expression,
+                     timezone_id=schedule_timezone,
+                 )
+             else:
+                 raise ValueError(
+                     "Property `schedule_timezone` must be set when passing "
+                     "`cron_expression` to a Databricks orchestrator. "
+                     "The Databricks orchestrator requires a Java Timezone ID "
+                     "to run the pipeline on a schedule. Please refer to "
+                     "https://docs.oracle.com/middleware/1221/wcs/tag-ref/MISC/TimeZones.html "
+                     "for more information."
+                 )
+         else:
+             databricks_schedule = None
+
+         job = databricks_client.jobs.create(
+             name=pipeline_name,
+             tasks=tasks,
+             job_clusters=[job_cluster],
+             schedule=databricks_schedule,
+         )
+         if job.job_id:
+             databricks_client.jobs.run_now(job_id=job.job_id)
+         else:
+             raise ValueError("An error occurred while getting the job id.")
+
+     def get_pipeline_run_metadata(
+         self, run_id: UUID
+     ) -> Dict[str, "MetadataType"]:
+         """Get general component-specific metadata for a pipeline run.
+
+         Args:
+             run_id: The ID of the pipeline run.
+
+         Returns:
+             A dictionary of metadata.
+         """
+         run_url = (
+             f"{self.config.host}/jobs/{self.get_orchestrator_run_id()}"
+         )
+         return {
+             METADATA_ORCHESTRATOR_URL: Uri(run_url),
+         }
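
To make the new orchestrator's settings and scheduling constraints concrete, here is a minimal usage sketch. It assumes the flavor registers its settings under the key "orchestrator.databricks" and that any settings fields not shown have defaults; all values are illustrative placeholders, not defaults shipped with the release. Only `DatabricksOrchestratorSettings`, `pipeline` and `Schedule` come from the diff and the existing ZenML API.

from zenml import pipeline
from zenml.config.schedule import Schedule
from zenml.integrations.databricks.flavors.databricks_orchestrator_flavor import (
    DatabricksOrchestratorSettings,
)

# These fields mirror the ones the orchestrator reads via `settings_class()` above.
settings = DatabricksOrchestratorSettings(
    spark_version="15.3.x-scala2.12",   # falls back to DATABRICKS_SPARK_DEFAULT_VERSION
    node_type_id="Standard_D4s_v5",
    autoscale=(1, 3),                   # consumed as autoscale[0] / autoscale[1]
    schedule_timezone="Europe/Berlin",  # required whenever cron_expression is set
)

@pipeline(settings={"orchestrator.databricks": settings})
def my_pipeline() -> None:
    ...  # step invocations go here

# Databricks receives the cron expression as a Quartz expression
# (`quartz_cron_expression` in `_upload_and_run_pipeline`); `catchup` and
# `interval_second` are ignored with a warning.
my_pipeline.with_options(
    schedule=Schedule(cron_expression="0 0 12 * * ?")
)()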
zenml/integrations/databricks/orchestrators/databricks_orchestrator_entrypoint_config.py (new file)
@@ -0,0 +1,97 @@
+ # Copyright (c) ZenML GmbH 2023. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+ #
+ #       https://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ # or implied. See the License for the specific language governing
+ # permissions and limitations under the License.
+ """Entrypoint configuration for ZenML Databricks pipeline steps."""
+
+ import os
+ import sys
+ from typing import Any, List, Set
+
+ import pkg_resources
+
+ from zenml.entrypoints.step_entrypoint_configuration import (
+     StepEntrypointConfiguration,
+ )
+
+ WHEEL_PACKAGE_OPTION = "wheel_package"
+ DATABRICKS_JOB_ID_OPTION = "databricks_job_id"
+ ENV_ZENML_DATABRICKS_ORCHESTRATOR_RUN_ID = (
+     "ZENML_DATABRICKS_ORCHESTRATOR_RUN_ID"
+ )
+
+
+ class DatabricksEntrypointConfiguration(StepEntrypointConfiguration):
+     """Entrypoint configuration for ZenML Databricks pipeline steps.
+
+     This configuration adds the wheel package containing the user's ZenML
+     project to the Python path and exposes the Databricks job id to the
+     step via an environment variable.
+     """
+
+     @classmethod
+     def get_entrypoint_options(cls) -> Set[str]:
+         """Gets all options required for running with this configuration.
+
+         Returns:
+             The superclass options as well as options for the wheel package
+             and the Databricks job id.
+         """
+         return (
+             super().get_entrypoint_options()
+             | {WHEEL_PACKAGE_OPTION}
+             | {DATABRICKS_JOB_ID_OPTION}
+         )
+
+     @classmethod
+     def get_entrypoint_arguments(
+         cls,
+         **kwargs: Any,
+     ) -> List[str]:
+         """Gets all arguments that the entrypoint command should be called with.
+
+         The argument list should be something that
+         `argparse.ArgumentParser.parse_args(...)` can handle (e.g.
+         `["--some_option", "some_value"]` or `["--some_option=some_value"]`).
+         It needs to provide values for all options returned by the
+         `get_entrypoint_options()` method of this class.
+
+         Args:
+             **kwargs: Kwargs, must include the step name.
+
+         Returns:
+             The superclass arguments as well as arguments for the wheel
+             package and the Databricks job id.
+         """
+         return super().get_entrypoint_arguments(**kwargs) + [
+             f"--{WHEEL_PACKAGE_OPTION}",
+             kwargs[WHEEL_PACKAGE_OPTION],
+             f"--{DATABRICKS_JOB_ID_OPTION}",
+             kwargs[DATABRICKS_JOB_ID_OPTION],
+         ]
+
+
+     def run(self) -> None:
+         """Runs the step."""
+         # Get the wheel package and add it to the sys path
+         wheel_package = self.entrypoint_args[WHEEL_PACKAGE_OPTION]
+         distribution = pkg_resources.get_distribution(wheel_package)
+         project_root = os.path.join(distribution.location, wheel_package)
+         if project_root not in sys.path:
+             sys.path.insert(0, project_root)
+             sys.path.insert(-1, project_root)
+
+         # Get the job id and add it to the environment
+         databricks_job_id = self.entrypoint_args[DATABRICKS_JOB_ID_OPTION]
+         os.environ[ENV_ZENML_DATABRICKS_ORCHESTRATOR_RUN_ID] = (
+             databricks_job_id
+         )
+
+         # Run the step
+         super().run()
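
As a quick illustration of the argument plumbing above, the sketch below shows the call the orchestrator makes and, roughly, the flag list it yields. The `step_name` and `deployment_id` options come from the base entrypoint configuration; the values here are hypothetical placeholders and the exact ordering of the superclass flags may differ.

from zenml.integrations.databricks.orchestrators.databricks_orchestrator_entrypoint_config import (
    DatabricksEntrypointConfiguration,
)

args = DatabricksEntrypointConfiguration.get_entrypoint_arguments(
    step_name="trainer",                   # hypothetical step name
    deployment_id="<deployment-uuid>",     # hypothetical deployment id
    wheel_package="zenml_pipeline_wheel",  # hypothetical wheel package name
    databricks_job_id="{{job.id}}",        # resolved by Databricks at runtime
)
# Roughly: ["--step_name", "trainer", "--deployment_id", "<deployment-uuid>",
#           "--wheel_package", "zenml_pipeline_wheel",
#           "--databricks_job_id", "{{job.id}}"]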
zenml/integrations/databricks/services/__init__.py (new file)
@@ -0,0 +1,19 @@
+ # Copyright (c) ZenML GmbH 2023. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at:
+ #
+ #       http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ # or implied. See the License for the specific language governing
+ # permissions and limitations under the License.
+ """Initialization of the Databricks Service."""
+
+ from zenml.integrations.databricks.services.databricks_deployment import (  # noqa
+     DatabricksDeploymentConfig,
+     DatabricksDeploymentService,
+ )
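
The re-export above means the new deployment service and its config can be imported directly from the services subpackage:

from zenml.integrations.databricks.services import (
    DatabricksDeploymentConfig,
    DatabricksDeploymentService,
)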