mlrun 1.6.4rc8__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +40 -122
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +47 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +79 -47
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +74 -1
- mlrun/common/db/sql_session.py +5 -5
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +45 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +33 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +12 -3
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +31 -5
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +196 -0
- mlrun/common/schemas/artifact.py +25 -4
- mlrun/common/schemas/auth.py +16 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -2
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +74 -44
- mlrun/common/schemas/frontend_spec.py +15 -7
- mlrun/common/schemas/function.py +12 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +20 -4
- mlrun/common/schemas/model_monitoring/constants.py +123 -42
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
- mlrun/common/schemas/notification.py +71 -14
- mlrun/common/schemas/object.py +2 -2
- mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
- mlrun/common/schemas/pipeline.py +8 -1
- mlrun/common/schemas/project.py +69 -18
- mlrun/common/schemas/runs.py +7 -1
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +4 -4
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +12 -4
- mlrun/common/types.py +14 -1
- mlrun/config.py +154 -69
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +67 -37
- mlrun/datastore/__init__.py +6 -8
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +143 -42
- mlrun/datastore/base.py +102 -58
- mlrun/datastore/datastore.py +34 -13
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -4
- mlrun/datastore/google_cloud_storage.py +97 -33
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +7 -2
- mlrun/datastore/s3.py +34 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +303 -111
- mlrun/datastore/spark_utils.py +31 -2
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +453 -176
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +6 -1
- mlrun/db/base.py +274 -41
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +893 -225
- mlrun/db/nopdb.py +291 -33
- mlrun/errors.py +36 -6
- mlrun/execution.py +115 -42
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +65 -73
- mlrun/feature_store/common.py +7 -12
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +39 -31
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +45 -34
- mlrun/features.py +11 -21
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +5 -6
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +2 -2
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +6 -6
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +61 -17
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +23 -13
- mlrun/launcher/remote.py +17 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +478 -103
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +163 -371
- mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
- mlrun/model_monitoring/applications/_application_steps.py +188 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +131 -278
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +199 -55
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +134 -398
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +8 -8
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +52 -25
- mlrun/projects/pipelines.py +191 -197
- mlrun/projects/project.py +1227 -400
- mlrun/render.py +16 -19
- mlrun/run.py +209 -184
- mlrun/runtimes/__init__.py +83 -15
- mlrun/runtimes/base.py +51 -35
- mlrun/runtimes/daskjob.py +17 -10
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +40 -11
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +769 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
- mlrun/runtimes/pod.py +281 -101
- mlrun/runtimes/remotesparkjob.py +12 -9
- mlrun/runtimes/sparkjob/spark3job.py +67 -51
- mlrun/runtimes/utils.py +41 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +85 -69
- mlrun/serving/server.py +69 -44
- mlrun/serving/states.py +209 -36
- mlrun/serving/utils.py +22 -14
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +133 -54
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +6 -2
- mlrun/utils/async_http.py +6 -8
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +21 -3
- mlrun/utils/helpers.py +405 -225
- mlrun/utils/http.py +3 -6
- mlrun/utils/logger.py +112 -16
- mlrun/utils/notifications/notification/__init__.py +17 -13
- mlrun/utils/notifications/notification/base.py +50 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +59 -2
- mlrun/utils/notifications/notification_pusher.py +149 -30
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +4 -6
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- mlrun-1.7.0.dist-info/METADATA +378 -0
- mlrun-1.7.0.dist-info/RECORD +351 -0
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -273
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/prometheus.py +0 -219
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc8.dist-info/METADATA +0 -272
- mlrun-1.6.4rc8.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
mlrun/runtimes/kubejob.py
CHANGED
|
@@ -11,17 +11,16 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
15
|
-
import time
|
|
14
|
+
import typing
|
|
16
15
|
import warnings
|
|
17
16
|
|
|
17
|
+
from mlrun_pipelines.common.ops import build_op
|
|
18
|
+
|
|
18
19
|
import mlrun.common.schemas
|
|
19
20
|
import mlrun.db
|
|
20
21
|
import mlrun.errors
|
|
21
22
|
|
|
22
|
-
from ..kfpops import build_op
|
|
23
23
|
from ..model import RunObject
|
|
24
|
-
from ..utils import get_in, logger
|
|
25
24
|
from .pod import KubeResource
|
|
26
25
|
|
|
27
26
|
|
|
@@ -65,29 +64,13 @@ class KubejobRuntime(KubeResource):
|
|
|
65
64
|
:param pull_at_runtime: load the archive into the container at job runtime vs on build/deploy
|
|
66
65
|
:param target_dir: target dir on runtime pod or repo clone / archive extraction
|
|
67
66
|
"""
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
if target_dir:
|
|
76
|
-
self.spec.build.source_code_target_dir = target_dir
|
|
77
|
-
|
|
78
|
-
self.spec.build.load_source_on_run = pull_at_runtime
|
|
79
|
-
if (
|
|
80
|
-
self.spec.build.base_image
|
|
81
|
-
and not self.spec.build.commands
|
|
82
|
-
and pull_at_runtime
|
|
83
|
-
and not self.spec.image
|
|
84
|
-
):
|
|
85
|
-
# if we load source from repo and don't need a full build use the base_image as the image
|
|
86
|
-
self.spec.image = self.spec.build.base_image
|
|
87
|
-
elif not pull_at_runtime:
|
|
88
|
-
# clear the image so build will not be skipped
|
|
89
|
-
self.spec.build.base_image = self.spec.build.base_image or self.spec.image
|
|
90
|
-
self.spec.image = ""
|
|
67
|
+
self._configure_mlrun_build_with_source(
|
|
68
|
+
source=source,
|
|
69
|
+
workdir=workdir,
|
|
70
|
+
handler=handler,
|
|
71
|
+
pull_at_runtime=pull_at_runtime,
|
|
72
|
+
target_dir=target_dir,
|
|
73
|
+
)
|
|
91
74
|
|
|
92
75
|
def build_config(
|
|
93
76
|
self,
|
|
@@ -160,125 +143,48 @@ class KubejobRuntime(KubeResource):
|
|
|
160
143
|
|
|
161
144
|
def deploy(
|
|
162
145
|
self,
|
|
163
|
-
watch=True,
|
|
164
|
-
with_mlrun=None,
|
|
165
|
-
skip_deployed=False,
|
|
166
|
-
is_kfp=False,
|
|
167
|
-
mlrun_version_specifier=None,
|
|
146
|
+
watch: bool = True,
|
|
147
|
+
with_mlrun: typing.Optional[bool] = None,
|
|
148
|
+
skip_deployed: bool = False,
|
|
149
|
+
is_kfp: bool = False,
|
|
150
|
+
mlrun_version_specifier: typing.Optional[bool] = None,
|
|
168
151
|
builder_env: dict = None,
|
|
169
152
|
show_on_failure: bool = False,
|
|
170
153
|
force_build: bool = False,
|
|
171
154
|
) -> bool:
|
|
172
|
-
"""
|
|
155
|
+
"""Deploy function, build container with dependencies
|
|
173
156
|
|
|
174
|
-
:param watch:
|
|
175
|
-
:param with_mlrun:
|
|
176
|
-
:param skip_deployed:
|
|
177
|
-
:param is_kfp:
|
|
178
|
-
:param mlrun_version_specifier:
|
|
157
|
+
:param watch: Wait for the deploy to complete (and print build logs)
|
|
158
|
+
:param with_mlrun: Add the current mlrun package to the container build
|
|
159
|
+
:param skip_deployed: Skip the build if we already have an image for the function
|
|
160
|
+
:param is_kfp: Deploy as part of a kfp pipeline
|
|
161
|
+
:param mlrun_version_specifier: Which mlrun package version to include (if not current)
|
|
179
162
|
:param builder_env: Kaniko builder pod env vars dict (for config/credentials)
|
|
180
163
|
e.g. builder_env={"GIT_TOKEN": token}
|
|
181
|
-
:param show_on_failure:
|
|
182
|
-
:param force_build:
|
|
164
|
+
:param show_on_failure: Show logs only in case of build failure
|
|
165
|
+
:param force_build: Set True for force building the image, even when no changes were made
|
|
183
166
|
|
|
184
167
|
:return: True if the function is ready (deployed)
|
|
185
168
|
"""
|
|
186
169
|
|
|
187
170
|
build = self.spec.build
|
|
171
|
+
with_mlrun = self._resolve_build_with_mlrun(with_mlrun)
|
|
188
172
|
|
|
189
|
-
if with_mlrun is None:
|
|
190
|
-
if build.with_mlrun is not None:
|
|
191
|
-
with_mlrun = build.with_mlrun
|
|
192
|
-
else:
|
|
193
|
-
with_mlrun = build.base_image and not (
|
|
194
|
-
build.base_image.startswith("mlrun/")
|
|
195
|
-
or "/mlrun/" in build.base_image
|
|
196
|
-
)
|
|
197
|
-
|
|
198
|
-
if (
|
|
199
|
-
not build.source
|
|
200
|
-
and not build.commands
|
|
201
|
-
and not build.requirements
|
|
202
|
-
and not build.extra
|
|
203
|
-
and with_mlrun
|
|
204
|
-
):
|
|
205
|
-
logger.info(
|
|
206
|
-
"Running build to add mlrun package, set "
|
|
207
|
-
"with_mlrun=False to skip if its already in the image"
|
|
208
|
-
)
|
|
209
173
|
self.status.state = ""
|
|
210
174
|
if build.base_image:
|
|
211
175
|
# clear the image so build will not be skipped
|
|
212
176
|
self.spec.image = ""
|
|
213
177
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
with_mlrun,
|
|
225
|
-
mlrun_version_specifier,
|
|
226
|
-
skip_deployed,
|
|
227
|
-
builder_env=builder_env,
|
|
228
|
-
force_build=force_build,
|
|
229
|
-
)
|
|
230
|
-
self.status = data["data"].get("status", None)
|
|
231
|
-
self.spec.image = get_in(data, "data.spec.image")
|
|
232
|
-
self.spec.build.base_image = self.spec.build.base_image or get_in(
|
|
233
|
-
data, "data.spec.build.base_image"
|
|
234
|
-
)
|
|
235
|
-
# Get the source target dir in case it was enriched due to loading source
|
|
236
|
-
self.spec.build.source_code_target_dir = get_in(
|
|
237
|
-
data, "data.spec.build.source_code_target_dir"
|
|
238
|
-
) or get_in(data, "data.spec.clone_target_dir")
|
|
239
|
-
ready = data.get("ready", False)
|
|
240
|
-
if not ready:
|
|
241
|
-
logger.info(
|
|
242
|
-
f"Started building image: {data.get('data', {}).get('spec', {}).get('build', {}).get('image')}"
|
|
243
|
-
)
|
|
244
|
-
if watch and not ready:
|
|
245
|
-
state = self._build_watch(watch, show_on_failure=show_on_failure)
|
|
246
|
-
ready = state == "ready"
|
|
247
|
-
self.status.state = state
|
|
248
|
-
|
|
249
|
-
if watch and not ready:
|
|
250
|
-
raise mlrun.errors.MLRunRuntimeError("Deploy failed")
|
|
251
|
-
return ready
|
|
252
|
-
|
|
253
|
-
def _build_watch(self, watch=True, logs=True, show_on_failure=False):
|
|
254
|
-
db = self._get_db()
|
|
255
|
-
offset = 0
|
|
256
|
-
try:
|
|
257
|
-
text, _ = db.get_builder_status(self, 0, logs=logs)
|
|
258
|
-
except mlrun.db.RunDBError:
|
|
259
|
-
raise ValueError("function or build process not found")
|
|
260
|
-
|
|
261
|
-
def print_log(text):
|
|
262
|
-
if text and (not show_on_failure or self.status.state == "error"):
|
|
263
|
-
print(text, end="")
|
|
264
|
-
|
|
265
|
-
print_log(text)
|
|
266
|
-
offset += len(text)
|
|
267
|
-
if watch:
|
|
268
|
-
while self.status.state in ["pending", "running"]:
|
|
269
|
-
time.sleep(2)
|
|
270
|
-
if show_on_failure:
|
|
271
|
-
text = ""
|
|
272
|
-
db.get_builder_status(self, 0, logs=False)
|
|
273
|
-
if self.status.state == "error":
|
|
274
|
-
# re-read the full log on failure
|
|
275
|
-
text, _ = db.get_builder_status(self, offset, logs=logs)
|
|
276
|
-
else:
|
|
277
|
-
text, _ = db.get_builder_status(self, offset, logs=logs)
|
|
278
|
-
print_log(text)
|
|
279
|
-
offset += len(text)
|
|
280
|
-
|
|
281
|
-
return self.status.state
|
|
178
|
+
return self._build_image(
|
|
179
|
+
builder_env=builder_env,
|
|
180
|
+
force_build=force_build,
|
|
181
|
+
mlrun_version_specifier=mlrun_version_specifier,
|
|
182
|
+
show_on_failure=show_on_failure,
|
|
183
|
+
skip_deployed=skip_deployed,
|
|
184
|
+
watch=watch,
|
|
185
|
+
is_kfp=is_kfp,
|
|
186
|
+
with_mlrun=with_mlrun,
|
|
187
|
+
)
|
|
282
188
|
|
|
283
189
|
def deploy_step(
|
|
284
190
|
self,
|
mlrun/runtimes/local.py
CHANGED
|
@@ -33,6 +33,7 @@ from sys import executable
|
|
|
33
33
|
from nuclio import Event
|
|
34
34
|
|
|
35
35
|
import mlrun
|
|
36
|
+
import mlrun.common.constants as mlrun_constants
|
|
36
37
|
from mlrun.lists import RunList
|
|
37
38
|
|
|
38
39
|
from ..errors import err_to_str
|
|
@@ -57,7 +58,9 @@ class ParallelRunner:
|
|
|
57
58
|
|
|
58
59
|
return TrackerManager()
|
|
59
60
|
|
|
60
|
-
def _get_handler(
|
|
61
|
+
def _get_handler(
|
|
62
|
+
self, handler: str, context: MLClientCtx, embed_in_sys: bool = True
|
|
63
|
+
):
|
|
61
64
|
return handler
|
|
62
65
|
|
|
63
66
|
def _get_dask_client(self, options):
|
|
@@ -85,7 +88,7 @@ class ParallelRunner:
|
|
|
85
88
|
handler = runobj.spec.handler
|
|
86
89
|
self._force_handler(handler)
|
|
87
90
|
set_paths(self.spec.pythonpath)
|
|
88
|
-
handler = self._get_handler(handler, execution)
|
|
91
|
+
handler = self._get_handler(handler, execution, embed_in_sys=False)
|
|
89
92
|
|
|
90
93
|
client, function_name = self._get_dask_client(generator.options)
|
|
91
94
|
parallel_runs = generator.options.parallel_runs or 4
|
|
@@ -142,7 +145,10 @@ class ParallelRunner:
|
|
|
142
145
|
if function_name and generator.options.teardown_dask:
|
|
143
146
|
logger.info("Tearing down the dask cluster..")
|
|
144
147
|
mlrun.get_run_db().delete_runtime_resources(
|
|
145
|
-
|
|
148
|
+
project=self.metadata.project,
|
|
149
|
+
kind=mlrun.runtimes.RuntimeKinds.dask,
|
|
150
|
+
object_id=function_name,
|
|
151
|
+
force=True,
|
|
146
152
|
)
|
|
147
153
|
|
|
148
154
|
return results
|
|
@@ -223,12 +229,14 @@ class LocalRuntime(BaseRuntime, ParallelRunner):
|
|
|
223
229
|
def is_deployed(self):
|
|
224
230
|
return True
|
|
225
231
|
|
|
226
|
-
def _get_handler(
|
|
232
|
+
def _get_handler(
|
|
233
|
+
self, handler: str, context: MLClientCtx, embed_in_sys: bool = True
|
|
234
|
+
):
|
|
227
235
|
command = self.spec.command
|
|
228
236
|
if not command and self.spec.build.functionSourceCode:
|
|
229
237
|
# if the code is embedded in the function object extract or find it
|
|
230
238
|
command, _ = mlrun.run.load_func_code(self)
|
|
231
|
-
return load_module(command, handler, context)
|
|
239
|
+
return load_module(command, handler, context, embed_in_sys=embed_in_sys)
|
|
232
240
|
|
|
233
241
|
def _pre_run(self, runobj: RunObject, execution: MLClientCtx):
|
|
234
242
|
workdir = self.spec.workdir
|
|
@@ -257,7 +265,8 @@ class LocalRuntime(BaseRuntime, ParallelRunner):
|
|
|
257
265
|
set_paths(os.path.realpath("."))
|
|
258
266
|
|
|
259
267
|
if (
|
|
260
|
-
runobj.metadata.labels.get(
|
|
268
|
+
runobj.metadata.labels.get(mlrun_constants.MLRunInternalLabels.kind)
|
|
269
|
+
== RemoteSparkRuntime.kind
|
|
261
270
|
and environ["MLRUN_SPARK_CLIENT_IGZ_SPARK"] == "true"
|
|
262
271
|
):
|
|
263
272
|
from mlrun.runtimes.remotesparkjob import igz_spark_pre_hook
|
|
@@ -370,8 +379,20 @@ class LocalRuntime(BaseRuntime, ParallelRunner):
|
|
|
370
379
|
return run_obj_dict
|
|
371
380
|
|
|
372
381
|
|
|
373
|
-
def load_module(
|
|
374
|
-
|
|
382
|
+
def load_module(
|
|
383
|
+
file_name: str,
|
|
384
|
+
handler: str,
|
|
385
|
+
context: MLClientCtx,
|
|
386
|
+
embed_in_sys: bool = True,
|
|
387
|
+
):
|
|
388
|
+
"""
|
|
389
|
+
Load module from filename
|
|
390
|
+
:param file_name: The module path to load
|
|
391
|
+
:param handler: The callable to load
|
|
392
|
+
:param context: Execution context
|
|
393
|
+
:param embed_in_sys: Embed the file-named module in sys.modules. This is not persistent with remote
|
|
394
|
+
environments and therefore can effect pickling.
|
|
395
|
+
"""
|
|
375
396
|
module = None
|
|
376
397
|
if file_name:
|
|
377
398
|
path = Path(file_name)
|
|
@@ -382,13 +403,21 @@ def load_module(file_name, handler, context):
|
|
|
382
403
|
if spec is None:
|
|
383
404
|
raise RunError(f"Cannot import from {file_name!r}")
|
|
384
405
|
module = imputil.module_from_spec(spec)
|
|
406
|
+
if embed_in_sys:
|
|
407
|
+
sys.modules[mod_name] = module
|
|
385
408
|
spec.loader.exec_module(module)
|
|
386
409
|
|
|
387
410
|
class_args = {}
|
|
388
411
|
if context:
|
|
389
412
|
class_args = copy(context._parameters.get("_init_args", {}))
|
|
390
413
|
|
|
391
|
-
return get_handler_extended(
|
|
414
|
+
return get_handler_extended(
|
|
415
|
+
handler,
|
|
416
|
+
context,
|
|
417
|
+
class_args,
|
|
418
|
+
namespaces=module,
|
|
419
|
+
reload_modules=context._reset_on_run,
|
|
420
|
+
)
|
|
392
421
|
|
|
393
422
|
|
|
394
423
|
def run_exec(cmd, args, env=None, cwd=None):
|
|
@@ -436,7 +465,7 @@ def run_exec(cmd, args, env=None, cwd=None):
|
|
|
436
465
|
return out, err
|
|
437
466
|
|
|
438
467
|
|
|
439
|
-
class _DupStdout
|
|
468
|
+
class _DupStdout:
|
|
440
469
|
def __init__(self):
|
|
441
470
|
self.terminal = sys.stdout
|
|
442
471
|
self.buf = StringIO()
|
|
@@ -493,7 +522,7 @@ def exec_from_params(handler, runobj: RunObject, context: MLClientCtx, cwd=None)
|
|
|
493
522
|
logger.warning("Run was aborted", err=err_to_str(exc))
|
|
494
523
|
# Run was aborted, the state run state is updated by the abort job, no need to commit again
|
|
495
524
|
context.set_state(
|
|
496
|
-
mlrun.runtimes.constants.RunStates.aborted, commit=False
|
|
525
|
+
mlrun.common.runtimes.constants.RunStates.aborted, commit=False
|
|
497
526
|
)
|
|
498
527
|
commit = False
|
|
499
528
|
except Exception as exc:
|
|
@@ -21,28 +21,8 @@ from mlrun.config import config
|
|
|
21
21
|
from .. import MPIJobCRDVersions
|
|
22
22
|
from .abstract import AbstractMPIJobRuntime
|
|
23
23
|
from .v1 import MpiRuntimeV1
|
|
24
|
-
from .v1alpha1 import MpiRuntimeV1Alpha1
|
|
25
24
|
|
|
26
25
|
|
|
27
26
|
def _resolve_mpijob_crd_version():
|
|
28
27
|
# config is expected to get enriched from the API through the client-spec
|
|
29
28
|
return config.mpijob_crd_version or MPIJobCRDVersions.default()
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
class MpiRuntimeContainer(containers.DeclarativeContainer):
|
|
33
|
-
resolver = providers.Callable(
|
|
34
|
-
_resolve_mpijob_crd_version,
|
|
35
|
-
)
|
|
36
|
-
|
|
37
|
-
selector = providers.Selector(
|
|
38
|
-
resolver,
|
|
39
|
-
v1=providers.Object(MpiRuntimeV1),
|
|
40
|
-
v1alpha1=providers.Object(MpiRuntimeV1Alpha1),
|
|
41
|
-
)
|
|
42
|
-
|
|
43
|
-
# An empty selector to be overriden by the API
|
|
44
|
-
handler_selector = providers.Selector(
|
|
45
|
-
resolver,
|
|
46
|
-
v1=providers.Object(None),
|
|
47
|
-
v1alpha1=providers.Object(None),
|
|
48
|
-
)
|
|
@@ -13,7 +13,6 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import abc
|
|
15
15
|
import os
|
|
16
|
-
import typing
|
|
17
16
|
|
|
18
17
|
from mlrun.config import config
|
|
19
18
|
from mlrun.runtimes.kubejob import KubejobRuntime
|
|
@@ -206,7 +205,7 @@ class AbstractMPIJobRuntime(KubejobRuntime, abc.ABC):
|
|
|
206
205
|
|
|
207
206
|
self.set_envs(horovod_autotune_settings)
|
|
208
207
|
|
|
209
|
-
def set_mpi_args(self, args:
|
|
208
|
+
def set_mpi_args(self, args: list[str]) -> None:
|
|
210
209
|
"""Sets the runtime's mpi arguments to args.
|
|
211
210
|
|
|
212
211
|
Parameters
|
|
@@ -224,14 +223,14 @@ class AbstractMPIJobRuntime(KubejobRuntime, abc.ABC):
|
|
|
224
223
|
```
|
|
225
224
|
# Define the wanted MPI arguments
|
|
226
225
|
mpi_args = []
|
|
227
|
-
mpi_args.append(
|
|
228
|
-
mpi_args.append(
|
|
229
|
-
mpi_args.append(
|
|
230
|
-
mpi_args.append(
|
|
231
|
-
mpi_args.append(
|
|
232
|
-
mpi_args.append(
|
|
233
|
-
mpi_args.append(
|
|
234
|
-
mpi_args.append(
|
|
226
|
+
mpi_args.append("-x")
|
|
227
|
+
mpi_args.append("NCCL_DEBUG=INFO")
|
|
228
|
+
mpi_args.append("-x")
|
|
229
|
+
mpi_args.append("NCCL_SOCKET_NTHREADS=2")
|
|
230
|
+
mpi_args.append("-x")
|
|
231
|
+
mpi_args.append("NCCL_NSOCKS_PERTHREAD=8")
|
|
232
|
+
mpi_args.append("-x")
|
|
233
|
+
mpi_args.append("NCCL_MIN_NCHANNELS=4")
|
|
235
234
|
|
|
236
235
|
# Set the MPI arguments in the function
|
|
237
236
|
fn.set_mpi_args(mpi_args)
|
mlrun/runtimes/mpijob/v1.py
CHANGED
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
from mlrun.runtimes.constants import MPIJobCRDVersions, MPIJobV1CleanPodPolicies
|
|
14
|
+
from mlrun.common.runtimes.constants import MPIJobCRDVersions, MPIJobV1CleanPodPolicies
|
|
15
15
|
from mlrun.runtimes.mpijob.abstract import AbstractMPIJobRuntime, MPIResourceSpec
|
|
16
16
|
|
|
17
17
|
|
|
@@ -12,12 +12,10 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from
|
|
16
|
-
|
|
17
|
-
from .
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class ModelEndpointsTable(Base, ModelEndpointsBaseTable):
|
|
23
|
-
pass
|
|
15
|
+
from .serving import ServingRuntime, new_v2_model_server # noqa
|
|
16
|
+
from .nuclio import nuclio_init_hook # noqa
|
|
17
|
+
from .function import (
|
|
18
|
+
min_nuclio_versions,
|
|
19
|
+
RemoteRuntime,
|
|
20
|
+
) # noqa
|
|
21
|
+
from .api_gateway import APIGateway
|