mlrun 1.7.0rc7__py3-none-any.whl → 1.7.0rc11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__init__.py +1 -0
- mlrun/__main__.py +2 -0
- mlrun/artifacts/model.py +29 -25
- mlrun/common/schemas/__init__.py +4 -0
- mlrun/common/schemas/alert.py +122 -0
- mlrun/common/schemas/api_gateway.py +8 -1
- mlrun/common/schemas/auth.py +4 -0
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/constants.py +4 -2
- mlrun/{datastore/helpers.py → common/schemas/pagination.py} +11 -3
- mlrun/common/schemas/project.py +15 -10
- mlrun/config.py +35 -13
- mlrun/datastore/__init__.py +3 -7
- mlrun/datastore/base.py +6 -5
- mlrun/datastore/datastore_profile.py +19 -1
- mlrun/datastore/snowflake_utils.py +43 -0
- mlrun/datastore/sources.py +18 -30
- mlrun/datastore/targets.py +140 -12
- mlrun/datastore/utils.py +10 -5
- mlrun/datastore/v3io.py +27 -50
- mlrun/db/base.py +88 -2
- mlrun/db/httpdb.py +314 -41
- mlrun/db/nopdb.py +142 -0
- mlrun/execution.py +21 -14
- mlrun/feature_store/api.py +9 -5
- mlrun/feature_store/feature_set.py +39 -23
- mlrun/feature_store/feature_vector.py +2 -1
- mlrun/feature_store/retrieval/spark_merger.py +27 -23
- mlrun/feature_store/steps.py +30 -19
- mlrun/features.py +4 -13
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/tf_keras/__init__.py +1 -1
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/kfpops.py +2 -5
- mlrun/launcher/base.py +1 -1
- mlrun/launcher/client.py +2 -2
- mlrun/model.py +2 -2
- mlrun/model_monitoring/application.py +11 -2
- mlrun/model_monitoring/applications/histogram_data_drift.py +3 -3
- mlrun/model_monitoring/controller.py +2 -3
- mlrun/model_monitoring/helpers.py +3 -1
- mlrun/model_monitoring/stream_processing.py +0 -1
- mlrun/model_monitoring/writer.py +32 -0
- mlrun/package/packagers_manager.py +1 -0
- mlrun/platforms/__init__.py +1 -1
- mlrun/platforms/other.py +1 -1
- mlrun/projects/operations.py +11 -4
- mlrun/projects/pipelines.py +1 -1
- mlrun/projects/project.py +180 -73
- mlrun/run.py +77 -41
- mlrun/runtimes/__init__.py +16 -0
- mlrun/runtimes/base.py +4 -1
- mlrun/runtimes/kubejob.py +26 -121
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/nuclio/api_gateway.py +58 -8
- mlrun/runtimes/nuclio/application/application.py +79 -1
- mlrun/runtimes/nuclio/application/reverse_proxy.go +9 -1
- mlrun/runtimes/nuclio/function.py +20 -13
- mlrun/runtimes/nuclio/serving.py +11 -10
- mlrun/runtimes/pod.py +148 -3
- mlrun/runtimes/utils.py +0 -28
- mlrun/secrets.py +6 -2
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +7 -4
- mlrun/serving/server.py +1 -1
- mlrun/serving/states.py +14 -38
- mlrun/serving/v2_serving.py +8 -7
- mlrun/utils/helpers.py +1 -1
- mlrun/utils/http.py +1 -1
- mlrun/utils/notifications/notification/base.py +12 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +3 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +41 -13
- mlrun/utils/notifications/notification/webhook.py +11 -1
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/METADATA +15 -15
- {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/RECORD +91 -89
- {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/top_level.txt +0 -0
mlrun/run.py
CHANGED
@@ -114,16 +114,18 @@ def function_to_module(code="", workdir=None, secrets=None, silent=False):
 
     example::
 
-        mod = mlrun.function_to_module(
-        task = mlrun.new_task(inputs={
-        context = mlrun.get_or_create_ctx(
-        mod.my_job(context, p1=1, p2=
+        mod = mlrun.function_to_module("./examples/training.py")
+        task = mlrun.new_task(inputs={"infile.txt": "../examples/infile.txt"})
+        context = mlrun.get_or_create_ctx("myfunc", spec=task)
+        mod.my_job(context, p1=1, p2="x")
         print(context.to_yaml())
 
-        fn = mlrun.import_function(
+        fn = mlrun.import_function("hub://open-archive")
         mod = mlrun.function_to_module(fn)
-        data = mlrun.run.get_dataitem(
-
+        data = mlrun.run.get_dataitem(
+            "https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz"
+        )
+        context = mlrun.get_or_create_ctx("myfunc")
         mod.open_archive(context, archive_url=data)
         print(context.to_yaml())
 
@@ -256,29 +258,31 @@ def get_or_create_ctx(
     Examples::
 
         # load MLRUN runtime context (will be set by the runtime framework e.g. KubeFlow)
-        context = get_or_create_ctx(
+        context = get_or_create_ctx("train")
 
         # get parameters from the runtime context (or use defaults)
-        p1 = context.get_param(
-        p2 = context.get_param(
+        p1 = context.get_param("p1", 1)
+        p2 = context.get_param("p2", "a-string")
 
         # access input metadata, values, files, and secrets (passwords)
-        print(f
-        print(f
+        print(f"Run: {context.name} (uid={context.uid})")
+        print(f"Params: p1={p1}, p2={p2}")
         print(f'accesskey = {context.get_secret("ACCESS_KEY")}')
-        input_str = context.get_input(
-        print(f
+        input_str = context.get_input("infile.txt").get()
+        print(f"file: {input_str}")
 
         # RUN some useful code e.g. ML training, data prep, etc.
 
         # log scalar result values (job result metrics)
-        context.log_result(
-        context.log_result(
-        context.set_label(
+        context.log_result("accuracy", p1 * 2)
+        context.log_result("loss", p1 * 3)
+        context.set_label("framework", "sklearn")
 
         # log various types of artifacts (file, web page, table), will be versioned and visible in the UI
-        context.log_artifact(
-
+        context.log_artifact(
+            "model.txt", body=b"abc is 123", labels={"framework": "xgboost"}
+        )
+        context.log_artifact("results.html", body=b"<b> Some HTML <b>", viewer="web-app")
 
     """
 
@@ -348,7 +352,9 @@ def import_function(url="", secrets=None, db="", project=None, new_name=None):
 
         function = mlrun.import_function("hub://auto-trainer")
         function = mlrun.import_function("./func.yaml")
-        function = mlrun.import_function(
+        function = mlrun.import_function(
+            "https://raw.githubusercontent.com/org/repo/func.yaml"
+        )
 
     :param url: path/url to Function Hub, db or function YAML file
     :param secrets: optional, credentials dict for DB or URL (s3, v3io, ...)
@@ -389,6 +395,8 @@ def import_function_to_dict(url, secrets=None):
     code = get_in(runtime, "spec.build.functionSourceCode")
     update_in(runtime, "metadata.build.code_origin", url)
     cmd = code_file = get_in(runtime, "spec.command", "")
+    # use kind = "job" by default if not specified
+    runtime.setdefault("kind", "job")
     if " " in cmd:
         code_file = cmd[: cmd.find(" ")]
     if runtime["kind"] in ["", "local"]:
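Note: with the new `runtime.setdefault("kind", "job")`, a function YAML that omits `kind` now imports as a job runtime. A minimal sketch of the effect (the YAML path and its missing `kind:` field are assumptions for illustration, not taken from this diff):

    import mlrun

    # assumes ./func.yaml does not declare a `kind:` field
    fn = mlrun.import_function("./func.yaml")
    print(fn.kind)  # expected to be "job" after this change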
@@ -445,12 +453,18 @@ def new_function(
     Example::
 
         # define a container based function (the `training.py` must exist in the container workdir)
-        f = new_function(
+        f = new_function(
+            command="training.py -x {x}", image="myrepo/image:latest", kind="job"
+        )
         f.run(params={"x": 5})
 
         # define a container based function which reads its source from a git archive
-        f = new_function(
-
+        f = new_function(
+            command="training.py -x {x}",
+            image="myrepo/image:latest",
+            kind="job",
+            source="git://github.com/mlrun/something.git",
+        )
         f.run(params={"x": 5})
 
         # define a local handler function (execute a local function handler)
@@ -535,7 +549,7 @@ def new_function(
     if source:
         runner.spec.build.source = source
     if handler:
-        if kind in
+        if kind in RuntimeKinds.handlerless_runtimes():
             raise MLRunInvalidArgumentError(
                 f"Handler is not supported for {kind} runtime"
             )
@@ -628,6 +642,8 @@ def code_to_function(
         - mpijob: run distributed Horovod jobs over the MPI job operator
         - spark: run distributed Spark job using Spark Kubernetes Operator
         - remote-spark: run distributed Spark job on remote Spark service
+        - databricks: run code on Databricks cluster (python scripts, Spark etc.)
+        - application: run a long living application (e.g. a web server, UI, etc.)
 
     Learn more about [Kinds of function (runtimes)](../concepts/functions-overview.html).
 
@@ -661,11 +677,15 @@ def code_to_function(
         import mlrun
 
         # create job function object from notebook code and add doc/metadata
-        fn = mlrun.code_to_function(
-
-
-
-
+        fn = mlrun.code_to_function(
+            "file_utils",
+            kind="job",
+            handler="open_archive",
+            image="mlrun/mlrun",
+            description="this function opens a zip archive into a local/mounted folder",
+            categories=["fileutils"],
+            labels={"author": "me"},
+        )
 
     example::
 
@@ -676,11 +696,15 @@ def code_to_function(
         Path("mover.py").touch()
 
         # create nuclio function object from python module call mover.py
-        fn = mlrun.code_to_function(
-
-
-
-
+        fn = mlrun.code_to_function(
+            "nuclio-mover",
+            kind="nuclio",
+            filename="mover.py",
+            image="python:3.7",
+            description="this function moves files from one system to another",
+            requirements=["pandas"],
+            labels={"author": "me"},
+        )
 
     """
     filebase, _ = path.splitext(path.basename(filename))
@@ -1094,13 +1118,25 @@ def wait_for_runs_completion(
     example::
 
         # run two training functions in parallel and wait for the results
-        inputs = {
-        run1 = train.run(
-
-
-
-
-
+        inputs = {"dataset": cleaned_data}
+        run1 = train.run(
+            name="train_lr",
+            inputs=inputs,
+            watch=False,
+            params={
+                "model_pkg_class": "sklearn.linear_model.LogisticRegression",
+                "label_column": "label",
+            },
+        )
+        run2 = train.run(
+            name="train_lr",
+            inputs=inputs,
+            watch=False,
+            params={
+                "model_pkg_class": "sklearn.ensemble.RandomForestClassifier",
+                "label_column": "label",
+            },
+        )
         completed = wait_for_runs_completion([run1, run2])
 
     :param runs: list of run objects (the returned values of function.run())
mlrun/runtimes/__init__.py
CHANGED
@@ -154,6 +154,22 @@ class RuntimeKinds:
         RuntimeKinds.application,
     ]
 
+    @staticmethod
+    def pure_nuclio_deployed_runtimes():
+        return [
+            RuntimeKinds.remote,
+            RuntimeKinds.nuclio,
+            RuntimeKinds.serving,
+        ]
+
+    @staticmethod
+    def handlerless_runtimes():
+        return [
+            RuntimeKinds.serving,
+            # Application runtime handler is internal reverse proxy
+            RuntimeKinds.application,
+        ]
+
     @staticmethod
     def local_runtimes():
         return [
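Note: `handlerless_runtimes()` backs the stricter handler check added in `mlrun/run.py::new_function` above. A minimal sketch of how the helper is meant to be consumed (valid only on versions where it exists, i.e. 1.7.0rc11+):

    import mlrun.runtimes

    kind = "serving"
    if kind in mlrun.runtimes.RuntimeKinds.handlerless_runtimes():
        # serving and application functions route requests internally,
        # so new_function(..., handler=...) raises MLRunInvalidArgumentError for them
        print(f"kind {kind!r} does not accept a handler")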
mlrun/runtimes/base.py
CHANGED
@@ -23,6 +23,7 @@ from typing import Callable, Optional, Union
 import requests.exceptions
 from nuclio.build import mlrun_footer
 
+import mlrun.common.constants
 import mlrun.common.schemas
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.db
@@ -634,7 +635,9 @@ class BaseRuntime(ModelObj):
         image = image or self.spec.image or ""
 
         image = enrich_image_url(image, client_version, client_python_version)
-        if not image.startswith(
+        if not image.startswith(
+            mlrun.common.constants.IMAGE_NAME_ENRICH_REGISTRY_PREFIX
+        ):
             return image
         registry, repository = get_parsed_docker_registry()
         if registry:
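Note: the hard-coded prefix check is replaced by the shared `IMAGE_NAME_ENRICH_REGISTRY_PREFIX` constant. A hedged sketch of the behaviour it gates, assuming the prefix is the usual `.` marker that asks MLRun to prepend the configured default registry (the constant's value is not shown in this diff):

    import mlrun.common.constants

    def needs_registry_enrichment(image: str) -> bool:
        # e.g. ".my-image:latest"            -> assumed True  (default registry gets prepended)
        #      "quay.io/org/my-image:latest" -> False (returned as-is)
        return image.startswith(mlrun.common.constants.IMAGE_NAME_ENRICH_REGISTRY_PREFIX)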
mlrun/runtimes/kubejob.py
CHANGED
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import time
 import warnings
 
 import mlrun.common.schemas
@@ -21,7 +20,6 @@ import mlrun.errors
 
 from ..kfpops import build_op
 from ..model import RunObject
-from ..utils import get_in, logger
 from .pod import KubeResource
 
 
@@ -65,29 +63,13 @@ class KubejobRuntime(KubeResource):
         :param pull_at_runtime: load the archive into the container at job runtime vs on build/deploy
         :param target_dir: target dir on runtime pod or repo clone / archive extraction
         """
-
-
-
-
-
-
-
-        if target_dir:
-            self.spec.build.source_code_target_dir = target_dir
-
-        self.spec.build.load_source_on_run = pull_at_runtime
-        if (
-            self.spec.build.base_image
-            and not self.spec.build.commands
-            and pull_at_runtime
-            and not self.spec.image
-        ):
-            # if we load source from repo and don't need a full build use the base_image as the image
-            self.spec.image = self.spec.build.base_image
-        elif not pull_at_runtime:
-            # clear the image so build will not be skipped
-            self.spec.build.base_image = self.spec.build.base_image or self.spec.image
-            self.spec.image = ""
+        self._configure_mlrun_build_with_source(
+            source=source,
+            workdir=workdir,
+            handler=handler,
+            pull_at_runtime=pull_at_runtime,
+            target_dir=target_dir,
+        )
 
     def build_config(
         self,
@@ -169,116 +151,39 @@ class KubejobRuntime(KubeResource):
         show_on_failure: bool = False,
         force_build: bool = False,
     ) -> bool:
-        """
+        """Deploy function, build container with dependencies
 
-        :param watch:
-        :param with_mlrun:
-        :param skip_deployed:
-        :param is_kfp:
-        :param mlrun_version_specifier:
+        :param watch: Wait for the deploy to complete (and print build logs)
+        :param with_mlrun: Add the current mlrun package to the container build
+        :param skip_deployed: Skip the build if we already have an image for the function
+        :param is_kfp: Deploy as part of a kfp pipeline
+        :param mlrun_version_specifier: Which mlrun package version to include (if not current)
         :param builder_env: Kaniko builder pod env vars dict (for config/credentials)
             e.g. builder_env={"GIT_TOKEN": token}
-        :param show_on_failure:
-        :param force_build:
+        :param show_on_failure: Show logs only in case of build failure
+        :param force_build: Set True for force building the image, even when no changes were made
 
         :return: True if the function is ready (deployed)
         """
 
         build = self.spec.build
+        with_mlrun = self._resolve_build_with_mlrun(with_mlrun)
 
-        if with_mlrun is None:
-            if build.with_mlrun is not None:
-                with_mlrun = build.with_mlrun
-            else:
-                with_mlrun = build.base_image and not (
-                    build.base_image.startswith("mlrun/")
-                    or "/mlrun/" in build.base_image
-                )
-
-        if (
-            not build.source
-            and not build.commands
-            and not build.requirements
-            and not build.extra
-            and with_mlrun
-        ):
-            logger.info(
-                "Running build to add mlrun package, set "
-                "with_mlrun=False to skip if its already in the image"
-            )
         self.status.state = ""
         if build.base_image:
            # clear the image so build will not be skipped
            self.spec.image = ""
 
-
-
-
-
-
-
-
-
-
-
-            with_mlrun,
-            mlrun_version_specifier,
-            skip_deployed,
-            builder_env=builder_env,
-            force_build=force_build,
-        )
-        self.status = data["data"].get("status", None)
-        self.spec.image = get_in(data, "data.spec.image")
-        self.spec.build.base_image = self.spec.build.base_image or get_in(
-            data, "data.spec.build.base_image"
-        )
-        # Get the source target dir in case it was enriched due to loading source
-        self.spec.build.source_code_target_dir = get_in(
-            data, "data.spec.build.source_code_target_dir"
-        ) or get_in(data, "data.spec.clone_target_dir")
-        ready = data.get("ready", False)
-        if not ready:
-            logger.info(
-                f"Started building image: {data.get('data', {}).get('spec', {}).get('build', {}).get('image')}"
-            )
-        if watch and not ready:
-            state = self._build_watch(watch, show_on_failure=show_on_failure)
-            ready = state == "ready"
-            self.status.state = state
-
-        if watch and not ready:
-            raise mlrun.errors.MLRunRuntimeError("Deploy failed")
-        return ready
-
-    def _build_watch(self, watch=True, logs=True, show_on_failure=False):
-        db = self._get_db()
-        offset = 0
-        try:
-            text, _ = db.get_builder_status(self, 0, logs=logs)
-        except mlrun.db.RunDBError:
-            raise ValueError("function or build process not found")
-
-        def print_log(text):
-            if text and (not show_on_failure or self.status.state == "error"):
-                print(text, end="")
-
-        print_log(text)
-        offset += len(text)
-        if watch:
-            while self.status.state in ["pending", "running"]:
-                time.sleep(2)
-                if show_on_failure:
-                    text = ""
-                    db.get_builder_status(self, 0, logs=False)
-                    if self.status.state == "error":
-                        # re-read the full log on failure
-                        text, _ = db.get_builder_status(self, offset, logs=logs)
-                else:
-                    text, _ = db.get_builder_status(self, offset, logs=logs)
-                print_log(text)
-                offset += len(text)
-
-        return self.status.state
+        return self._build_image(
+            builder_env=builder_env,
+            force_build=force_build,
+            mlrun_version_specifier=mlrun_version_specifier,
+            show_on_failure=show_on_failure,
+            skip_deployed=skip_deployed,
+            watch=watch,
+            is_kfp=is_kfp,
+            with_mlrun=with_mlrun,
+        )
 
     def deploy_step(
         self,
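Note: `with_source_archive()` and `deploy()` now delegate to shared helpers on the pod runtime base class (`_configure_mlrun_build_with_source`, `_resolve_build_with_mlrun`, `_build_image`); the public API is unchanged. A hedged usage sketch (the repository URL and file names are illustrative, not taken from this diff):

    import mlrun

    fn = mlrun.code_to_function(
        "trainer", filename="train.py", kind="job", image="mlrun/mlrun"
    )
    # load the code from a git archive at runtime instead of baking it into the image
    fn.with_source_archive("git://github.com/org/repo.git", pull_at_runtime=True)
    # same deploy() signature as before; returns True once the image is ready
    ready = fn.deploy(watch=True, with_mlrun=False)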
mlrun/runtimes/mpijob/abstract.py
CHANGED
@@ -223,14 +223,14 @@ class AbstractMPIJobRuntime(KubejobRuntime, abc.ABC):
         ```
         # Define the wanted MPI arguments
         mpi_args = []
-        mpi_args.append(
-        mpi_args.append(
-        mpi_args.append(
-        mpi_args.append(
-        mpi_args.append(
-        mpi_args.append(
-        mpi_args.append(
-        mpi_args.append(
+        mpi_args.append("-x")
+        mpi_args.append("NCCL_DEBUG=INFO")
+        mpi_args.append("-x")
+        mpi_args.append("NCCL_SOCKET_NTHREADS=2")
+        mpi_args.append("-x")
+        mpi_args.append("NCCL_NSOCKS_PERTHREAD=8")
+        mpi_args.append("-x")
+        mpi_args.append("NCCL_MIN_NCHANNELS=4")
 
         # Set the MPI arguments in the function
         fn.set_mpi_args(mpi_args)
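Note on the example above: `-x NAME[=VALUE]` is the standard `mpirun` flag for exporting environment variables to the workers. A hedged sketch of where `set_mpi_args` fits (the function name, image, and command are illustrative):

    import mlrun

    fn = mlrun.new_function(
        "horovod-train", kind="mpijob", image="mlrun/mlrun", command="train.py"
    )
    # forwarded to the launcher's mpirun command line
    fn.set_mpi_args(["-x", "NCCL_DEBUG=INFO", "-x", "NCCL_MIN_NCHANNELS=4"])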
mlrun/runtimes/nuclio/api_gateway.py
CHANGED
@@ -22,7 +22,8 @@ from requests.auth import HTTPBasicAuth
 import mlrun
 import mlrun.common.schemas
 
-from
+from ..utils import logger
+from .function import RemoteRuntime, get_fullname, min_nuclio_versions
 from .serving import ServingRuntime
 
 NUCLIO_API_GATEWAY_AUTHENTICATION_MODE_BASIC_AUTH = "basicAuth"
@@ -85,13 +86,14 @@ class BasicAuth(APIGatewayAuthenticator):
         self,
     ) -> Optional[dict[str, Optional[mlrun.common.schemas.APIGatewayBasicAuth]]]:
         return {
-            "
+            "basicAuth": mlrun.common.schemas.APIGatewayBasicAuth(
                 username=self._username, password=self._password
             )
         }
 
 
 class APIGateway:
+    @min_nuclio_versions("1.13.1")
     def __init__(
         self,
         project,
@@ -147,6 +149,7 @@ class APIGateway:
         self.description = description
         self.canary = canary
         self.authentication = authentication
+        self.state = ""
 
     def invoke(
         self,
@@ -172,6 +175,11 @@ class APIGateway:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Invocation url is not set. Set up gateway's `invoke_url` attribute."
             )
+        if not self.is_ready():
+            raise mlrun.errors.MLRunPreconditionFailedError(
+                f"API gateway is not ready. " f"Current state: {self.state}"
+            )
+
         if (
             self.authentication.authentication_mode
             == NUCLIO_API_GATEWAY_AUTHENTICATION_MODE_BASIC_AUTH
@@ -188,6 +196,33 @@ class APIGateway:
             auth=HTTPBasicAuth(*auth) if auth else None,
         )
 
+    def wait_for_readiness(self, max_wait_time=90):
+        """
+        Wait for the API gateway to become ready within the maximum wait time.
+
+        Parameters:
+            max_wait_time: int - Maximum time to wait in seconds (default is 90 seconds).
+
+        Returns:
+            bool: True if the entity becomes ready within the maximum wait time, False otherwise
+        """
+
+        def _ensure_ready():
+            if not self.is_ready():
+                raise AssertionError(
+                    f"Waiting for gateway readiness is taking more than {max_wait_time} seconds"
+                )
+
+        return mlrun.utils.helpers.retry_until_successful(
+            3, max_wait_time, logger, False, _ensure_ready
+        )
+
+    def is_ready(self):
+        if self.state is not mlrun.common.schemas.api_gateway.APIGatewayState.ready:
+            # try to sync the state
+            self.sync()
+        return self.state == mlrun.common.schemas.api_gateway.APIGatewayState.ready
+
     def sync(self):
         """
         Synchronize the API gateway from the server.
@@ -201,6 +236,7 @@ class APIGateway:
         self.functions = synced_gateway.functions
         self.canary = synced_gateway.canary
         self.description = synced_gateway.description
+        self.state = synced_gateway.state
 
     def with_basic_auth(self, username: str, password: str):
         """
@@ -247,7 +283,12 @@ class APIGateway:
     def from_scheme(cls, api_gateway: mlrun.common.schemas.APIGateway):
         project = api_gateway.metadata.labels.get(PROJECT_NAME_LABEL)
         functions, canary = cls._resolve_canary(api_gateway.spec.upstreams)
-
+        state = (
+            api_gateway.status.state
+            if api_gateway.status
+            else mlrun.common.schemas.APIGatewayState.none
+        )
+        api_gateway = cls(
             project=project,
             description=api_gateway.spec.description,
             name=api_gateway.spec.name,
@@ -257,15 +298,21 @@ class APIGateway:
             functions=functions,
             canary=canary,
         )
+        api_gateway.state = state
+        return api_gateway
 
     def to_scheme(self) -> mlrun.common.schemas.APIGateway:
         upstreams = (
             [
                 mlrun.common.schemas.APIGatewayUpstream(
-                    nucliofunction={"name":
-                    percentage=
-                )
-
+                    nucliofunction={"name": self.functions[0]},
+                    percentage=self.canary[0],
+                ),
+                mlrun.common.schemas.APIGatewayUpstream(
+                    # do not set percent for the second function,
+                    # so we can define which function to display as a primary one in UI
+                    nucliofunction={"name": self.functions[1]},
+                ),
             ]
             if self.canary
             else [
@@ -300,7 +347,10 @@ class APIGateway:
 
         :return: (str) The invoke URL.
         """
-
+        host = self.host
+        if not self.host.startswith("http"):
+            host = f"https://{self.host}"
+        return urljoin(host, self.path)
 
     def _validate(
         self,
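Note: taken together, the gateway now carries a `state` synced from the server, `invoke()` refuses to fire until that state is ready, and `wait_for_readiness()`/`is_ready()` poll for it. A hedged usage sketch (assumes `gateway` is an APIGateway object already created for a deployed function; the creation call is outside this diff):

    # poll the server (via sync()) until the gateway reports the ready state,
    # or give up after max_wait_time seconds
    gateway.wait_for_readiness(max_wait_time=90)

    if gateway.is_ready():
        print(gateway.invoke_url)    # host is prefixed with https:// if no scheme is set
        response = gateway.invoke()  # would raise MLRunPreconditionFailedError if not ready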