mlrun 1.7.0rc7__py3-none-any.whl → 1.7.0rc11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (91)
  1. mlrun/__init__.py +1 -0
  2. mlrun/__main__.py +2 -0
  3. mlrun/artifacts/model.py +29 -25
  4. mlrun/common/schemas/__init__.py +4 -0
  5. mlrun/common/schemas/alert.py +122 -0
  6. mlrun/common/schemas/api_gateway.py +8 -1
  7. mlrun/common/schemas/auth.py +4 -0
  8. mlrun/common/schemas/client_spec.py +1 -0
  9. mlrun/common/schemas/hub.py +7 -9
  10. mlrun/common/schemas/model_monitoring/constants.py +4 -2
  11. mlrun/{datastore/helpers.py → common/schemas/pagination.py} +11 -3
  12. mlrun/common/schemas/project.py +15 -10
  13. mlrun/config.py +35 -13
  14. mlrun/datastore/__init__.py +3 -7
  15. mlrun/datastore/base.py +6 -5
  16. mlrun/datastore/datastore_profile.py +19 -1
  17. mlrun/datastore/snowflake_utils.py +43 -0
  18. mlrun/datastore/sources.py +18 -30
  19. mlrun/datastore/targets.py +140 -12
  20. mlrun/datastore/utils.py +10 -5
  21. mlrun/datastore/v3io.py +27 -50
  22. mlrun/db/base.py +88 -2
  23. mlrun/db/httpdb.py +314 -41
  24. mlrun/db/nopdb.py +142 -0
  25. mlrun/execution.py +21 -14
  26. mlrun/feature_store/api.py +9 -5
  27. mlrun/feature_store/feature_set.py +39 -23
  28. mlrun/feature_store/feature_vector.py +2 -1
  29. mlrun/feature_store/retrieval/spark_merger.py +27 -23
  30. mlrun/feature_store/steps.py +30 -19
  31. mlrun/features.py +4 -13
  32. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  33. mlrun/frameworks/lgbm/__init__.py +1 -1
  34. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  35. mlrun/frameworks/lgbm/model_handler.py +1 -1
  36. mlrun/frameworks/pytorch/__init__.py +2 -2
  37. mlrun/frameworks/sklearn/__init__.py +1 -1
  38. mlrun/frameworks/tf_keras/__init__.py +1 -1
  39. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  40. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  41. mlrun/frameworks/xgboost/__init__.py +1 -1
  42. mlrun/kfpops.py +2 -5
  43. mlrun/launcher/base.py +1 -1
  44. mlrun/launcher/client.py +2 -2
  45. mlrun/model.py +2 -2
  46. mlrun/model_monitoring/application.py +11 -2
  47. mlrun/model_monitoring/applications/histogram_data_drift.py +3 -3
  48. mlrun/model_monitoring/controller.py +2 -3
  49. mlrun/model_monitoring/helpers.py +3 -1
  50. mlrun/model_monitoring/stream_processing.py +0 -1
  51. mlrun/model_monitoring/writer.py +32 -0
  52. mlrun/package/packagers_manager.py +1 -0
  53. mlrun/platforms/__init__.py +1 -1
  54. mlrun/platforms/other.py +1 -1
  55. mlrun/projects/operations.py +11 -4
  56. mlrun/projects/pipelines.py +1 -1
  57. mlrun/projects/project.py +180 -73
  58. mlrun/run.py +77 -41
  59. mlrun/runtimes/__init__.py +16 -0
  60. mlrun/runtimes/base.py +4 -1
  61. mlrun/runtimes/kubejob.py +26 -121
  62. mlrun/runtimes/mpijob/abstract.py +8 -8
  63. mlrun/runtimes/nuclio/api_gateway.py +58 -8
  64. mlrun/runtimes/nuclio/application/application.py +79 -1
  65. mlrun/runtimes/nuclio/application/reverse_proxy.go +9 -1
  66. mlrun/runtimes/nuclio/function.py +20 -13
  67. mlrun/runtimes/nuclio/serving.py +11 -10
  68. mlrun/runtimes/pod.py +148 -3
  69. mlrun/runtimes/utils.py +0 -28
  70. mlrun/secrets.py +6 -2
  71. mlrun/serving/remote.py +2 -3
  72. mlrun/serving/routers.py +7 -4
  73. mlrun/serving/server.py +1 -1
  74. mlrun/serving/states.py +14 -38
  75. mlrun/serving/v2_serving.py +8 -7
  76. mlrun/utils/helpers.py +1 -1
  77. mlrun/utils/http.py +1 -1
  78. mlrun/utils/notifications/notification/base.py +12 -0
  79. mlrun/utils/notifications/notification/console.py +2 -0
  80. mlrun/utils/notifications/notification/git.py +3 -1
  81. mlrun/utils/notifications/notification/ipython.py +2 -0
  82. mlrun/utils/notifications/notification/slack.py +41 -13
  83. mlrun/utils/notifications/notification/webhook.py +11 -1
  84. mlrun/utils/retryer.py +3 -2
  85. mlrun/utils/version/version.json +2 -2
  86. {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/METADATA +15 -15
  87. {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/RECORD +91 -89
  88. {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/LICENSE +0 -0
  89. {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/WHEEL +0 -0
  90. {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/entry_points.txt +0 -0
  91. {mlrun-1.7.0rc7.dist-info → mlrun-1.7.0rc11.dist-info}/top_level.txt +0 -0
mlrun/run.py CHANGED
@@ -114,16 +114,18 @@ def function_to_module(code="", workdir=None, secrets=None, silent=False):
114
114
 
115
115
  example::
116
116
 
117
- mod = mlrun.function_to_module('./examples/training.py')
118
- task = mlrun.new_task(inputs={'infile.txt': '../examples/infile.txt'})
119
- context = mlrun.get_or_create_ctx('myfunc', spec=task)
120
- mod.my_job(context, p1=1, p2='x')
117
+ mod = mlrun.function_to_module("./examples/training.py")
118
+ task = mlrun.new_task(inputs={"infile.txt": "../examples/infile.txt"})
119
+ context = mlrun.get_or_create_ctx("myfunc", spec=task)
120
+ mod.my_job(context, p1=1, p2="x")
121
121
  print(context.to_yaml())
122
122
 
123
- fn = mlrun.import_function('hub://open-archive')
123
+ fn = mlrun.import_function("hub://open-archive")
124
124
  mod = mlrun.function_to_module(fn)
125
- data = mlrun.run.get_dataitem("https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz")
126
- context = mlrun.get_or_create_ctx('myfunc')
125
+ data = mlrun.run.get_dataitem(
126
+ "https://fpsignals-public.s3.amazonaws.com/catsndogs.tar.gz"
127
+ )
128
+ context = mlrun.get_or_create_ctx("myfunc")
127
129
  mod.open_archive(context, archive_url=data)
128
130
  print(context.to_yaml())
129
131
 
@@ -256,29 +258,31 @@ def get_or_create_ctx(
256
258
  Examples::
257
259
 
258
260
  # load MLRUN runtime context (will be set by the runtime framework e.g. KubeFlow)
259
- context = get_or_create_ctx('train')
261
+ context = get_or_create_ctx("train")
260
262
 
261
263
  # get parameters from the runtime context (or use defaults)
262
- p1 = context.get_param('p1', 1)
263
- p2 = context.get_param('p2', 'a-string')
264
+ p1 = context.get_param("p1", 1)
265
+ p2 = context.get_param("p2", "a-string")
264
266
 
265
267
  # access input metadata, values, files, and secrets (passwords)
266
- print(f'Run: {context.name} (uid={context.uid})')
267
- print(f'Params: p1={p1}, p2={p2}')
268
+ print(f"Run: {context.name} (uid={context.uid})")
269
+ print(f"Params: p1={p1}, p2={p2}")
268
270
  print(f'accesskey = {context.get_secret("ACCESS_KEY")}')
269
- input_str = context.get_input('infile.txt').get()
270
- print(f'file: {input_str}')
271
+ input_str = context.get_input("infile.txt").get()
272
+ print(f"file: {input_str}")
271
273
 
272
274
  # RUN some useful code e.g. ML training, data prep, etc.
273
275
 
274
276
  # log scalar result values (job result metrics)
275
- context.log_result('accuracy', p1 * 2)
276
- context.log_result('loss', p1 * 3)
277
- context.set_label('framework', 'sklearn')
277
+ context.log_result("accuracy", p1 * 2)
278
+ context.log_result("loss", p1 * 3)
279
+ context.set_label("framework", "sklearn")
278
280
 
279
281
  # log various types of artifacts (file, web page, table), will be versioned and visible in the UI
280
- context.log_artifact('model.txt', body=b'abc is 123', labels={'framework': 'xgboost'})
281
- context.log_artifact('results.html', body=b'<b> Some HTML <b>', viewer='web-app')
282
+ context.log_artifact(
283
+ "model.txt", body=b"abc is 123", labels={"framework": "xgboost"}
284
+ )
285
+ context.log_artifact("results.html", body=b"<b> Some HTML <b>", viewer="web-app")
282
286
 
283
287
  """
284
288
 
@@ -348,7 +352,9 @@ def import_function(url="", secrets=None, db="", project=None, new_name=None):
348
352
 
349
353
  function = mlrun.import_function("hub://auto-trainer")
350
354
  function = mlrun.import_function("./func.yaml")
351
- function = mlrun.import_function("https://raw.githubusercontent.com/org/repo/func.yaml")
355
+ function = mlrun.import_function(
356
+ "https://raw.githubusercontent.com/org/repo/func.yaml"
357
+ )
352
358
 
353
359
  :param url: path/url to Function Hub, db or function YAML file
354
360
  :param secrets: optional, credentials dict for DB or URL (s3, v3io, ...)
@@ -389,6 +395,8 @@ def import_function_to_dict(url, secrets=None):
389
395
  code = get_in(runtime, "spec.build.functionSourceCode")
390
396
  update_in(runtime, "metadata.build.code_origin", url)
391
397
  cmd = code_file = get_in(runtime, "spec.command", "")
398
+ # use kind = "job" by default if not specified
399
+ runtime.setdefault("kind", "job")
392
400
  if " " in cmd:
393
401
  code_file = cmd[: cmd.find(" ")]
394
402
  if runtime["kind"] in ["", "local"]:
@@ -445,12 +453,18 @@ def new_function(
445
453
  Example::
446
454
 
447
455
  # define a container based function (the `training.py` must exist in the container workdir)
448
- f = new_function(command='training.py -x {x}', image='myrepo/image:latest', kind='job')
456
+ f = new_function(
457
+ command="training.py -x {x}", image="myrepo/image:latest", kind="job"
458
+ )
449
459
  f.run(params={"x": 5})
450
460
 
451
461
  # define a container based function which reads its source from a git archive
452
- f = new_function(command='training.py -x {x}', image='myrepo/image:latest', kind='job',
453
- source='git://github.com/mlrun/something.git')
462
+ f = new_function(
463
+ command="training.py -x {x}",
464
+ image="myrepo/image:latest",
465
+ kind="job",
466
+ source="git://github.com/mlrun/something.git",
467
+ )
454
468
  f.run(params={"x": 5})
455
469
 
456
470
  # define a local handler function (execute a local function handler)
@@ -535,7 +549,7 @@ def new_function(
535
549
  if source:
536
550
  runner.spec.build.source = source
537
551
  if handler:
538
- if kind in [RuntimeKinds.serving, RuntimeKinds.application]:
552
+ if kind in RuntimeKinds.handlerless_runtimes():
539
553
  raise MLRunInvalidArgumentError(
540
554
  f"Handler is not supported for {kind} runtime"
541
555
  )
@@ -628,6 +642,8 @@ def code_to_function(
628
642
  - mpijob: run distributed Horovod jobs over the MPI job operator
629
643
  - spark: run distributed Spark job using Spark Kubernetes Operator
630
644
  - remote-spark: run distributed Spark job on remote Spark service
645
+ - databricks: run code on Databricks cluster (python scripts, Spark etc.)
646
+ - application: run a long living application (e.g. a web server, UI, etc.)
631
647
 
632
648
  Learn more about [Kinds of function (runtimes)](../concepts/functions-overview.html).
633
649
 
@@ -661,11 +677,15 @@ def code_to_function(
661
677
  import mlrun
662
678
 
663
679
  # create job function object from notebook code and add doc/metadata
664
- fn = mlrun.code_to_function("file_utils", kind="job",
665
- handler="open_archive", image="mlrun/mlrun",
666
- description = "this function opens a zip archive into a local/mounted folder",
667
- categories = ["fileutils"],
668
- labels = {"author": "me"})
680
+ fn = mlrun.code_to_function(
681
+ "file_utils",
682
+ kind="job",
683
+ handler="open_archive",
684
+ image="mlrun/mlrun",
685
+ description="this function opens a zip archive into a local/mounted folder",
686
+ categories=["fileutils"],
687
+ labels={"author": "me"},
688
+ )
669
689
 
670
690
  example::
671
691
 
@@ -676,11 +696,15 @@ def code_to_function(
676
696
  Path("mover.py").touch()
677
697
 
678
698
  # create nuclio function object from python module call mover.py
679
- fn = mlrun.code_to_function("nuclio-mover", kind="nuclio",
680
- filename="mover.py", image="python:3.7",
681
- description = "this function moves files from one system to another",
682
- requirements = ["pandas"],
683
- labels = {"author": "me"})
699
+ fn = mlrun.code_to_function(
700
+ "nuclio-mover",
701
+ kind="nuclio",
702
+ filename="mover.py",
703
+ image="python:3.7",
704
+ description="this function moves files from one system to another",
705
+ requirements=["pandas"],
706
+ labels={"author": "me"},
707
+ )
684
708
 
685
709
  """
686
710
  filebase, _ = path.splitext(path.basename(filename))
@@ -1094,13 +1118,25 @@ def wait_for_runs_completion(
1094
1118
  example::
1095
1119
 
1096
1120
  # run two training functions in parallel and wait for the results
1097
- inputs = {'dataset': cleaned_data}
1098
- run1 = train.run(name='train_lr', inputs=inputs, watch=False,
1099
- params={'model_pkg_class': 'sklearn.linear_model.LogisticRegression',
1100
- 'label_column': 'label'})
1101
- run2 = train.run(name='train_lr', inputs=inputs, watch=False,
1102
- params={'model_pkg_class': 'sklearn.ensemble.RandomForestClassifier',
1103
- 'label_column': 'label'})
1121
+ inputs = {"dataset": cleaned_data}
1122
+ run1 = train.run(
1123
+ name="train_lr",
1124
+ inputs=inputs,
1125
+ watch=False,
1126
+ params={
1127
+ "model_pkg_class": "sklearn.linear_model.LogisticRegression",
1128
+ "label_column": "label",
1129
+ },
1130
+ )
1131
+ run2 = train.run(
1132
+ name="train_lr",
1133
+ inputs=inputs,
1134
+ watch=False,
1135
+ params={
1136
+ "model_pkg_class": "sklearn.ensemble.RandomForestClassifier",
1137
+ "label_column": "label",
1138
+ },
1139
+ )
1104
1140
  completed = wait_for_runs_completion([run1, run2])
1105
1141
 
1106
1142
  :param runs: list of run objects (the returned values of function.run())
mlrun/runtimes/__init__.py CHANGED
@@ -154,6 +154,22 @@ class RuntimeKinds:
154
154
  RuntimeKinds.application,
155
155
  ]
156
156
 
157
+ @staticmethod
158
+ def pure_nuclio_deployed_runtimes():
159
+ return [
160
+ RuntimeKinds.remote,
161
+ RuntimeKinds.nuclio,
162
+ RuntimeKinds.serving,
163
+ ]
164
+
165
+ @staticmethod
166
+ def handlerless_runtimes():
167
+ return [
168
+ RuntimeKinds.serving,
169
+ # Application runtime handler is internal reverse proxy
170
+ RuntimeKinds.application,
171
+ ]
172
+
157
173
  @staticmethod
158
174
  def local_runtimes():
159
175
  return [
mlrun/runtimes/base.py CHANGED
@@ -23,6 +23,7 @@ from typing import Callable, Optional, Union
23
23
  import requests.exceptions
24
24
  from nuclio.build import mlrun_footer
25
25
 
26
+ import mlrun.common.constants
26
27
  import mlrun.common.schemas
27
28
  import mlrun.common.schemas.model_monitoring.constants as mm_constants
28
29
  import mlrun.db
@@ -634,7 +635,9 @@ class BaseRuntime(ModelObj):
634
635
  image = image or self.spec.image or ""
635
636
 
636
637
  image = enrich_image_url(image, client_version, client_python_version)
637
- if not image.startswith("."):
638
+ if not image.startswith(
639
+ mlrun.common.constants.IMAGE_NAME_ENRICH_REGISTRY_PREFIX
640
+ ):
638
641
  return image
639
642
  registry, repository = get_parsed_docker_registry()
640
643
  if registry:
mlrun/runtimes/kubejob.py CHANGED
@@ -12,7 +12,6 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- import time
16
15
  import warnings
17
16
 
18
17
  import mlrun.common.schemas
@@ -21,7 +20,6 @@ import mlrun.errors
21
20
 
22
21
  from ..kfpops import build_op
23
22
  from ..model import RunObject
24
- from ..utils import get_in, logger
25
23
  from .pod import KubeResource
26
24
 
27
25
 
@@ -65,29 +63,13 @@ class KubejobRuntime(KubeResource):
65
63
  :param pull_at_runtime: load the archive into the container at job runtime vs on build/deploy
66
64
  :param target_dir: target dir on runtime pod or repo clone / archive extraction
67
65
  """
68
- mlrun.utils.helpers.validate_builder_source(source, pull_at_runtime, workdir)
69
-
70
- self.spec.build.source = source
71
- if handler:
72
- self.spec.default_handler = handler
73
- if workdir:
74
- self.spec.workdir = workdir
75
- if target_dir:
76
- self.spec.build.source_code_target_dir = target_dir
77
-
78
- self.spec.build.load_source_on_run = pull_at_runtime
79
- if (
80
- self.spec.build.base_image
81
- and not self.spec.build.commands
82
- and pull_at_runtime
83
- and not self.spec.image
84
- ):
85
- # if we load source from repo and don't need a full build use the base_image as the image
86
- self.spec.image = self.spec.build.base_image
87
- elif not pull_at_runtime:
88
- # clear the image so build will not be skipped
89
- self.spec.build.base_image = self.spec.build.base_image or self.spec.image
90
- self.spec.image = ""
66
+ self._configure_mlrun_build_with_source(
67
+ source=source,
68
+ workdir=workdir,
69
+ handler=handler,
70
+ pull_at_runtime=pull_at_runtime,
71
+ target_dir=target_dir,
72
+ )
91
73
 
92
74
  def build_config(
93
75
  self,
@@ -169,116 +151,39 @@ class KubejobRuntime(KubeResource):
169
151
  show_on_failure: bool = False,
170
152
  force_build: bool = False,
171
153
  ) -> bool:
172
- """deploy function, build container with dependencies
154
+ """Deploy function, build container with dependencies
173
155
 
174
- :param watch: wait for the deploy to complete (and print build logs)
175
- :param with_mlrun: add the current mlrun package to the container build
176
- :param skip_deployed: skip the build if we already have an image for the function
177
- :param is_kfp: deploy as part of a kfp pipeline
178
- :param mlrun_version_specifier: which mlrun package version to include (if not current)
156
+ :param watch: Wait for the deploy to complete (and print build logs)
157
+ :param with_mlrun: Add the current mlrun package to the container build
158
+ :param skip_deployed: Skip the build if we already have an image for the function
159
+ :param is_kfp: Deploy as part of a kfp pipeline
160
+ :param mlrun_version_specifier: Which mlrun package version to include (if not current)
179
161
  :param builder_env: Kaniko builder pod env vars dict (for config/credentials)
180
162
  e.g. builder_env={"GIT_TOKEN": token}
181
- :param show_on_failure: show logs only in case of build failure
182
- :param force_build: set True for force building the image, even when no changes were made
163
+ :param show_on_failure: Show logs only in case of build failure
164
+ :param force_build: Set True for force building the image, even when no changes were made
183
165
 
184
166
  :return: True if the function is ready (deployed)
185
167
  """
186
168
 
187
169
  build = self.spec.build
170
+ with_mlrun = self._resolve_build_with_mlrun(with_mlrun)
188
171
 
189
- if with_mlrun is None:
190
- if build.with_mlrun is not None:
191
- with_mlrun = build.with_mlrun
192
- else:
193
- with_mlrun = build.base_image and not (
194
- build.base_image.startswith("mlrun/")
195
- or "/mlrun/" in build.base_image
196
- )
197
-
198
- if (
199
- not build.source
200
- and not build.commands
201
- and not build.requirements
202
- and not build.extra
203
- and with_mlrun
204
- ):
205
- logger.info(
206
- "Running build to add mlrun package, set "
207
- "with_mlrun=False to skip if its already in the image"
208
- )
209
172
  self.status.state = ""
210
173
  if build.base_image:
211
174
  # clear the image so build will not be skipped
212
175
  self.spec.image = ""
213
176
 
214
- # When we're in pipelines context we must watch otherwise the pipelines pod will exit before the operation
215
- # is actually done. (when a pipelines pod exits, the pipeline step marked as done)
216
- if is_kfp:
217
- watch = True
218
-
219
- ready = False
220
- if self._is_remote_api():
221
- db = self._get_db()
222
- data = db.remote_builder(
223
- self,
224
- with_mlrun,
225
- mlrun_version_specifier,
226
- skip_deployed,
227
- builder_env=builder_env,
228
- force_build=force_build,
229
- )
230
- self.status = data["data"].get("status", None)
231
- self.spec.image = get_in(data, "data.spec.image")
232
- self.spec.build.base_image = self.spec.build.base_image or get_in(
233
- data, "data.spec.build.base_image"
234
- )
235
- # Get the source target dir in case it was enriched due to loading source
236
- self.spec.build.source_code_target_dir = get_in(
237
- data, "data.spec.build.source_code_target_dir"
238
- ) or get_in(data, "data.spec.clone_target_dir")
239
- ready = data.get("ready", False)
240
- if not ready:
241
- logger.info(
242
- f"Started building image: {data.get('data', {}).get('spec', {}).get('build', {}).get('image')}"
243
- )
244
- if watch and not ready:
245
- state = self._build_watch(watch, show_on_failure=show_on_failure)
246
- ready = state == "ready"
247
- self.status.state = state
248
-
249
- if watch and not ready:
250
- raise mlrun.errors.MLRunRuntimeError("Deploy failed")
251
- return ready
252
-
253
- def _build_watch(self, watch=True, logs=True, show_on_failure=False):
254
- db = self._get_db()
255
- offset = 0
256
- try:
257
- text, _ = db.get_builder_status(self, 0, logs=logs)
258
- except mlrun.db.RunDBError:
259
- raise ValueError("function or build process not found")
260
-
261
- def print_log(text):
262
- if text and (not show_on_failure or self.status.state == "error"):
263
- print(text, end="")
264
-
265
- print_log(text)
266
- offset += len(text)
267
- if watch:
268
- while self.status.state in ["pending", "running"]:
269
- time.sleep(2)
270
- if show_on_failure:
271
- text = ""
272
- db.get_builder_status(self, 0, logs=False)
273
- if self.status.state == "error":
274
- # re-read the full log on failure
275
- text, _ = db.get_builder_status(self, offset, logs=logs)
276
- else:
277
- text, _ = db.get_builder_status(self, offset, logs=logs)
278
- print_log(text)
279
- offset += len(text)
280
-
281
- return self.status.state
177
+ return self._build_image(
178
+ builder_env=builder_env,
179
+ force_build=force_build,
180
+ mlrun_version_specifier=mlrun_version_specifier,
181
+ show_on_failure=show_on_failure,
182
+ skip_deployed=skip_deployed,
183
+ watch=watch,
184
+ is_kfp=is_kfp,
185
+ with_mlrun=with_mlrun,
186
+ )
282
187
 
283
188
  def deploy_step(
284
189
  self,
mlrun/runtimes/mpijob/abstract.py CHANGED
@@ -223,14 +223,14 @@ class AbstractMPIJobRuntime(KubejobRuntime, abc.ABC):
223
223
  ```
224
224
  # Define the wanted MPI arguments
225
225
  mpi_args = []
226
- mpi_args.append('-x')
227
- mpi_args.append('NCCL_DEBUG=INFO')
228
- mpi_args.append('-x')
229
- mpi_args.append('NCCL_SOCKET_NTHREADS=2')
230
- mpi_args.append('-x')
231
- mpi_args.append('NCCL_NSOCKS_PERTHREAD=8')
232
- mpi_args.append('-x')
233
- mpi_args.append('NCCL_MIN_NCHANNELS=4')
226
+ mpi_args.append("-x")
227
+ mpi_args.append("NCCL_DEBUG=INFO")
228
+ mpi_args.append("-x")
229
+ mpi_args.append("NCCL_SOCKET_NTHREADS=2")
230
+ mpi_args.append("-x")
231
+ mpi_args.append("NCCL_NSOCKS_PERTHREAD=8")
232
+ mpi_args.append("-x")
233
+ mpi_args.append("NCCL_MIN_NCHANNELS=4")
234
234
 
235
235
  # Set the MPI arguments in the function
236
236
  fn.set_mpi_args(mpi_args)
mlrun/runtimes/nuclio/api_gateway.py CHANGED
@@ -22,7 +22,8 @@ from requests.auth import HTTPBasicAuth
22
22
  import mlrun
23
23
  import mlrun.common.schemas
24
24
 
25
- from .function import RemoteRuntime, get_fullname
25
+ from ..utils import logger
26
+ from .function import RemoteRuntime, get_fullname, min_nuclio_versions
26
27
  from .serving import ServingRuntime
27
28
 
28
29
  NUCLIO_API_GATEWAY_AUTHENTICATION_MODE_BASIC_AUTH = "basicAuth"
@@ -85,13 +86,14 @@ class BasicAuth(APIGatewayAuthenticator):
85
86
  self,
86
87
  ) -> Optional[dict[str, Optional[mlrun.common.schemas.APIGatewayBasicAuth]]]:
87
88
  return {
88
- "authentication": mlrun.common.schemas.APIGatewayBasicAuth(
89
+ "basicAuth": mlrun.common.schemas.APIGatewayBasicAuth(
89
90
  username=self._username, password=self._password
90
91
  )
91
92
  }
92
93
 
93
94
 
94
95
  class APIGateway:
96
+ @min_nuclio_versions("1.13.1")
95
97
  def __init__(
96
98
  self,
97
99
  project,
@@ -147,6 +149,7 @@ class APIGateway:
147
149
  self.description = description
148
150
  self.canary = canary
149
151
  self.authentication = authentication
152
+ self.state = ""
150
153
 
151
154
  def invoke(
152
155
  self,
@@ -172,6 +175,11 @@ class APIGateway:
172
175
  raise mlrun.errors.MLRunInvalidArgumentError(
173
176
  "Invocation url is not set. Set up gateway's `invoke_url` attribute."
174
177
  )
178
+ if not self.is_ready():
179
+ raise mlrun.errors.MLRunPreconditionFailedError(
180
+ f"API gateway is not ready. " f"Current state: {self.state}"
181
+ )
182
+
175
183
  if (
176
184
  self.authentication.authentication_mode
177
185
  == NUCLIO_API_GATEWAY_AUTHENTICATION_MODE_BASIC_AUTH
@@ -188,6 +196,33 @@ class APIGateway:
188
196
  auth=HTTPBasicAuth(*auth) if auth else None,
189
197
  )
190
198
 
199
+ def wait_for_readiness(self, max_wait_time=90):
200
+ """
201
+ Wait for the API gateway to become ready within the maximum wait time.
202
+
203
+ Parameters:
204
+ max_wait_time: int - Maximum time to wait in seconds (default is 90 seconds).
205
+
206
+ Returns:
207
+ bool: True if the entity becomes ready within the maximum wait time, False otherwise
208
+ """
209
+
210
+ def _ensure_ready():
211
+ if not self.is_ready():
212
+ raise AssertionError(
213
+ f"Waiting for gateway readiness is taking more than {max_wait_time} seconds"
214
+ )
215
+
216
+ return mlrun.utils.helpers.retry_until_successful(
217
+ 3, max_wait_time, logger, False, _ensure_ready
218
+ )
219
+
220
+ def is_ready(self):
221
+ if self.state is not mlrun.common.schemas.api_gateway.APIGatewayState.ready:
222
+ # try to sync the state
223
+ self.sync()
224
+ return self.state == mlrun.common.schemas.api_gateway.APIGatewayState.ready
225
+
191
226
  def sync(self):
192
227
  """
193
228
  Synchronize the API gateway from the server.
@@ -201,6 +236,7 @@ class APIGateway:
201
236
  self.functions = synced_gateway.functions
202
237
  self.canary = synced_gateway.canary
203
238
  self.description = synced_gateway.description
239
+ self.state = synced_gateway.state
204
240
 
205
241
  def with_basic_auth(self, username: str, password: str):
206
242
  """
@@ -247,7 +283,12 @@ class APIGateway:
247
283
  def from_scheme(cls, api_gateway: mlrun.common.schemas.APIGateway):
248
284
  project = api_gateway.metadata.labels.get(PROJECT_NAME_LABEL)
249
285
  functions, canary = cls._resolve_canary(api_gateway.spec.upstreams)
250
- return cls(
286
+ state = (
287
+ api_gateway.status.state
288
+ if api_gateway.status
289
+ else mlrun.common.schemas.APIGatewayState.none
290
+ )
291
+ api_gateway = cls(
251
292
  project=project,
252
293
  description=api_gateway.spec.description,
253
294
  name=api_gateway.spec.name,
@@ -257,15 +298,21 @@ class APIGateway:
257
298
  functions=functions,
258
299
  canary=canary,
259
300
  )
301
+ api_gateway.state = state
302
+ return api_gateway
260
303
 
261
304
  def to_scheme(self) -> mlrun.common.schemas.APIGateway:
262
305
  upstreams = (
263
306
  [
264
307
  mlrun.common.schemas.APIGatewayUpstream(
265
- nucliofunction={"name": function_name},
266
- percentage=percentage,
267
- )
268
- for function_name, percentage in zip(self.functions, self.canary)
308
+ nucliofunction={"name": self.functions[0]},
309
+ percentage=self.canary[0],
310
+ ),
311
+ mlrun.common.schemas.APIGatewayUpstream(
312
+ # do not set percent for the second function,
313
+ # so we can define which function to display as a primary one in UI
314
+ nucliofunction={"name": self.functions[1]},
315
+ ),
269
316
  ]
270
317
  if self.canary
271
318
  else [
@@ -300,7 +347,10 @@ class APIGateway:
300
347
 
301
348
  :return: (str) The invoke URL.
302
349
  """
303
- return urljoin(self.host, self.path)
350
+ host = self.host
351
+ if not self.host.startswith("http"):
352
+ host = f"https://{self.host}"
353
+ return urljoin(host, self.path)
304
354
 
305
355
  def _validate(
306
356
  self,