mlrun 1.10.0rc24__py3-none-any.whl → 1.10.0rc26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/artifacts/llm_prompt.py +8 -1
- mlrun/common/model_monitoring/helpers.py +86 -0
- mlrun/common/schemas/hub.py +25 -18
- mlrun/common/schemas/model_monitoring/constants.py +1 -0
- mlrun/common/schemas/model_monitoring/model_endpoints.py +10 -1
- mlrun/config.py +2 -3
- mlrun/datastore/__init__.py +2 -2
- mlrun/datastore/azure_blob.py +66 -43
- mlrun/datastore/datastore_profile.py +35 -5
- mlrun/datastore/model_provider/huggingface_provider.py +122 -30
- mlrun/datastore/model_provider/model_provider.py +62 -4
- mlrun/datastore/model_provider/openai_provider.py +114 -43
- mlrun/datastore/s3.py +24 -2
- mlrun/datastore/storeytargets.py +2 -3
- mlrun/db/base.py +15 -1
- mlrun/db/httpdb.py +17 -6
- mlrun/db/nopdb.py +14 -0
- mlrun/k8s_utils.py +0 -14
- mlrun/model_monitoring/api.py +2 -2
- mlrun/model_monitoring/applications/base.py +37 -10
- mlrun/model_monitoring/applications/context.py +1 -4
- mlrun/model_monitoring/controller.py +15 -5
- mlrun/model_monitoring/db/_schedules.py +2 -4
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +3 -1
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +3 -0
- mlrun/model_monitoring/helpers.py +5 -5
- mlrun/platforms/iguazio.py +7 -3
- mlrun/projects/project.py +33 -29
- mlrun/runtimes/base.py +0 -3
- mlrun/runtimes/mounts.py +15 -2
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/application/application.py +11 -2
- mlrun/runtimes/nuclio/function.py +10 -0
- mlrun/runtimes/nuclio/serving.py +4 -0
- mlrun/runtimes/pod.py +153 -11
- mlrun/runtimes/utils.py +22 -5
- mlrun/serving/routers.py +23 -41
- mlrun/serving/server.py +26 -14
- mlrun/serving/states.py +3 -3
- mlrun/serving/system_steps.py +52 -29
- mlrun/serving/v2_serving.py +9 -10
- mlrun/utils/helpers.py +5 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/METADATA +24 -23
- {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/RECORD +50 -50
- {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/top_level.txt +0 -0
mlrun/projects/project.py
CHANGED
|
@@ -2749,16 +2749,18 @@ class MlrunProject(ModelObj):
|
|
|
2749
2749
|
| Creating a function with non project source is done by specifying a module ``handler`` and on the
|
|
2750
2750
|
returned function set the source with ``function.with_source_archive(<source>)``.
|
|
2751
2751
|
|
|
2752
|
-
|
|
2752
|
+
Supported URL prefixes:
|
|
2753
2753
|
|
|
2754
|
-
|
|
2755
|
-
|
|
2756
|
-
|
|
2754
|
+
- Object: s3://, v3io://, etc.
|
|
2755
|
+
- MLRun DB: e.g. db://project/func:ver
|
|
2756
|
+
- Function hub/market: e.g. hub://auto-trainer:master
|
|
2757
2757
|
|
|
2758
2758
|
Examples::
|
|
2759
2759
|
|
|
2760
2760
|
proj.set_function(func_object)
|
|
2761
|
-
proj.set_function(
|
|
2761
|
+
proj.set_function(
|
|
2762
|
+
"http://.../mynb.ipynb", "train", kind="job", image="mlrun/mlrun"
|
|
2763
|
+
)
|
|
2762
2764
|
proj.set_function("./func.yaml")
|
|
2763
2765
|
proj.set_function("hub://get_toy_data", "getdata")
|
|
2764
2766
|
|
|
@@ -2785,18 +2787,6 @@ class MlrunProject(ModelObj):
|
|
|
2785
2787
|
# By providing a path to a pip requirements file
|
|
2786
2788
|
proj.set_function("my.py", requirements="requirements.txt")
|
|
2787
2789
|
|
|
2788
|
-
One of the most important parameters is 'kind', used to specify the chosen runtime. The options are:
|
|
2789
|
-
- local: execute a local python or shell script
|
|
2790
|
-
- job: insert the code into a Kubernetes pod and execute it
|
|
2791
|
-
- nuclio: insert the code into a real-time serverless nuclio function
|
|
2792
|
-
- serving: insert code into orchestrated nuclio function(s) forming a DAG
|
|
2793
|
-
- dask: run the specified python code / script as Dask Distributed job
|
|
2794
|
-
- mpijob: run distributed Horovod jobs over the MPI job operator
|
|
2795
|
-
- spark: run distributed Spark job using Spark Kubernetes Operator
|
|
2796
|
-
- remote-spark: run distributed Spark job on remote Spark service
|
|
2797
|
-
- databricks: run code on Databricks cluster (python scripts, Spark etc.)
|
|
2798
|
-
- application: run a long living application (e.g. a web server, UI, etc.)
|
|
2799
|
-
|
|
2800
2790
|
Learn more about :doc:`../../concepts/functions-overview`.
|
|
2801
2791
|
|
|
2802
2792
|
:param func: Function object or spec/code url, None refers to current Notebook
|
|
@@ -2804,8 +2794,20 @@ class MlrunProject(ModelObj):
|
|
|
2804
2794
|
Versions (e.g. myfunc:v1). If the `tag` parameter is provided, the tag in the name
|
|
2805
2795
|
must match the tag parameter.
|
|
2806
2796
|
Specifying a tag in the name will update the project's tagged function (myfunc:v1)
|
|
2807
|
-
:param kind:
|
|
2808
|
-
|
|
2797
|
+
:param kind: Default: job. One of
|
|
2798
|
+
|
|
2799
|
+
- local: execute a local python or shell script
|
|
2800
|
+
- job: insert the code into a Kubernetes pod and execute it
|
|
2801
|
+
- nuclio: insert the code into a real-time serverless nuclio function
|
|
2802
|
+
- serving: insert code into orchestrated nuclio function(s) forming a DAG
|
|
2803
|
+
- dask: run the specified python code / script as Dask Distributed job
|
|
2804
|
+
- mpijob: run distributed Horovod jobs over the MPI job operator
|
|
2805
|
+
- spark: run distributed Spark job using Spark Kubernetes Operator
|
|
2806
|
+
- remote-spark: run distributed Spark job on remote Spark service
|
|
2807
|
+
- databricks: run code on Databricks cluster (python scripts, Spark etc.)
|
|
2808
|
+
- application: run a long living application (e.g. a web server, UI, etc.)
|
|
2809
|
+
- handler: execute a python handler (used automatically in notebooks or for debug)
|
|
2810
|
+
|
|
2809
2811
|
:param image: Docker image to be used, can also be specified in the function object/yaml
|
|
2810
2812
|
:param handler: Default function handler to invoke (can only be set with .py/.ipynb files)
|
|
2811
2813
|
:param with_repo: Add (clone) the current repo to the build source - use when the function code is in
|
|
@@ -3814,7 +3816,7 @@ class MlrunProject(ModelObj):
|
|
|
3814
3816
|
|
|
3815
3817
|
import mlrun
|
|
3816
3818
|
from mlrun.datastore.datastore_profile import (
|
|
3817
|
-
|
|
3819
|
+
DatastoreProfileKafkaStream,
|
|
3818
3820
|
DatastoreProfileTDEngine,
|
|
3819
3821
|
)
|
|
3820
3822
|
|
|
@@ -3831,7 +3833,7 @@ class MlrunProject(ModelObj):
|
|
|
3831
3833
|
project.register_datastore_profile(tsdb_profile)
|
|
3832
3834
|
|
|
3833
3835
|
# Create and register stream profile
|
|
3834
|
-
stream_profile =
|
|
3836
|
+
stream_profile = DatastoreProfileKafkaStream(
|
|
3835
3837
|
name="my-kafka",
|
|
3836
3838
|
brokers=["<kafka-broker-ip-address>:9094"],
|
|
3837
3839
|
topics=[], # Keep the topics list empty
|
|
@@ -3873,9 +3875,9 @@ class MlrunProject(ModelObj):
|
|
|
3873
3875
|
|
|
3874
3876
|
.. code-block:: python
|
|
3875
3877
|
|
|
3876
|
-
from mlrun.datastore.datastore_profile import
|
|
3878
|
+
from mlrun.datastore.datastore_profile import DatastoreProfileKafkaStream
|
|
3877
3879
|
|
|
3878
|
-
stream_profile =
|
|
3880
|
+
stream_profile = DatastoreProfileKafkaStream(
|
|
3879
3881
|
name="confluent-kafka",
|
|
3880
3882
|
brokers=["<server-domain-start>.confluent.cloud:9092"],
|
|
3881
3883
|
topics=[],
|
|
@@ -3904,7 +3906,7 @@ class MlrunProject(ModelObj):
|
|
|
3904
3906
|
The supported profiles are:
|
|
3905
3907
|
|
|
3906
3908
|
* :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileV3io`
|
|
3907
|
-
* :py:class:`~mlrun.datastore.datastore_profile.
|
|
3909
|
+
* :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream`
|
|
3908
3910
|
|
|
3909
3911
|
You need to register one of them, and pass the profile's name.
|
|
3910
3912
|
:param replace_creds: If ``True`` - override the existing credentials.
|
|
@@ -3944,7 +3946,9 @@ class MlrunProject(ModelObj):
|
|
|
3944
3946
|
start: Optional[datetime.datetime] = None,
|
|
3945
3947
|
end: Optional[datetime.datetime] = None,
|
|
3946
3948
|
top_level: bool = False,
|
|
3947
|
-
|
|
3949
|
+
modes: Optional[
|
|
3950
|
+
Union[mm_constants.EndpointMode, list[mm_constants.EndpointMode]]
|
|
3951
|
+
] = None,
|
|
3948
3952
|
uids: Optional[list[str]] = None,
|
|
3949
3953
|
latest_only: bool = False,
|
|
3950
3954
|
tsdb_metrics: bool = False,
|
|
@@ -3960,7 +3964,7 @@ class MlrunProject(ModelObj):
|
|
|
3960
3964
|
5) function_tag
|
|
3961
3965
|
6) labels
|
|
3962
3966
|
7) top level
|
|
3963
|
-
8)
|
|
3967
|
+
8) modes
|
|
3964
3968
|
9) uids
|
|
3965
3969
|
10) start and end time, corresponding to the `created` field.
|
|
3966
3970
|
By default, when no filters are applied, all available endpoints for the given project will be listed.
|
|
@@ -3982,8 +3986,8 @@ class MlrunProject(ModelObj):
|
|
|
3982
3986
|
:param start: The start time to filter by. Corresponding to the `created` field.
|
|
3983
3987
|
:param end: The end time to filter by. Corresponding to the `created` field.
|
|
3984
3988
|
:param top_level: If true, will return only routers and endpoints that are NOT children of any router.
|
|
3985
|
-
:param
|
|
3986
|
-
|
|
3989
|
+
:param modes: Specifies the mode of the model endpoint. Can be "real-time" (0), "batch" (1),
|
|
3990
|
+
"batch_legacy" (2). If set to None, all are included.
|
|
3987
3991
|
:param uids: If passed, will return a list of `ModelEndpoint` objects with uid in uids.
|
|
3988
3992
|
:param tsdb_metrics: When True, the time series metrics will be added to the output
|
|
3989
3993
|
of the resulting model endpoints.
|
|
@@ -4005,7 +4009,7 @@ class MlrunProject(ModelObj):
|
|
|
4005
4009
|
start=start,
|
|
4006
4010
|
end=end,
|
|
4007
4011
|
top_level=top_level,
|
|
4008
|
-
|
|
4012
|
+
modes=modes,
|
|
4009
4013
|
uids=uids,
|
|
4010
4014
|
latest_only=latest_only,
|
|
4011
4015
|
tsdb_metrics=tsdb_metrics,
|
mlrun/runtimes/base.py
CHANGED
|
@@ -142,9 +142,6 @@ class FunctionSpec(ModelObj):
|
|
|
142
142
|
def build(self, build):
|
|
143
143
|
self._build = self._verify_dict(build, "build", ImageBuilder)
|
|
144
144
|
|
|
145
|
-
def enrich_function_preemption_spec(self):
|
|
146
|
-
pass
|
|
147
|
-
|
|
148
145
|
def validate_service_account(self, allowed_service_accounts):
|
|
149
146
|
pass
|
|
150
147
|
|
mlrun/runtimes/mounts.py
CHANGED
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
import os
|
|
16
16
|
import typing
|
|
17
|
+
import warnings
|
|
17
18
|
from collections import namedtuple
|
|
18
19
|
|
|
19
20
|
from mlrun.config import config
|
|
@@ -247,10 +248,22 @@ def mount_s3(
|
|
|
247
248
|
def _use_s3_cred(runtime: "KubeResource"):
|
|
248
249
|
_access_key = aws_access_key or os.environ.get(prefix + "AWS_ACCESS_KEY_ID")
|
|
249
250
|
_secret_key = aws_secret_key or os.environ.get(prefix + "AWS_SECRET_ACCESS_KEY")
|
|
250
|
-
|
|
251
|
+
|
|
252
|
+
# Check for endpoint URL with backward compatibility
|
|
253
|
+
_endpoint_url = endpoint_url or os.environ.get(prefix + "AWS_ENDPOINT_URL_S3")
|
|
254
|
+
if not _endpoint_url:
|
|
255
|
+
# Check for deprecated environment variable
|
|
256
|
+
_endpoint_url = os.environ.get(prefix + "S3_ENDPOINT_URL")
|
|
257
|
+
if _endpoint_url:
|
|
258
|
+
warnings.warn(
|
|
259
|
+
"S3_ENDPOINT_URL is deprecated in 1.10.0 and will be removed in 1.12.0, "
|
|
260
|
+
"use AWS_ENDPOINT_URL_S3 instead.",
|
|
261
|
+
# TODO: Remove this in 1.12.0
|
|
262
|
+
FutureWarning,
|
|
263
|
+
)
|
|
251
264
|
|
|
252
265
|
if _endpoint_url:
|
|
253
|
-
runtime.set_env(prefix + "
|
|
266
|
+
runtime.set_env(prefix + "AWS_ENDPOINT_URL_S3", _endpoint_url)
|
|
254
267
|
if aws_region:
|
|
255
268
|
runtime.set_env(prefix + "AWS_REGION", aws_region)
|
|
256
269
|
if non_anonymous:
|
|
@@ -16,6 +16,7 @@ from .serving import ServingRuntime, new_v2_model_server # noqa
|
|
|
16
16
|
from .nuclio import nuclio_init_hook # noqa
|
|
17
17
|
from .function import (
|
|
18
18
|
min_nuclio_versions,
|
|
19
|
+
multiple_port_sidecar_is_supported,
|
|
19
20
|
RemoteRuntime,
|
|
20
21
|
) # noqa
|
|
21
22
|
from .api_gateway import APIGateway
|
|
@@ -22,7 +22,10 @@ import mlrun.errors
|
|
|
22
22
|
import mlrun.run
|
|
23
23
|
from mlrun.common.runtimes.constants import NuclioIngressAddTemplatedIngressModes
|
|
24
24
|
from mlrun.runtimes import RemoteRuntime
|
|
25
|
-
from mlrun.runtimes.nuclio import
|
|
25
|
+
from mlrun.runtimes.nuclio import (
|
|
26
|
+
min_nuclio_versions,
|
|
27
|
+
multiple_port_sidecar_is_supported,
|
|
28
|
+
)
|
|
26
29
|
from mlrun.runtimes.nuclio.api_gateway import (
|
|
27
30
|
APIGateway,
|
|
28
31
|
APIGatewayMetadata,
|
|
@@ -182,7 +185,13 @@ class ApplicationSpec(NuclioSpec):
|
|
|
182
185
|
if port != self.internal_application_port:
|
|
183
186
|
cleaned_ports.append(port)
|
|
184
187
|
|
|
185
|
-
|
|
188
|
+
application_ports = [self.internal_application_port] + cleaned_ports
|
|
189
|
+
|
|
190
|
+
# ensure multiple ports are supported in Nuclio
|
|
191
|
+
if len(application_ports) > 1:
|
|
192
|
+
multiple_port_sidecar_is_supported()
|
|
193
|
+
|
|
194
|
+
self._application_ports = application_ports
|
|
186
195
|
|
|
187
196
|
@property
|
|
188
197
|
def internal_application_port(self):
|
|
@@ -1045,6 +1045,9 @@ class RemoteRuntime(KubeResource):
|
|
|
1045
1045
|
sidecar["image"] = image
|
|
1046
1046
|
|
|
1047
1047
|
ports = mlrun.utils.helpers.as_list(ports)
|
|
1048
|
+
if len(ports) > 1:
|
|
1049
|
+
mlrun.runtimes.nuclio.multiple_port_sidecar_is_supported()
|
|
1050
|
+
|
|
1048
1051
|
# according to RFC-6335, port name should be less than 15 characters,
|
|
1049
1052
|
# so we truncate it if needed and leave room for the index
|
|
1050
1053
|
port_name = name[:13].rstrip("-_") if len(name) > 13 else name
|
|
@@ -1458,3 +1461,10 @@ def enrich_nuclio_function_from_headers(
|
|
|
1458
1461
|
else []
|
|
1459
1462
|
)
|
|
1460
1463
|
func.status.container_image = headers.get("x-mlrun-container-image", "")
|
|
1464
|
+
|
|
1465
|
+
|
|
1466
|
+
@min_nuclio_versions("1.14.14")
|
|
1467
|
+
def multiple_port_sidecar_is_supported():
|
|
1468
|
+
# multiple ports are supported from nuclio version 1.14.14
|
|
1469
|
+
# this method exists only for running the min_nuclio_versions decorator
|
|
1470
|
+
return True
|
mlrun/runtimes/nuclio/serving.py
CHANGED
|
@@ -22,6 +22,7 @@ from nuclio import KafkaTrigger
|
|
|
22
22
|
|
|
23
23
|
import mlrun
|
|
24
24
|
import mlrun.common.schemas as schemas
|
|
25
|
+
import mlrun.datastore.datastore_profile as ds_profile
|
|
25
26
|
from mlrun.datastore import get_kafka_brokers_from_dict, parse_kafka_url
|
|
26
27
|
from mlrun.model import ObjectList
|
|
27
28
|
from mlrun.runtimes.function_reference import FunctionReference
|
|
@@ -740,6 +741,7 @@ class ServingRuntime(RemoteRuntime):
|
|
|
740
741
|
current_function="*",
|
|
741
742
|
track_models=False,
|
|
742
743
|
workdir=None,
|
|
744
|
+
stream_profile: Optional[ds_profile.DatastoreProfile] = None,
|
|
743
745
|
**kwargs,
|
|
744
746
|
) -> GraphServer:
|
|
745
747
|
"""create mock server object for local testing/emulation
|
|
@@ -748,6 +750,7 @@ class ServingRuntime(RemoteRuntime):
|
|
|
748
750
|
:param current_function: specify if you want to simulate a child function, * for all functions
|
|
749
751
|
:param track_models: allow model tracking (disabled by default in the mock server)
|
|
750
752
|
:param workdir: working directory to locate the source code (if not the current one)
|
|
753
|
+
:param stream_profile: stream profile to use for the mock server output stream.
|
|
751
754
|
"""
|
|
752
755
|
|
|
753
756
|
# set the namespaces/modules to look for the steps code in
|
|
@@ -787,6 +790,7 @@ class ServingRuntime(RemoteRuntime):
|
|
|
787
790
|
logger=logger,
|
|
788
791
|
is_mock=True,
|
|
789
792
|
monitoring_mock=self.spec.track_models,
|
|
793
|
+
stream_profile=stream_profile,
|
|
790
794
|
)
|
|
791
795
|
|
|
792
796
|
server.graph = add_system_steps_to_graph(
|
mlrun/runtimes/pod.py
CHANGED
|
@@ -17,6 +17,7 @@ import os
|
|
|
17
17
|
import re
|
|
18
18
|
import time
|
|
19
19
|
import typing
|
|
20
|
+
import warnings
|
|
20
21
|
from collections.abc import Iterable
|
|
21
22
|
from enum import Enum
|
|
22
23
|
|
|
@@ -35,6 +36,7 @@ from mlrun.common.schemas import (
|
|
|
35
36
|
|
|
36
37
|
from ..config import config as mlconf
|
|
37
38
|
from ..k8s_utils import (
|
|
39
|
+
generate_preemptible_nodes_affinity_terms,
|
|
38
40
|
validate_node_selectors,
|
|
39
41
|
)
|
|
40
42
|
from ..utils import logger, update_in
|
|
@@ -874,6 +876,133 @@ class KubeResource(BaseRuntime):
|
|
|
874
876
|
"""
|
|
875
877
|
self.spec.with_requests(mem, cpu, patch=patch)
|
|
876
878
|
|
|
879
|
+
@staticmethod
|
|
880
|
+
def detect_preemptible_node_selector(node_selector: dict[str, str]) -> list[str]:
|
|
881
|
+
"""
|
|
882
|
+
Check whether any provided node selector matches preemptible selectors.
|
|
883
|
+
|
|
884
|
+
:param node_selector: User-provided node selector mapping.
|
|
885
|
+
:return: List of `"'key': 'value'"` strings that match a preemptible selector.
|
|
886
|
+
"""
|
|
887
|
+
preemptible_node_selector = mlconf.get_preemptible_node_selector()
|
|
888
|
+
|
|
889
|
+
return [
|
|
890
|
+
f"'{key}': '{val}'"
|
|
891
|
+
for key, val in node_selector.items()
|
|
892
|
+
if preemptible_node_selector.get(key) == val
|
|
893
|
+
]
|
|
894
|
+
|
|
895
|
+
def detect_preemptible_tolerations(
|
|
896
|
+
self, tolerations: list[k8s_client.V1Toleration]
|
|
897
|
+
) -> list[str]:
|
|
898
|
+
"""
|
|
899
|
+
Check whether any provided toleration matches preemptible tolerations.
|
|
900
|
+
|
|
901
|
+
:param tolerations: User-provided tolerations.
|
|
902
|
+
:return: List of formatted toleration strings that are considered preemptible.
|
|
903
|
+
"""
|
|
904
|
+
preemptible_tolerations = [
|
|
905
|
+
k8s_client.V1Toleration(
|
|
906
|
+
key=toleration.get("key"),
|
|
907
|
+
value=toleration.get("value"),
|
|
908
|
+
effect=toleration.get("effect"),
|
|
909
|
+
)
|
|
910
|
+
for toleration in mlconf.get_preemptible_tolerations()
|
|
911
|
+
]
|
|
912
|
+
|
|
913
|
+
def _format_toleration(toleration):
|
|
914
|
+
return f"'{toleration.key}'='{toleration.value}' (effect: '{toleration.effect}')"
|
|
915
|
+
|
|
916
|
+
return [
|
|
917
|
+
_format_toleration(toleration)
|
|
918
|
+
for toleration in tolerations
|
|
919
|
+
if toleration in preemptible_tolerations
|
|
920
|
+
]
|
|
921
|
+
|
|
922
|
+
def detect_preemptible_affinity(self, affinity: k8s_client.V1Affinity) -> list[str]:
|
|
923
|
+
"""
|
|
924
|
+
Check whether any provided affinity rules match preemptible affinity configs.
|
|
925
|
+
|
|
926
|
+
:param affinity: User-provided affinity object.
|
|
927
|
+
:return: List of formatted expressions that overlap with preemptible terms.
|
|
928
|
+
"""
|
|
929
|
+
preemptible_affinity_terms = generate_preemptible_nodes_affinity_terms()
|
|
930
|
+
conflicting_affinities = []
|
|
931
|
+
|
|
932
|
+
if (
|
|
933
|
+
affinity
|
|
934
|
+
and affinity.node_affinity
|
|
935
|
+
and affinity.node_affinity.required_during_scheduling_ignored_during_execution
|
|
936
|
+
):
|
|
937
|
+
user_terms = affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms
|
|
938
|
+
for user_term in user_terms:
|
|
939
|
+
user_expressions = {
|
|
940
|
+
(expr.key, expr.operator, tuple(expr.values or []))
|
|
941
|
+
for expr in user_term.match_expressions or []
|
|
942
|
+
}
|
|
943
|
+
|
|
944
|
+
for preemptible_term in preemptible_affinity_terms:
|
|
945
|
+
preemptible_expressions = {
|
|
946
|
+
(expr.key, expr.operator, tuple(expr.values or []))
|
|
947
|
+
for expr in preemptible_term.match_expressions or []
|
|
948
|
+
}
|
|
949
|
+
|
|
950
|
+
# Ensure operators match and preemptible expressions are present
|
|
951
|
+
common_exprs = user_expressions & preemptible_expressions
|
|
952
|
+
if common_exprs:
|
|
953
|
+
formatted = ", ".join(
|
|
954
|
+
f"'{key} {operator} {list(values)}'"
|
|
955
|
+
for key, operator, values in common_exprs
|
|
956
|
+
)
|
|
957
|
+
conflicting_affinities.append(formatted)
|
|
958
|
+
return conflicting_affinities
|
|
959
|
+
|
|
960
|
+
def raise_preemptible_warning(
|
|
961
|
+
self,
|
|
962
|
+
node_selector: typing.Optional[dict[str, str]],
|
|
963
|
+
tolerations: typing.Optional[list[k8s_client.V1Toleration]],
|
|
964
|
+
affinity: typing.Optional[k8s_client.V1Affinity],
|
|
965
|
+
) -> None:
|
|
966
|
+
"""
|
|
967
|
+
Detect conflicts and emit a single consolidated warning if needed.
|
|
968
|
+
|
|
969
|
+
:param node_selector: User-provided node selector.
|
|
970
|
+
:param tolerations: User-provided tolerations.
|
|
971
|
+
:param affinity: User-provided affinity.
|
|
972
|
+
:warns: UserWarning - Emitted when any of the provided selectors,
|
|
973
|
+
tolerations, or affinity terms match the configured preemptible
|
|
974
|
+
settings. The message lists the conflicting items.
|
|
975
|
+
"""
|
|
976
|
+
conflict_messages = []
|
|
977
|
+
|
|
978
|
+
if node_selector:
|
|
979
|
+
ns_conflicts = ", ".join(
|
|
980
|
+
self.detect_preemptible_node_selector(node_selector)
|
|
981
|
+
)
|
|
982
|
+
if ns_conflicts:
|
|
983
|
+
conflict_messages.append(f"Node selectors: {ns_conflicts}")
|
|
984
|
+
|
|
985
|
+
if tolerations:
|
|
986
|
+
tol_conflicts = ", ".join(self.detect_preemptible_tolerations(tolerations))
|
|
987
|
+
if tol_conflicts:
|
|
988
|
+
conflict_messages.append(f"Tolerations: {tol_conflicts}")
|
|
989
|
+
|
|
990
|
+
if affinity:
|
|
991
|
+
affinity_conflicts = ", ".join(self.detect_preemptible_affinity(affinity))
|
|
992
|
+
if affinity_conflicts:
|
|
993
|
+
conflict_messages.append(f"Affinity: {affinity_conflicts}")
|
|
994
|
+
|
|
995
|
+
if conflict_messages:
|
|
996
|
+
warning_componentes = "; \n".join(conflict_messages)
|
|
997
|
+
warnings.warn(
|
|
998
|
+
f"Warning: based on MLRun's preemptible node configuration, the following components \n"
|
|
999
|
+
f"may be removed or adjusted at runtime:\n"
|
|
1000
|
+
f"{warning_componentes}.\n"
|
|
1001
|
+
"This adjustment depends on the function's preemption mode. \n"
|
|
1002
|
+
"The list of potential adjusted preemptible selectors can be viewed here: "
|
|
1003
|
+
"mlrun.mlconf.get_preemptible_node_selector() and mlrun.mlconf.get_preemptible_tolerations()."
|
|
1004
|
+
)
|
|
1005
|
+
|
|
877
1006
|
def with_node_selection(
|
|
878
1007
|
self,
|
|
879
1008
|
node_name: typing.Optional[str] = None,
|
|
@@ -882,18 +1011,26 @@ class KubeResource(BaseRuntime):
|
|
|
882
1011
|
tolerations: typing.Optional[list[k8s_client.V1Toleration]] = None,
|
|
883
1012
|
):
|
|
884
1013
|
"""
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity
|
|
891
|
-
for details
|
|
892
|
-
:param tolerations: Tolerations are applied to pods, and allow (but do not require) the pods to schedule
|
|
893
|
-
onto nodes with matching taints - see
|
|
894
|
-
https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration
|
|
895
|
-
for details
|
|
1014
|
+
Configure Kubernetes node scheduling for this function.
|
|
1015
|
+
|
|
1016
|
+
Updates one or more scheduling hints: exact node pinning, label-based selection,
|
|
1017
|
+
affinity/anti-affinity rules, and taint tolerations. Passing ``None`` leaves the
|
|
1018
|
+
current value unchanged; pass an empty dict/list (e.g., ``{}``, ``[]``) to clear.
|
|
896
1019
|
|
|
1020
|
+
:param node_name: Exact Kubernetes node name to pin the pod to.
|
|
1021
|
+
:param node_selector: Mapping of label selectors. Use ``{}`` to clear.
|
|
1022
|
+
:param affinity: :class:`kubernetes.client.V1Affinity` constraints.
|
|
1023
|
+
:param tolerations: List of :class:`kubernetes.client.V1Toleration`. Use ``[]`` to clear.
|
|
1024
|
+
:warns: UserWarning - Emitted if provided selectors/tolerations/affinity
|
|
1025
|
+
conflict with the function's preemption mode.
|
|
1026
|
+
|
|
1027
|
+
Example usage:
|
|
1028
|
+
Prefer a GPU pool and allow scheduling on spot nodes::
|
|
1029
|
+
|
|
1030
|
+
job.with_node_selection(
|
|
1031
|
+
node_selector={"nodepool": "gpu"},
|
|
1032
|
+
tolerations=[k8s_client.V1Toleration(key="spot", operator="Exists")],
|
|
1033
|
+
)
|
|
897
1034
|
"""
|
|
898
1035
|
if node_name:
|
|
899
1036
|
self.spec.node_name = node_name
|
|
@@ -904,6 +1041,11 @@ class KubeResource(BaseRuntime):
|
|
|
904
1041
|
self.spec.affinity = affinity
|
|
905
1042
|
if tolerations is not None:
|
|
906
1043
|
self.spec.tolerations = tolerations
|
|
1044
|
+
self.raise_preemptible_warning(
|
|
1045
|
+
node_selector=self.spec.node_selector,
|
|
1046
|
+
tolerations=self.spec.tolerations,
|
|
1047
|
+
affinity=self.spec.affinity,
|
|
1048
|
+
)
|
|
907
1049
|
|
|
908
1050
|
def with_priority_class(self, name: typing.Optional[str] = None):
|
|
909
1051
|
"""
|
mlrun/runtimes/utils.py
CHANGED
|
@@ -26,6 +26,7 @@ import pandas as pd
|
|
|
26
26
|
import mlrun
|
|
27
27
|
import mlrun.common.constants
|
|
28
28
|
import mlrun.common.constants as mlrun_constants
|
|
29
|
+
import mlrun.common.runtimes.constants
|
|
29
30
|
import mlrun.common.schemas
|
|
30
31
|
import mlrun.utils.regex
|
|
31
32
|
from mlrun.artifacts import TableArtifact
|
|
@@ -153,6 +154,7 @@ def results_to_iter(results, runspec, execution):
|
|
|
153
154
|
|
|
154
155
|
iter = []
|
|
155
156
|
failed = 0
|
|
157
|
+
pending_retry = 0
|
|
156
158
|
running = 0
|
|
157
159
|
for task in results:
|
|
158
160
|
if task:
|
|
@@ -164,17 +166,26 @@ def results_to_iter(results, runspec, execution):
|
|
|
164
166
|
"state": state,
|
|
165
167
|
"iter": id,
|
|
166
168
|
}
|
|
167
|
-
if state ==
|
|
169
|
+
if state == mlrun.common.runtimes.constants.RunStates.error:
|
|
168
170
|
failed += 1
|
|
169
171
|
err = get_in(task, ["status", "error"], "")
|
|
170
|
-
logger.error(f"error in task
|
|
171
|
-
elif state
|
|
172
|
+
logger.error(f"error in task {execution.uid}:{id} - {err_to_str(err)}")
|
|
173
|
+
elif state == mlrun.common.runtimes.constants.RunStates.pending_retry:
|
|
174
|
+
pending_retry += 1
|
|
175
|
+
err = get_in(task, ["status", "error"], "")
|
|
176
|
+
retry_count = get_in(task, ["status", "retry_count"], 0)
|
|
177
|
+
logger.warning(
|
|
178
|
+
f"pending retry in task {execution.uid}:{id} - {err_to_str(err)}. Retry count: {retry_count}"
|
|
179
|
+
)
|
|
180
|
+
elif state != mlrun.common.runtimes.constants.RunStates.completed:
|
|
172
181
|
running += 1
|
|
173
182
|
|
|
174
183
|
iter.append(struct)
|
|
175
184
|
|
|
176
185
|
if not iter:
|
|
177
|
-
execution.set_state(
|
|
186
|
+
execution.set_state(
|
|
187
|
+
mlrun.common.runtimes.constants.RunStates.completed, commit=True
|
|
188
|
+
)
|
|
178
189
|
logger.warning("Warning!, zero iteration results")
|
|
179
190
|
return
|
|
180
191
|
if hasattr(pd, "json_normalize"):
|
|
@@ -204,8 +215,14 @@ def results_to_iter(results, runspec, execution):
|
|
|
204
215
|
error=f"{failed} of {len(results)} tasks failed, check logs in db for details",
|
|
205
216
|
commit=False,
|
|
206
217
|
)
|
|
218
|
+
elif pending_retry:
|
|
219
|
+
execution.set_state(
|
|
220
|
+
mlrun.common.runtimes.constants.RunStates.pending_retry, commit=False
|
|
221
|
+
)
|
|
207
222
|
elif running == 0:
|
|
208
|
-
execution.set_state(
|
|
223
|
+
execution.set_state(
|
|
224
|
+
mlrun.common.runtimes.constants.RunStates.completed, commit=False
|
|
225
|
+
)
|
|
209
226
|
execution.commit()
|
|
210
227
|
|
|
211
228
|
|
mlrun/serving/routers.py
CHANGED
|
@@ -31,6 +31,9 @@ import mlrun.common.model_monitoring
|
|
|
31
31
|
import mlrun.common.schemas.model_monitoring
|
|
32
32
|
from mlrun.utils import logger, now_date
|
|
33
33
|
|
|
34
|
+
from ..common.model_monitoring.helpers import (
|
|
35
|
+
get_model_endpoints_creation_task_status,
|
|
36
|
+
)
|
|
34
37
|
from .utils import RouterToDict, _extract_input_data, _update_result_body
|
|
35
38
|
from .v2_serving import _ModelLogPusher
|
|
36
39
|
|
|
@@ -171,46 +174,6 @@ class BaseModelRouter(RouterToDict):
|
|
|
171
174
|
"""run tasks after processing the event"""
|
|
172
175
|
return event
|
|
173
176
|
|
|
174
|
-
def _get_background_task_status(
|
|
175
|
-
self,
|
|
176
|
-
) -> mlrun.common.schemas.BackgroundTaskState:
|
|
177
|
-
self._background_task_check_timestamp = now_date()
|
|
178
|
-
server: mlrun.serving.GraphServer = getattr(
|
|
179
|
-
self.context, "_server", None
|
|
180
|
-
) or getattr(self.context, "server", None)
|
|
181
|
-
if not self.context.is_mock:
|
|
182
|
-
if server.model_endpoint_creation_task_name:
|
|
183
|
-
background_task = mlrun.get_run_db().get_project_background_task(
|
|
184
|
-
server.project, server.model_endpoint_creation_task_name
|
|
185
|
-
)
|
|
186
|
-
logger.debug(
|
|
187
|
-
"Checking model endpoint creation task status",
|
|
188
|
-
task_name=server.model_endpoint_creation_task_name,
|
|
189
|
-
)
|
|
190
|
-
if (
|
|
191
|
-
background_task.status.state
|
|
192
|
-
in mlrun.common.schemas.BackgroundTaskState.terminal_states()
|
|
193
|
-
):
|
|
194
|
-
logger.info(
|
|
195
|
-
f"Model endpoint creation task completed with state {background_task.status.state}"
|
|
196
|
-
)
|
|
197
|
-
else: # in progress
|
|
198
|
-
logger.info(
|
|
199
|
-
f"Model endpoint creation task is still in progress with the current state: "
|
|
200
|
-
f"{background_task.status.state}. Events will not be monitored for the next "
|
|
201
|
-
f"{mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period} seconds",
|
|
202
|
-
name=self.name,
|
|
203
|
-
background_task_check_timestamp=self._background_task_check_timestamp.isoformat(),
|
|
204
|
-
)
|
|
205
|
-
return background_task.status.state
|
|
206
|
-
else:
|
|
207
|
-
logger.error(
|
|
208
|
-
"Model endpoint creation task name not provided. This function is not being monitored.",
|
|
209
|
-
)
|
|
210
|
-
elif self.context.monitoring_mock:
|
|
211
|
-
return mlrun.common.schemas.BackgroundTaskState.succeeded
|
|
212
|
-
return mlrun.common.schemas.BackgroundTaskState.failed
|
|
213
|
-
|
|
214
177
|
def _update_background_task_state(self, event):
|
|
215
178
|
if not self.background_task_reached_terminal_state and (
|
|
216
179
|
self._background_task_check_timestamp is None
|
|
@@ -219,7 +182,26 @@ class BaseModelRouter(RouterToDict):
|
|
|
219
182
|
seconds=mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period
|
|
220
183
|
)
|
|
221
184
|
):
|
|
222
|
-
|
|
185
|
+
server: mlrun.serving.GraphServer = getattr(
|
|
186
|
+
self.context, "_server", None
|
|
187
|
+
) or getattr(self.context, "server", None)
|
|
188
|
+
if not self.context.is_mock:
|
|
189
|
+
(
|
|
190
|
+
self._background_task_current_state,
|
|
191
|
+
self._background_task_check_timestamp,
|
|
192
|
+
_,
|
|
193
|
+
) = get_model_endpoints_creation_task_status(server)
|
|
194
|
+
elif self.context.monitoring_mock:
|
|
195
|
+
self._background_task_current_state = (
|
|
196
|
+
mlrun.common.schemas.BackgroundTaskState.succeeded
|
|
197
|
+
)
|
|
198
|
+
self._background_task_check_timestamp = mlrun.utils.now_date()
|
|
199
|
+
else:
|
|
200
|
+
self._background_task_current_state = (
|
|
201
|
+
mlrun.common.schemas.BackgroundTaskState.failed
|
|
202
|
+
)
|
|
203
|
+
self._background_task_check_timestamp = mlrun.utils.now_date()
|
|
204
|
+
|
|
223
205
|
if event.body:
|
|
224
206
|
event.body["background_task_state"] = (
|
|
225
207
|
self._background_task_current_state
|