mlrun 1.10.0rc25__py3-none-any.whl → 1.10.0rc26__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of mlrun has been flagged as potentially problematic.
- mlrun/artifacts/llm_prompt.py +8 -1
- mlrun/common/model_monitoring/helpers.py +86 -0
- mlrun/common/schemas/hub.py +11 -18
- mlrun/config.py +2 -3
- mlrun/datastore/__init__.py +2 -2
- mlrun/datastore/datastore_profile.py +27 -3
- mlrun/datastore/model_provider/huggingface_provider.py +5 -1
- mlrun/datastore/model_provider/model_provider.py +1 -1
- mlrun/datastore/s3.py +24 -2
- mlrun/datastore/storeytargets.py +2 -3
- mlrun/db/base.py +14 -0
- mlrun/db/httpdb.py +11 -2
- mlrun/db/nopdb.py +13 -0
- mlrun/k8s_utils.py +0 -14
- mlrun/model_monitoring/applications/base.py +15 -0
- mlrun/model_monitoring/controller.py +5 -3
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +3 -1
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +3 -0
- mlrun/model_monitoring/helpers.py +5 -5
- mlrun/projects/project.py +5 -5
- mlrun/runtimes/base.py +0 -3
- mlrun/runtimes/mounts.py +15 -2
- mlrun/runtimes/pod.py +153 -11
- mlrun/serving/routers.py +23 -41
- mlrun/serving/server.py +1 -0
- mlrun/serving/states.py +3 -3
- mlrun/serving/system_steps.py +52 -29
- mlrun/serving/v2_serving.py +9 -10
- mlrun/utils/helpers.py +5 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc26.dist-info}/METADATA +3 -3
- {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc26.dist-info}/RECORD +37 -37
- {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc26.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc26.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc26.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc26.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py
CHANGED

@@ -973,6 +973,9 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> dict[str, float]:
+        if not endpoint_ids:
+            return {}
+
         # Get the last request timestamp for each endpoint from the KV table.
         # The result of the query is a list of dictionaries,
         # each dictionary contains the endpoint id and the last request timestamp.
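The added guard returns early when `endpoint_ids` is empty, so no KV query is issued for an empty filter. A minimal sketch of the pattern (the body below is illustrative, not the connector's actual query code):

    from datetime import datetime
    from typing import Optional


    def get_last_request_timestamps(
        endpoint_ids: list[str],
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
    ) -> dict[str, float]:
        # Short-circuit: querying the KV table with no endpoint ids would
        # waste a round trip just to build an empty dict.
        if not endpoint_ids:
            return {}
        # ... query the KV table for the given ids and time range ...
        return {endpoint_id: 0.0 for endpoint_id in endpoint_ids}  # placeholder result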
mlrun/model_monitoring/helpers.py
CHANGED

@@ -143,7 +143,7 @@ def get_stream_path(
        return stream_uri.replace("v3io://", f"ds://{profile.name}")

    elif isinstance(
-        profile, mlrun.datastore.datastore_profile.
+        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream
    ):
        topic = mlrun.common.model_monitoring.helpers.get_kafka_topic(
            project=project, function_name=function_name

@@ -152,7 +152,7 @@ def get_stream_path(
    else:
        raise mlrun.errors.MLRunValueError(
            f"Received an unexpected stream profile type: {type(profile)}\n"
-            "Expects `DatastoreProfileV3io` or `
+            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
        )


@@ -300,7 +300,7 @@ def _get_v3io_output_stream(

 def _get_kafka_output_stream(
     *,
-    kafka_profile: mlrun.datastore.datastore_profile.
+    kafka_profile: mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream,
     project: str,
     function_name: str,
     mock: bool = False,

@@ -356,7 +356,7 @@ def get_output_stream(
        )

    elif isinstance(
-        profile, mlrun.datastore.datastore_profile.
+        profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream
    ):
        return _get_kafka_output_stream(
            kafka_profile=profile,

@@ -368,7 +368,7 @@ def get_output_stream(
    else:
        raise mlrun.errors.MLRunValueError(
            f"Received an unexpected stream profile type: {type(profile)}\n"
-            "Expects `DatastoreProfileV3io` or `
+            "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
        )
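All five hunks in this file route Kafka-backed monitoring streams through the `DatastoreProfileKafkaStream` profile type. A usage sketch based on the constructor arguments shown in the `project.py` docstring below (the project name and broker address are placeholders):

    import mlrun
    from mlrun.datastore.datastore_profile import DatastoreProfileKafkaStream

    project = mlrun.get_or_create_project("my-project")

    # get_stream_path() / get_output_stream() now dispatch on this profile type;
    # anything other than it or DatastoreProfileV3io raises MLRunValueError.
    stream_profile = DatastoreProfileKafkaStream(
        name="my-kafka",
        brokers=["<kafka-broker-ip-address>:9094"],
        topics=[],  # keep the topics list empty
    )
    project.register_datastore_profile(stream_profile)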
mlrun/projects/project.py
CHANGED
@@ -3816,7 +3816,7 @@ class MlrunProject(ModelObj):

            import mlrun
            from mlrun.datastore.datastore_profile import (
-
+                DatastoreProfileKafkaStream,
                DatastoreProfileTDEngine,
            )

@@ -3833,7 +3833,7 @@ class MlrunProject(ModelObj):
            project.register_datastore_profile(tsdb_profile)

            # Create and register stream profile
-            stream_profile =
+            stream_profile = DatastoreProfileKafkaStream(
                name="my-kafka",
                brokers=["<kafka-broker-ip-address>:9094"],
                topics=[],  # Keep the topics list empty

@@ -3875,9 +3875,9 @@ class MlrunProject(ModelObj):

        .. code-block:: python

-            from mlrun.datastore.datastore_profile import
+            from mlrun.datastore.datastore_profile import DatastoreProfileKafkaStream

-            stream_profile =
+            stream_profile = DatastoreProfileKafkaStream(
                name="confluent-kafka",
                brokers=["<server-domain-start>.confluent.cloud:9092"],
                topics=[],

@@ -3906,7 +3906,7 @@ class MlrunProject(ModelObj):
        The supported profiles are:

        * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileV3io`
-        * :py:class:`~mlrun.datastore.datastore_profile.
+        * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream`

        You need to register one of them, and pass the profile's name.
        :param replace_creds: If ``True`` - override the existing credentials.
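The updated docstrings belong to the model-monitoring credentials flow: register a TSDB profile and a stream profile, then pass their names. A sketch continuing the docstring's example, under the assumption that the keyword names mirror the docstring's wording (verify against the actual signature):

    # Assumes tsdb_profile ("my-tdengine") and stream_profile ("my-kafka")
    # were registered as in the docstring above; the keyword names here
    # are an assumption, not confirmed by this diff.
    project.set_model_monitoring_credentials(
        tsdb_profile_name="my-tdengine",
        stream_profile_name="my-kafka",
    )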
mlrun/runtimes/base.py
CHANGED
@@ -142,9 +142,6 @@ class FunctionSpec(ModelObj):
     def build(self, build):
         self._build = self._verify_dict(build, "build", ImageBuilder)

-    def enrich_function_preemption_spec(self):
-        pass
-
     def validate_service_account(self, allowed_service_accounts):
         pass
mlrun/runtimes/mounts.py
CHANGED
@@ -14,6 +14,7 @@

 import os
 import typing
+import warnings
 from collections import namedtuple

 from mlrun.config import config

@@ -247,10 +248,22 @@ def mount_s3(
     def _use_s3_cred(runtime: "KubeResource"):
         _access_key = aws_access_key or os.environ.get(prefix + "AWS_ACCESS_KEY_ID")
         _secret_key = aws_secret_key or os.environ.get(prefix + "AWS_SECRET_ACCESS_KEY")
-
+
+        # Check for endpoint URL with backward compatibility
+        _endpoint_url = endpoint_url or os.environ.get(prefix + "AWS_ENDPOINT_URL_S3")
+        if not _endpoint_url:
+            # Check for deprecated environment variable
+            _endpoint_url = os.environ.get(prefix + "S3_ENDPOINT_URL")
+            if _endpoint_url:
+                warnings.warn(
+                    "S3_ENDPOINT_URL is deprecated in 1.10.0 and will be removed in 1.12.0, "
+                    "use AWS_ENDPOINT_URL_S3 instead.",
+                    # TODO: Remove this in 1.12.0
+                    FutureWarning,
+                )

         if _endpoint_url:
-            runtime.set_env(prefix + "
+            runtime.set_env(prefix + "AWS_ENDPOINT_URL_S3", _endpoint_url)
         if aws_region:
             runtime.set_env(prefix + "AWS_REGION", aws_region)
         if non_anonymous:
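The endpoint variable moves from MLRun's custom `S3_ENDPOINT_URL` to the AWS-SDK-standard `AWS_ENDPOINT_URL_S3`; the old name still works but emits a `FutureWarning` until 1.12.0. A migration sketch for an S3-compatible store (the MinIO URL is a placeholder):

    import os

    import mlrun
    from mlrun.runtimes.mounts import mount_s3

    # Old (deprecated in 1.10.0, removed in 1.12.0):
    # os.environ["S3_ENDPOINT_URL"] = "https://minio.example.com"

    # New:
    os.environ["AWS_ENDPOINT_URL_S3"] = "https://minio.example.com"

    fn = mlrun.code_to_function(
        "trainer", kind="job", filename="trainer.py", image="mlrun/mlrun"
    )
    # mount_s3 resolves the endpoint from its argument or the environment
    # and injects it into the pod as AWS_ENDPOINT_URL_S3.
    fn.apply(mount_s3(aws_region="us-east-1"))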
mlrun/runtimes/pod.py
CHANGED
@@ -17,6 +17,7 @@ import os
 import re
 import time
 import typing
+import warnings
 from collections.abc import Iterable
 from enum import Enum

@@ -35,6 +36,7 @@ from mlrun.common.schemas import (

 from ..config import config as mlconf
 from ..k8s_utils import (
+    generate_preemptible_nodes_affinity_terms,
     validate_node_selectors,
 )
 from ..utils import logger, update_in

@@ -874,6 +876,133 @@ class KubeResource(BaseRuntime):
         """
         self.spec.with_requests(mem, cpu, patch=patch)

+    @staticmethod
+    def detect_preemptible_node_selector(node_selector: dict[str, str]) -> list[str]:
+        """
+        Check whether any provided node selector matches preemptible selectors.
+
+        :param node_selector: User-provided node selector mapping.
+        :return: List of `"key='value'"` strings that match a preemptible selector.
+        """
+        preemptible_node_selector = mlconf.get_preemptible_node_selector()
+
+        return [
+            f"'{key}': '{val}'"
+            for key, val in node_selector.items()
+            if preemptible_node_selector.get(key) == val
+        ]
+
+    def detect_preemptible_tolerations(
+        self, tolerations: list[k8s_client.V1Toleration]
+    ) -> list[str]:
+        """
+        Check whether any provided toleration matches preemptible tolerations.
+
+        :param tolerations: User-provided tolerations.
+        :return: List of formatted toleration strings that are considered preemptible.
+        """
+        preemptible_tolerations = [
+            k8s_client.V1Toleration(
+                key=toleration.get("key"),
+                value=toleration.get("value"),
+                effect=toleration.get("effect"),
+            )
+            for toleration in mlconf.get_preemptible_tolerations()
+        ]
+
+        def _format_toleration(toleration):
+            return f"'{toleration.key}'='{toleration.value}' (effect: '{toleration.effect}')"
+
+        return [
+            _format_toleration(toleration)
+            for toleration in tolerations
+            if toleration in preemptible_tolerations
+        ]
+
+    def detect_preemptible_affinity(self, affinity: k8s_client.V1Affinity) -> list[str]:
+        """
+        Check whether any provided affinity rules match preemptible affinity configs.
+
+        :param affinity: User-provided affinity object.
+        :return: List of formatted expressions that overlap with preemptible terms.
+        """
+        preemptible_affinity_terms = generate_preemptible_nodes_affinity_terms()
+        conflicting_affinities = []
+
+        if (
+            affinity
+            and affinity.node_affinity
+            and affinity.node_affinity.required_during_scheduling_ignored_during_execution
+        ):
+            user_terms = affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms
+            for user_term in user_terms:
+                user_expressions = {
+                    (expr.key, expr.operator, tuple(expr.values or []))
+                    for expr in user_term.match_expressions or []
+                }
+
+                for preemptible_term in preemptible_affinity_terms:
+                    preemptible_expressions = {
+                        (expr.key, expr.operator, tuple(expr.values or []))
+                        for expr in preemptible_term.match_expressions or []
+                    }
+
+                    # Ensure operators match and preemptible expressions are present
+                    common_exprs = user_expressions & preemptible_expressions
+                    if common_exprs:
+                        formatted = ", ".join(
+                            f"'{key} {operator} {list(values)}'"
+                            for key, operator, values in common_exprs
+                        )
+                        conflicting_affinities.append(formatted)
+        return conflicting_affinities
+
+    def raise_preemptible_warning(
+        self,
+        node_selector: typing.Optional[dict[str, str]],
+        tolerations: typing.Optional[list[k8s_client.V1Toleration]],
+        affinity: typing.Optional[k8s_client.V1Affinity],
+    ) -> None:
+        """
+        Detect conflicts and emit a single consolidated warning if needed.
+
+        :param node_selector: User-provided node selector.
+        :param tolerations: User-provided tolerations.
+        :param affinity: User-provided affinity.
+        :warns: PreemptionWarning - Emitted when any of the provided selectors,
+            tolerations, or affinity terms match the configured preemptible
+            settings. The message lists the conflicting items.
+        """
+        conflict_messages = []
+
+        if node_selector:
+            ns_conflicts = ", ".join(
+                self.detect_preemptible_node_selector(node_selector)
+            )
+            if ns_conflicts:
+                conflict_messages.append(f"Node selectors: {ns_conflicts}")
+
+        if tolerations:
+            tol_conflicts = ", ".join(self.detect_preemptible_tolerations(tolerations))
+            if tol_conflicts:
+                conflict_messages.append(f"Tolerations: {tol_conflicts}")
+
+        if affinity:
+            affinity_conflicts = ", ".join(self.detect_preemptible_affinity(affinity))
+            if affinity_conflicts:
+                conflict_messages.append(f"Affinity: {affinity_conflicts}")
+
+        if conflict_messages:
+            warning_componentes = "; \n".join(conflict_messages)
+            warnings.warn(
+                f"Warning: based on MLRun's preemptible node configuration, the following components \n"
+                f"may be removed or adjusted at runtime:\n"
+                f"{warning_componentes}.\n"
+                "This adjustment depends on the function's preemption mode. \n"
+                "The list of potential adjusted preemptible selectors can be viewed here: "
+                "mlrun.mlconf.get_preemptible_node_selector() and mlrun.mlconf.get_preemptible_tolerations()."
+            )
+
     def with_node_selection(
         self,
         node_name: typing.Optional[str] = None,

@@ -882,18 +1011,26 @@ class KubeResource(BaseRuntime):
         tolerations: typing.Optional[list[k8s_client.V1Toleration]] = None,
     ):
         """
-
-
-
-
-
-            https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity
-            for details
-        :param tolerations: Tolerations are applied to pods, and allow (but do not require) the pods to schedule
-            onto nodes with matching taints - see
-            https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration
-            for details
+        Configure Kubernetes node scheduling for this function.
+
+        Updates one or more scheduling hints: exact node pinning, label-based selection,
+        affinity/anti-affinity rules, and taint tolerations. Passing ``None`` leaves the
+        current value unchanged; pass an empty dict/list (e.g., ``{}``, ``[]``) to clear.

+        :param node_name: Exact Kubernetes node name to pin the pod to.
+        :param node_selector: Mapping of label selectors. Use ``{}`` to clear.
+        :param affinity: :class:`kubernetes.client.V1Affinity` constraints.
+        :param tolerations: List of :class:`kubernetes.client.V1Toleration`. Use ``[]`` to clear.
+        :warns: PreemptionWarning - Emitted if provided selectors/tolerations/affinity
+            conflict with the function's preemption mode.
+
+        Example usage:
+            Prefer a GPU pool and allow scheduling on spot nodes::
+
+                job.with_node_selection(
+                    node_selector={"nodepool": "gpu"},
+                    tolerations=[k8s_client.V1Toleration(key="spot", operator="Exists")],
+                )
         """
         if node_name:
             self.spec.node_name = node_name

@@ -904,6 +1041,11 @@ class KubeResource(BaseRuntime):
             self.spec.affinity = affinity
         if tolerations is not None:
             self.spec.tolerations = tolerations
+        self.raise_preemptible_warning(
+            node_selector=self.spec.node_selector,
+            tolerations=self.spec.tolerations,
+            affinity=self.spec.affinity,
+        )

     def with_priority_class(self, name: typing.Optional[str] = None):
         """
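With this release, `with_node_selection` calls `raise_preemptible_warning` after applying the spec, so settings that MLRun's preemption mode may later strip are flagged immediately. A sketch of surfacing the warning, assuming `fn` is a Kubernetes-based MLRun function and the label shown is one of the configured preemptible selectors:

    import warnings

    from kubernetes import client as k8s_client

    # The configured preemptible selectors/tolerations can be inspected via
    # mlrun.mlconf.get_preemptible_node_selector() and
    # mlrun.mlconf.get_preemptible_tolerations().
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        fn.with_node_selection(
            node_selector={"node-lifecycle": "preemptible"},  # placeholder label
            tolerations=[k8s_client.V1Toleration(key="spot", operator="Exists")],
        )
    for w in caught:
        # consolidated list of conflicting selectors/tolerations/affinity
        print(w.message)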
mlrun/serving/routers.py
CHANGED
@@ -31,6 +31,9 @@ import mlrun.common.model_monitoring
 import mlrun.common.schemas.model_monitoring
 from mlrun.utils import logger, now_date

+from ..common.model_monitoring.helpers import (
+    get_model_endpoints_creation_task_status,
+)
 from .utils import RouterToDict, _extract_input_data, _update_result_body
 from .v2_serving import _ModelLogPusher

@@ -171,46 +174,6 @@ class BaseModelRouter(RouterToDict):
         """run tasks after processing the event"""
         return event

-    def _get_background_task_status(
-        self,
-    ) -> mlrun.common.schemas.BackgroundTaskState:
-        self._background_task_check_timestamp = now_date()
-        server: mlrun.serving.GraphServer = getattr(
-            self.context, "_server", None
-        ) or getattr(self.context, "server", None)
-        if not self.context.is_mock:
-            if server.model_endpoint_creation_task_name:
-                background_task = mlrun.get_run_db().get_project_background_task(
-                    server.project, server.model_endpoint_creation_task_name
-                )
-                logger.debug(
-                    "Checking model endpoint creation task status",
-                    task_name=server.model_endpoint_creation_task_name,
-                )
-                if (
-                    background_task.status.state
-                    in mlrun.common.schemas.BackgroundTaskState.terminal_states()
-                ):
-                    logger.info(
-                        f"Model endpoint creation task completed with state {background_task.status.state}"
-                    )
-                else:  # in progress
-                    logger.info(
-                        f"Model endpoint creation task is still in progress with the current state: "
-                        f"{background_task.status.state}. Events will not be monitored for the next "
-                        f"{mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period} seconds",
-                        name=self.name,
-                        background_task_check_timestamp=self._background_task_check_timestamp.isoformat(),
-                    )
-                return background_task.status.state
-            else:
-                logger.error(
-                    "Model endpoint creation task name not provided. This function is not being monitored.",
-                )
-        elif self.context.monitoring_mock:
-            return mlrun.common.schemas.BackgroundTaskState.succeeded
-        return mlrun.common.schemas.BackgroundTaskState.failed
-
     def _update_background_task_state(self, event):
         if not self.background_task_reached_terminal_state and (
             self._background_task_check_timestamp is None

@@ -219,7 +182,26 @@ class BaseModelRouter(RouterToDict):
                 seconds=mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period
             )
         ):
-
+            server: mlrun.serving.GraphServer = getattr(
+                self.context, "_server", None
+            ) or getattr(self.context, "server", None)
+            if not self.context.is_mock:
+                (
+                    self._background_task_current_state,
+                    self._background_task_check_timestamp,
+                    _,
+                ) = get_model_endpoints_creation_task_status(server)
+            elif self.context.monitoring_mock:
+                self._background_task_current_state = (
+                    mlrun.common.schemas.BackgroundTaskState.succeeded
+                )
+                self._background_task_check_timestamp = mlrun.utils.now_date()
+            else:
+                self._background_task_current_state = (
+                    mlrun.common.schemas.BackgroundTaskState.failed
+                )
+                self._background_task_check_timestamp = mlrun.utils.now_date()
+
         if event.body:
             event.body["background_task_state"] = (
                 self._background_task_current_state
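The router's private `_get_background_task_status` is replaced by the shared helper added to `mlrun/common/model_monitoring/helpers.py` (the +86-line file in this release). The helper's body is not shown in this diff, but the call sites here and in `system_steps.py` imply it returns a 3-tuple. A sketch of that inferred contract:

    import mlrun.common.schemas
    from mlrun.common.model_monitoring.helpers import (
        get_model_endpoints_creation_task_status,
    )

    # Inferred from the call sites: (task state, check timestamp, listed
    # endpoint uids). `server` is the step's mlrun.serving.GraphServer.
    state, checked_at, listed_uids = get_model_endpoints_creation_task_status(server)

    if state in mlrun.common.schemas.BackgroundTaskState.terminal_states():
        print(f"creation task reached terminal state {state} at {checked_at}")
    else:
        print(f"creation task still in progress: {state}")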
mlrun/serving/server.py
CHANGED
@@ -417,6 +417,7 @@ def add_monitoring_general_steps(
         "mlrun.serving.system_steps.BackgroundTaskStatus",
         "background_task_status_step",
         model_endpoint_creation_strategy=mlrun.common.schemas.ModelEndpointCreationStrategy.SKIP,
+        full_event=True,
     )
     monitor_flow_step = graph.add_step(
         "storey.Filter",
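Adding `full_event=True` makes the graph hand `BackgroundTaskStatus` the whole storey event object rather than just its body, which lets the step return `None` to drop events while endpoint creation is pending. A minimal illustration of the flag's effect on a map step (a hypothetical step, not the real one):

    import storey


    class DropUntilSucceeded(storey.MapClass):
        # Constructed with full_event=True, do() receives the storey Event
        # wrapper; returning None removes the event from the flow.
        def do(self, event):
            if event.body.get("background_task_state") != "succeeded":
                return None
            return event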
mlrun/serving/states.py
CHANGED
@@ -39,7 +39,7 @@ import mlrun.common.schemas as schemas
 from mlrun.artifacts.llm_prompt import LLMPromptArtifact, PlaceholderDefaultDict
 from mlrun.artifacts.model import ModelArtifact
 from mlrun.datastore.datastore_profile import (
-
+    DatastoreProfileKafkaStream,
     DatastoreProfileKafkaTarget,
     DatastoreProfileV3io,
     datastore_profile_read,

@@ -3398,7 +3398,7 @@ def _init_async_objects(context, steps):
            datastore_profile = datastore_profile_read(stream_path)
            if isinstance(
                datastore_profile,
-                (DatastoreProfileKafkaTarget,
+                (DatastoreProfileKafkaTarget, DatastoreProfileKafkaStream),
            ):
                step._async_object = KafkaStoreyTarget(
                    path=stream_path,

@@ -3414,7 +3414,7 @@ def _init_async_objects(context, steps):
            else:
                raise mlrun.errors.MLRunValueError(
                    f"Received an unexpected stream profile type: {type(datastore_profile)}\n"
-                    "Expects `DatastoreProfileV3io` or `
+                    "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
                )
        elif stream_path.startswith("kafka://") or kafka_brokers:
            topic, brokers = parse_kafka_url(stream_path, kafka_brokers)
mlrun/serving/system_steps.py
CHANGED
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import random
 from copy import copy
 from datetime import timedelta

@@ -25,10 +24,27 @@ import mlrun.artifacts
 import mlrun.common.schemas.model_monitoring as mm_schemas
 import mlrun.feature_store
 import mlrun.serving
+from mlrun.common.model_monitoring.helpers import (
+    get_model_endpoints_creation_task_status,
+)
 from mlrun.common.schemas import MonitoringData
 from mlrun.utils import get_data_from_path, logger


+class MatchingEndpointsState(mlrun.common.types.StrEnum):
+    all_matched = "all_matched"
+    not_all_matched = "not_all_matched"
+    no_check_needed = "no_check_needed"
+    not_yet_checked = "not_yet_matched"
+
+    @staticmethod
+    def success_states() -> list[str]:
+        return [
+            MatchingEndpointsState.all_matched,
+            MatchingEndpointsState.no_check_needed,
+        ]
+
+
 class MonitoringPreProcessor(storey.MapClass):
     """preprocess step, reconstructs the serving output event body to StreamProcessingEvent schema"""

@@ -317,6 +333,9 @@ class BackgroundTaskStatus(storey.MapClass):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        self.matching_endpoints = MatchingEndpointsState.not_yet_checked
+        self.graph_model_endpoint_uids: set = set()
+        self.listed_model_endpoint_uids: set = set()
         self.server: mlrun.serving.GraphServer = (
             getattr(self.context, "server", None) if self.context else None
         )

@@ -337,43 +356,47 @@ class BackgroundTaskStatus(storey.MapClass):
                )
            )
        ):
-
-            self.
-
-
-            self.
-
+            (
+                self._background_task_state,
+                self._background_task_check_timestamp,
+                self.listed_model_endpoint_uids,
+            ) = get_model_endpoints_creation_task_status(self.server)
+            if (
+                self.listed_model_endpoint_uids
+                and self.matching_endpoints == MatchingEndpointsState.not_yet_checked
+            ):
+                if not self.graph_model_endpoint_uids:
+                    self.graph_model_endpoint_uids = collect_model_endpoint_uids(
+                        self.server
+                    )
+
+                if self.graph_model_endpoint_uids.issubset(self.listed_model_endpoint_uids):
+                    self.matching_endpoints = MatchingEndpointsState.all_matched
+            elif self.listed_model_endpoint_uids is None:
+                self.matching_endpoints = MatchingEndpointsState.no_check_needed

        if (
            self._background_task_state
            == mlrun.common.schemas.BackgroundTaskState.succeeded
+            and self.matching_endpoints in MatchingEndpointsState.success_states()
        ):
            return event
        else:
            return None

-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-        else:  # in progress
-            logger.info(
-                f"Model endpoint creation task is still in progress with the current state: "
-                f"{background_task_state}. Events will not be monitored for the next "
-                f"{mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period} seconds",
-                name=self.name,
-                background_task_check_timestamp=self._background_task_check_timestamp.isoformat(),
-            )
+
+def collect_model_endpoint_uids(server: mlrun.serving.GraphServer) -> set[str]:
+    """Collects all model endpoint UIDs from the server's graph steps."""
+    model_endpoint_uids = set()
+    for step in server.graph.steps.values():
+        if hasattr(step, "monitoring_data"):
+            for model in step.monitoring_data.keys():
+                uid = step.monitoring_data[model].get(
+                    mlrun.common.schemas.MonitoringData.MODEL_ENDPOINT_UID
+                )
+                if uid:
+                    model_endpoint_uids.add(uid)
+    return model_endpoint_uids


 class SamplingStep(storey.MapClass):
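The new gate passes events only once the creation task has succeeded and every model-endpoint UID wired into the graph shows up in the helper's listing; the core of the check is a plain set-subset test:

    # UIDs collected from graph steps vs. UIDs listed by the status helper
    # (example values for illustration).
    graph_uids = {"uid-a", "uid-b"}
    listed_uids = {"uid-a", "uid-b", "uid-c"}

    # all_matched only when every graph endpoint exists server-side.
    assert graph_uids.issubset(listed_uids)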
mlrun/serving/v2_serving.py
CHANGED
@@ -24,6 +24,9 @@ import mlrun.common.schemas.model_monitoring
 import mlrun.model_monitoring
 from mlrun.utils import logger, now_date

+from ..common.model_monitoring.helpers import (
+    get_model_endpoints_creation_task_status,
+)
 from .utils import StepToDict, _extract_input_data, _update_result_body

@@ -474,22 +477,18 @@ class V2ModelServer(StepToDict):
        ) or getattr(self.context, "server", None)
        if not self.context.is_mock or self.context.monitoring_mock:
            if server.model_endpoint_creation_task_name:
-
-                    server
-                )
-                logger.debug(
-                    "Checking model endpoint creation task status",
-                    task_name=server.model_endpoint_creation_task_name,
+                background_task_state, _, _ = get_model_endpoints_creation_task_status(
+                    server
                )
                if (
-
+                    background_task_state
                    in mlrun.common.schemas.BackgroundTaskState.terminal_states()
                ):
                    logger.debug(
-                        f"Model endpoint creation task completed with state {
+                        f"Model endpoint creation task completed with state {background_task_state}"
                    )
                    if (
-
+                        background_task_state
                        == mlrun.common.schemas.BackgroundTaskState.succeeded
                    ):
                        self._model_logger = (

@@ -504,7 +503,7 @@ class V2ModelServer(StepToDict):
                else:  # in progress
                    logger.debug(
                        f"Model endpoint creation task is still in progress with the current state: "
-                        f"{
+                        f"{background_task_state}.",
                        name=self.name,
                    )
            else:
mlrun/utils/helpers.py
CHANGED
@@ -804,7 +804,7 @@ def remove_tag_from_artifact_uri(uri: str) -> Optional[str]:

 def extend_hub_uri_if_needed(uri) -> tuple[str, bool]:
     """
-    Retrieve the full uri of the
+    Retrieve the full uri of the function's yaml in the hub.

     :param uri: structure: "hub://[<source>/]<item-name>[:<tag>]"

@@ -845,7 +845,10 @@ def extend_hub_uri_if_needed(uri) -> tuple[str, bool]:
     # hub function directory name are with underscores instead of hyphens
     name = name.replace("-", "_")
     function_suffix = f"{name}/{tag}/src/function.yaml"
-
+    function_type = mlrun.common.schemas.hub.HubSourceType.functions
+    return indexed_source.source.get_full_uri(
+        function_suffix, function_type
+    ), is_hub_uri


 def gen_md_table(header, rows=None):
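`extend_hub_uri_if_needed` now resolves the item through the indexed hub source with an explicit `HubSourceType.functions`. For illustration, the suffix built for a URI like `hub://describe:latest` (hyphens in item names become underscores):

    name, tag = "describe", "latest"
    name = name.replace("-", "_")
    function_suffix = f"{name}/{tag}/src/function.yaml"
    assert function_suffix == "describe/latest/src/function.yaml"
    # The absolute URL then comes from the registered hub source:
    # indexed_source.source.get_full_uri(function_suffix, HubSourceType.functions)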
mlrun/utils/version/version.json
CHANGED