mlrun 1.10.0rc25__py3-none-any.whl → 1.10.0rc27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/artifacts/llm_prompt.py +8 -1
- mlrun/common/model_monitoring/helpers.py +86 -0
- mlrun/common/schemas/hub.py +11 -18
- mlrun/config.py +2 -3
- mlrun/datastore/__init__.py +2 -2
- mlrun/datastore/datastore_profile.py +27 -3
- mlrun/datastore/model_provider/huggingface_provider.py +5 -1
- mlrun/datastore/model_provider/model_provider.py +1 -1
- mlrun/datastore/s3.py +24 -2
- mlrun/datastore/storeytargets.py +2 -3
- mlrun/db/base.py +14 -0
- mlrun/db/httpdb.py +11 -2
- mlrun/db/nopdb.py +13 -0
- mlrun/k8s_utils.py +0 -14
- mlrun/model_monitoring/applications/base.py +20 -3
- mlrun/model_monitoring/controller.py +5 -3
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +3 -1
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +3 -0
- mlrun/model_monitoring/helpers.py +5 -5
- mlrun/projects/pipelines.py +2 -2
- mlrun/projects/project.py +5 -5
- mlrun/run.py +12 -1
- mlrun/runtimes/base.py +0 -3
- mlrun/runtimes/mounts.py +15 -2
- mlrun/runtimes/nuclio/function.py +35 -26
- mlrun/runtimes/pod.py +153 -11
- mlrun/serving/routers.py +23 -41
- mlrun/serving/server.py +1 -0
- mlrun/serving/states.py +3 -3
- mlrun/serving/system_steps.py +52 -29
- mlrun/serving/v2_serving.py +9 -10
- mlrun/utils/helpers.py +10 -13
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +2 -4
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc27.dist-info}/METADATA +22 -26
- {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc27.dist-info}/RECORD +44 -44
- {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc27.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc27.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc27.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc27.dist-info}/top_level.txt +0 -0
|
@@ -25,10 +25,12 @@ from mlrun.utils import logger
|
|
|
25
25
|
|
|
26
26
|
def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
|
|
27
27
|
"""
|
|
28
|
-
Normalize user
|
|
29
|
-
to a form V3IO frames tolerates.
|
|
28
|
+
Normalize user-defined keys (e.g., model input data and predictions) to a format V3IO Frames tolerates.
|
|
30
29
|
|
|
31
|
-
|
|
30
|
+
- Keys must match regex: '^[a-zA-Z_:]([a-zA-Z0-9_:])*$'
|
|
31
|
+
- Replace invalid characters (e.g., '-') with '_'.
|
|
32
|
+
- Prefix keys starting with digits with '_'.
|
|
33
|
+
- Flatten nested dictionaries using dot notation, while normalizing keys recursively.
|
|
32
34
|
"""
|
|
33
35
|
prefix = "_"
|
|
34
36
|
|
|
@@ -38,7 +40,18 @@ def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
|
|
|
38
40
|
return prefix + key
|
|
39
41
|
return key
|
|
40
42
|
|
|
41
|
-
|
|
43
|
+
def flatten_dict(d: dict[str, Any], parent_key: str = "") -> dict[str, Any]:
|
|
44
|
+
items = {}
|
|
45
|
+
for k, v in d.items():
|
|
46
|
+
new_key = norm_key(k)
|
|
47
|
+
full_key = f"{parent_key}.{new_key}" if parent_key else new_key
|
|
48
|
+
if isinstance(v, dict):
|
|
49
|
+
items.update(flatten_dict(v, full_key))
|
|
50
|
+
else:
|
|
51
|
+
items[full_key] = v
|
|
52
|
+
return items
|
|
53
|
+
|
|
54
|
+
return flatten_dict(event)
|
|
42
55
|
|
|
43
56
|
|
|
44
57
|
class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
|
|
@@ -973,6 +973,9 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
973
973
|
start: Optional[datetime] = None,
|
|
974
974
|
end: Optional[datetime] = None,
|
|
975
975
|
) -> dict[str, float]:
|
|
976
|
+
if not endpoint_ids:
|
|
977
|
+
return {}
|
|
978
|
+
|
|
976
979
|
# Get the last request timestamp for each endpoint from the KV table.
|
|
977
980
|
# The result of the query is a list of dictionaries,
|
|
978
981
|
# each dictionary contains the endpoint id and the last request timestamp.
|
|
@@ -143,7 +143,7 @@ def get_stream_path(
|
|
|
143
143
|
return stream_uri.replace("v3io://", f"ds://{profile.name}")
|
|
144
144
|
|
|
145
145
|
elif isinstance(
|
|
146
|
-
profile, mlrun.datastore.datastore_profile.
|
|
146
|
+
profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream
|
|
147
147
|
):
|
|
148
148
|
topic = mlrun.common.model_monitoring.helpers.get_kafka_topic(
|
|
149
149
|
project=project, function_name=function_name
|
|
@@ -152,7 +152,7 @@ def get_stream_path(
|
|
|
152
152
|
else:
|
|
153
153
|
raise mlrun.errors.MLRunValueError(
|
|
154
154
|
f"Received an unexpected stream profile type: {type(profile)}\n"
|
|
155
|
-
"Expects `DatastoreProfileV3io` or `
|
|
155
|
+
"Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
|
|
156
156
|
)
|
|
157
157
|
|
|
158
158
|
|
|
@@ -300,7 +300,7 @@ def _get_v3io_output_stream(
|
|
|
300
300
|
|
|
301
301
|
def _get_kafka_output_stream(
|
|
302
302
|
*,
|
|
303
|
-
kafka_profile: mlrun.datastore.datastore_profile.
|
|
303
|
+
kafka_profile: mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream,
|
|
304
304
|
project: str,
|
|
305
305
|
function_name: str,
|
|
306
306
|
mock: bool = False,
|
|
@@ -356,7 +356,7 @@ def get_output_stream(
|
|
|
356
356
|
)
|
|
357
357
|
|
|
358
358
|
elif isinstance(
|
|
359
|
-
profile, mlrun.datastore.datastore_profile.
|
|
359
|
+
profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream
|
|
360
360
|
):
|
|
361
361
|
return _get_kafka_output_stream(
|
|
362
362
|
kafka_profile=profile,
|
|
@@ -368,7 +368,7 @@ def get_output_stream(
|
|
|
368
368
|
else:
|
|
369
369
|
raise mlrun.errors.MLRunValueError(
|
|
370
370
|
f"Received an unexpected stream profile type: {type(profile)}\n"
|
|
371
|
-
"Expects `DatastoreProfileV3io` or `
|
|
371
|
+
"Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
|
|
372
372
|
)
|
|
373
373
|
|
|
374
374
|
|
mlrun/projects/pipelines.py
CHANGED
|
@@ -228,11 +228,11 @@ class _PipelineContext:
|
|
|
228
228
|
force_run_local = mlrun.mlconf.force_run_local
|
|
229
229
|
if force_run_local is None or force_run_local == "auto":
|
|
230
230
|
force_run_local = not mlrun.mlconf.is_api_running_on_k8s()
|
|
231
|
+
|
|
232
|
+
if self.workflow:
|
|
231
233
|
if not mlrun.mlconf.kfp_url:
|
|
232
234
|
logger.debug("Kubeflow pipeline URL is not set, running locally")
|
|
233
235
|
force_run_local = True
|
|
234
|
-
|
|
235
|
-
if self.workflow:
|
|
236
236
|
force_run_local = force_run_local or self.workflow.run_local
|
|
237
237
|
|
|
238
238
|
return force_run_local
|
mlrun/projects/project.py
CHANGED
|
@@ -3816,7 +3816,7 @@ class MlrunProject(ModelObj):
|
|
|
3816
3816
|
|
|
3817
3817
|
import mlrun
|
|
3818
3818
|
from mlrun.datastore.datastore_profile import (
|
|
3819
|
-
|
|
3819
|
+
DatastoreProfileKafkaStream,
|
|
3820
3820
|
DatastoreProfileTDEngine,
|
|
3821
3821
|
)
|
|
3822
3822
|
|
|
@@ -3833,7 +3833,7 @@ class MlrunProject(ModelObj):
|
|
|
3833
3833
|
project.register_datastore_profile(tsdb_profile)
|
|
3834
3834
|
|
|
3835
3835
|
# Create and register stream profile
|
|
3836
|
-
stream_profile =
|
|
3836
|
+
stream_profile = DatastoreProfileKafkaStream(
|
|
3837
3837
|
name="my-kafka",
|
|
3838
3838
|
brokers=["<kafka-broker-ip-address>:9094"],
|
|
3839
3839
|
topics=[], # Keep the topics list empty
|
|
@@ -3875,9 +3875,9 @@ class MlrunProject(ModelObj):
|
|
|
3875
3875
|
|
|
3876
3876
|
.. code-block:: python
|
|
3877
3877
|
|
|
3878
|
-
from mlrun.datastore.datastore_profile import
|
|
3878
|
+
from mlrun.datastore.datastore_profile import DatastoreProfileKafkaStream
|
|
3879
3879
|
|
|
3880
|
-
stream_profile =
|
|
3880
|
+
stream_profile = DatastoreProfileKafkaStream(
|
|
3881
3881
|
name="confluent-kafka",
|
|
3882
3882
|
brokers=["<server-domain-start>.confluent.cloud:9092"],
|
|
3883
3883
|
topics=[],
|
|
@@ -3906,7 +3906,7 @@ class MlrunProject(ModelObj):
|
|
|
3906
3906
|
The supported profiles are:
|
|
3907
3907
|
|
|
3908
3908
|
* :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileV3io`
|
|
3909
|
-
* :py:class:`~mlrun.datastore.datastore_profile.
|
|
3909
|
+
* :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream`
|
|
3910
3910
|
|
|
3911
3911
|
You need to register one of them, and pass the profile's name.
|
|
3912
3912
|
:param replace_creds: If ``True`` - override the existing credentials.
|
mlrun/run.py
CHANGED
|
@@ -222,7 +222,8 @@ def get_or_create_ctx(
|
|
|
222
222
|
:param spec: dictionary holding run spec
|
|
223
223
|
:param with_env: look for context in environment vars, default True
|
|
224
224
|
:param rundb: path/url to the metadata and artifact database
|
|
225
|
-
:param project: project to initiate the context in (by default `mlrun.mlconf.active_project`)
|
|
225
|
+
:param project: project to initiate the context in (by default `mlrun.mlconf.active_project`).
|
|
226
|
+
If not set, an active project must exist.
|
|
226
227
|
:param upload_artifacts: when using local context (not as part of a job/run), upload artifacts to the
|
|
227
228
|
system default artifact path location
|
|
228
229
|
:return: execution context
|
|
@@ -277,6 +278,16 @@ def get_or_create_ctx(
|
|
|
277
278
|
if newspec and not isinstance(newspec, dict):
|
|
278
279
|
newspec = json.loads(newspec)
|
|
279
280
|
|
|
281
|
+
if (
|
|
282
|
+
not newspec.get("metadata", {}).get("project")
|
|
283
|
+
and not project
|
|
284
|
+
and not mlconf.active_project
|
|
285
|
+
):
|
|
286
|
+
raise mlrun.errors.MLRunMissingProjectError(
|
|
287
|
+
"""No active project found. Make sure to set an active project using: mlrun.get_or_create_project()
|
|
288
|
+
You can verify the active project with: mlrun.mlconf.active_project"""
|
|
289
|
+
)
|
|
290
|
+
|
|
280
291
|
if not newspec:
|
|
281
292
|
newspec = {}
|
|
282
293
|
if upload_artifacts:
|
mlrun/runtimes/base.py
CHANGED
|
@@ -142,9 +142,6 @@ class FunctionSpec(ModelObj):
|
|
|
142
142
|
def build(self, build):
|
|
143
143
|
self._build = self._verify_dict(build, "build", ImageBuilder)
|
|
144
144
|
|
|
145
|
-
def enrich_function_preemption_spec(self):
|
|
146
|
-
pass
|
|
147
|
-
|
|
148
145
|
def validate_service_account(self, allowed_service_accounts):
|
|
149
146
|
pass
|
|
150
147
|
|
mlrun/runtimes/mounts.py
CHANGED
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
import os
|
|
16
16
|
import typing
|
|
17
|
+
import warnings
|
|
17
18
|
from collections import namedtuple
|
|
18
19
|
|
|
19
20
|
from mlrun.config import config
|
|
@@ -247,10 +248,22 @@ def mount_s3(
|
|
|
247
248
|
def _use_s3_cred(runtime: "KubeResource"):
|
|
248
249
|
_access_key = aws_access_key or os.environ.get(prefix + "AWS_ACCESS_KEY_ID")
|
|
249
250
|
_secret_key = aws_secret_key or os.environ.get(prefix + "AWS_SECRET_ACCESS_KEY")
|
|
250
|
-
|
|
251
|
+
|
|
252
|
+
# Check for endpoint URL with backward compatibility
|
|
253
|
+
_endpoint_url = endpoint_url or os.environ.get(prefix + "AWS_ENDPOINT_URL_S3")
|
|
254
|
+
if not _endpoint_url:
|
|
255
|
+
# Check for deprecated environment variable
|
|
256
|
+
_endpoint_url = os.environ.get(prefix + "S3_ENDPOINT_URL")
|
|
257
|
+
if _endpoint_url:
|
|
258
|
+
warnings.warn(
|
|
259
|
+
"S3_ENDPOINT_URL is deprecated in 1.10.0 and will be removed in 1.12.0, "
|
|
260
|
+
"use AWS_ENDPOINT_URL_S3 instead.",
|
|
261
|
+
# TODO: Remove this in 1.12.0
|
|
262
|
+
FutureWarning,
|
|
263
|
+
)
|
|
251
264
|
|
|
252
265
|
if _endpoint_url:
|
|
253
|
-
runtime.set_env(prefix + "
|
|
266
|
+
runtime.set_env(prefix + "AWS_ENDPOINT_URL_S3", _endpoint_url)
|
|
254
267
|
if aws_region:
|
|
255
268
|
runtime.set_env(prefix + "AWS_REGION", aws_region)
|
|
256
269
|
if non_anonymous:
|
|
@@ -968,24 +968,6 @@ class RemoteRuntime(KubeResource):
|
|
|
968
968
|
self._mock_server = None
|
|
969
969
|
|
|
970
970
|
if "://" not in path:
|
|
971
|
-
if not self.status.address:
|
|
972
|
-
# here we check that if default http trigger is disabled, function contains a custom http trigger
|
|
973
|
-
# Otherwise, the function is not invokable, so we raise an error
|
|
974
|
-
if (
|
|
975
|
-
not self._trigger_of_kind_exists(kind="http")
|
|
976
|
-
and self.spec.disable_default_http_trigger
|
|
977
|
-
):
|
|
978
|
-
raise mlrun.errors.MLRunPreconditionFailedError(
|
|
979
|
-
"Default http trigger creation is disabled and there is no any other custom http trigger, "
|
|
980
|
-
"so function can not be invoked via http. Either enable default http trigger creation or "
|
|
981
|
-
"create custom http trigger"
|
|
982
|
-
)
|
|
983
|
-
state, _, _ = self._get_state()
|
|
984
|
-
if state not in ["ready", "scaledToZero"]:
|
|
985
|
-
logger.warning(f"Function is in the {state} state")
|
|
986
|
-
if not self.status.address:
|
|
987
|
-
raise ValueError("no function address first run .deploy()")
|
|
988
|
-
|
|
989
971
|
path = self._resolve_invocation_url(path, force_external_address)
|
|
990
972
|
|
|
991
973
|
if headers is None:
|
|
@@ -1228,19 +1210,47 @@ class RemoteRuntime(KubeResource):
|
|
|
1228
1210
|
# internal / external invocation urls is a nuclio >= 1.6.x feature
|
|
1229
1211
|
# try to infer the invocation url from the internal and if not exists, use external.
|
|
1230
1212
|
# $$$$ we do not want to use the external invocation url (e.g.: ingress, nodePort, etc.)
|
|
1213
|
+
|
|
1214
|
+
# check function state before invocation
|
|
1215
|
+
state, _, _ = self._get_state()
|
|
1216
|
+
if state not in ["ready", "scaledToZero"]:
|
|
1217
|
+
logger.warning(f"Function is in the {state} state")
|
|
1218
|
+
|
|
1219
|
+
# prefer internal invocation url if running inside k8s cluster
|
|
1231
1220
|
if (
|
|
1232
1221
|
not force_external_address
|
|
1233
1222
|
and self.status.internal_invocation_urls
|
|
1234
1223
|
and mlrun.k8s_utils.is_running_inside_kubernetes_cluster()
|
|
1235
1224
|
):
|
|
1236
|
-
|
|
1225
|
+
url = mlrun.utils.helpers.join_urls(
|
|
1237
1226
|
f"http://{self.status.internal_invocation_urls[0]}", path
|
|
1238
1227
|
)
|
|
1228
|
+
logger.debug(
|
|
1229
|
+
f"Using internal invocation url {url}. Make sure you have network access to the k8s cluster. "
|
|
1230
|
+
f"Otherwise, set force_external_address to True"
|
|
1231
|
+
)
|
|
1232
|
+
return url
|
|
1239
1233
|
|
|
1240
1234
|
if self.status.external_invocation_urls:
|
|
1241
1235
|
return mlrun.utils.helpers.join_urls(
|
|
1242
1236
|
f"http://{self.status.external_invocation_urls[0]}", path
|
|
1243
1237
|
)
|
|
1238
|
+
|
|
1239
|
+
if not self.status.address:
|
|
1240
|
+
# if there is no address
|
|
1241
|
+
# here we check that if default http trigger is disabled, function contains a custom http trigger
|
|
1242
|
+
# Otherwise, the function is not invokable, so we raise an error
|
|
1243
|
+
if (
|
|
1244
|
+
not self._trigger_of_kind_exists(kind="http")
|
|
1245
|
+
and self.spec.disable_default_http_trigger
|
|
1246
|
+
):
|
|
1247
|
+
raise mlrun.errors.MLRunPreconditionFailedError(
|
|
1248
|
+
"Default http trigger creation is disabled and there is no any other custom http trigger, "
|
|
1249
|
+
"so function can not be invoked via http. Either enable default http trigger creation or "
|
|
1250
|
+
"create custom http trigger"
|
|
1251
|
+
)
|
|
1252
|
+
else:
|
|
1253
|
+
raise ValueError("no function address first run .deploy()")
|
|
1244
1254
|
else:
|
|
1245
1255
|
return mlrun.utils.helpers.join_urls(f"http://{self.status.address}", path)
|
|
1246
1256
|
|
|
@@ -1294,6 +1304,8 @@ class RemoteRuntime(KubeResource):
|
|
|
1294
1304
|
def get_url(
|
|
1295
1305
|
self,
|
|
1296
1306
|
force_external_address: bool = False,
|
|
1307
|
+
# leaving auth_info for BC
|
|
1308
|
+
# TODO: remove in 1.12.0
|
|
1297
1309
|
auth_info: AuthInfo = None,
|
|
1298
1310
|
):
|
|
1299
1311
|
"""
|
|
@@ -1304,13 +1316,10 @@ class RemoteRuntime(KubeResource):
|
|
|
1304
1316
|
|
|
1305
1317
|
:return: returns function's url
|
|
1306
1318
|
"""
|
|
1307
|
-
if
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
"no function address or not ready, first run .deploy()"
|
|
1312
|
-
)
|
|
1313
|
-
|
|
1319
|
+
if auth_info:
|
|
1320
|
+
logger.warning(
|
|
1321
|
+
"Deprecated parameter 'auth_info' was provided, but will be ignored. Will be removed in 1.12.0."
|
|
1322
|
+
)
|
|
1314
1323
|
return self._resolve_invocation_url("", force_external_address)
|
|
1315
1324
|
|
|
1316
1325
|
@staticmethod
|
mlrun/runtimes/pod.py
CHANGED
|
@@ -17,6 +17,7 @@ import os
|
|
|
17
17
|
import re
|
|
18
18
|
import time
|
|
19
19
|
import typing
|
|
20
|
+
import warnings
|
|
20
21
|
from collections.abc import Iterable
|
|
21
22
|
from enum import Enum
|
|
22
23
|
|
|
@@ -35,6 +36,7 @@ from mlrun.common.schemas import (
|
|
|
35
36
|
|
|
36
37
|
from ..config import config as mlconf
|
|
37
38
|
from ..k8s_utils import (
|
|
39
|
+
generate_preemptible_nodes_affinity_terms,
|
|
38
40
|
validate_node_selectors,
|
|
39
41
|
)
|
|
40
42
|
from ..utils import logger, update_in
|
|
@@ -874,6 +876,133 @@ class KubeResource(BaseRuntime):
|
|
|
874
876
|
"""
|
|
875
877
|
self.spec.with_requests(mem, cpu, patch=patch)
|
|
876
878
|
|
|
879
|
+
@staticmethod
|
|
880
|
+
def detect_preemptible_node_selector(node_selector: dict[str, str]) -> list[str]:
|
|
881
|
+
"""
|
|
882
|
+
Check whether any provided node selector matches preemptible selectors.
|
|
883
|
+
|
|
884
|
+
:param node_selector: User-provided node selector mapping.
|
|
885
|
+
:return: List of `"key='value'"` strings that match a preemptible selector.
|
|
886
|
+
"""
|
|
887
|
+
preemptible_node_selector = mlconf.get_preemptible_node_selector()
|
|
888
|
+
|
|
889
|
+
return [
|
|
890
|
+
f"'{key}': '{val}'"
|
|
891
|
+
for key, val in node_selector.items()
|
|
892
|
+
if preemptible_node_selector.get(key) == val
|
|
893
|
+
]
|
|
894
|
+
|
|
895
|
+
def detect_preemptible_tolerations(
|
|
896
|
+
self, tolerations: list[k8s_client.V1Toleration]
|
|
897
|
+
) -> list[str]:
|
|
898
|
+
"""
|
|
899
|
+
Check whether any provided toleration matches preemptible tolerations.
|
|
900
|
+
|
|
901
|
+
:param tolerations: User-provided tolerations.
|
|
902
|
+
:return: List of formatted toleration strings that are considered preemptible.
|
|
903
|
+
"""
|
|
904
|
+
preemptible_tolerations = [
|
|
905
|
+
k8s_client.V1Toleration(
|
|
906
|
+
key=toleration.get("key"),
|
|
907
|
+
value=toleration.get("value"),
|
|
908
|
+
effect=toleration.get("effect"),
|
|
909
|
+
)
|
|
910
|
+
for toleration in mlconf.get_preemptible_tolerations()
|
|
911
|
+
]
|
|
912
|
+
|
|
913
|
+
def _format_toleration(toleration):
|
|
914
|
+
return f"'{toleration.key}'='{toleration.value}' (effect: '{toleration.effect}')"
|
|
915
|
+
|
|
916
|
+
return [
|
|
917
|
+
_format_toleration(toleration)
|
|
918
|
+
for toleration in tolerations
|
|
919
|
+
if toleration in preemptible_tolerations
|
|
920
|
+
]
|
|
921
|
+
|
|
922
|
+
def detect_preemptible_affinity(self, affinity: k8s_client.V1Affinity) -> list[str]:
|
|
923
|
+
"""
|
|
924
|
+
Check whether any provided affinity rules match preemptible affinity configs.
|
|
925
|
+
|
|
926
|
+
:param affinity: User-provided affinity object.
|
|
927
|
+
:return: List of formatted expressions that overlap with preemptible terms.
|
|
928
|
+
"""
|
|
929
|
+
preemptible_affinity_terms = generate_preemptible_nodes_affinity_terms()
|
|
930
|
+
conflicting_affinities = []
|
|
931
|
+
|
|
932
|
+
if (
|
|
933
|
+
affinity
|
|
934
|
+
and affinity.node_affinity
|
|
935
|
+
and affinity.node_affinity.required_during_scheduling_ignored_during_execution
|
|
936
|
+
):
|
|
937
|
+
user_terms = affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms
|
|
938
|
+
for user_term in user_terms:
|
|
939
|
+
user_expressions = {
|
|
940
|
+
(expr.key, expr.operator, tuple(expr.values or []))
|
|
941
|
+
for expr in user_term.match_expressions or []
|
|
942
|
+
}
|
|
943
|
+
|
|
944
|
+
for preemptible_term in preemptible_affinity_terms:
|
|
945
|
+
preemptible_expressions = {
|
|
946
|
+
(expr.key, expr.operator, tuple(expr.values or []))
|
|
947
|
+
for expr in preemptible_term.match_expressions or []
|
|
948
|
+
}
|
|
949
|
+
|
|
950
|
+
# Ensure operators match and preemptible expressions are present
|
|
951
|
+
common_exprs = user_expressions & preemptible_expressions
|
|
952
|
+
if common_exprs:
|
|
953
|
+
formatted = ", ".join(
|
|
954
|
+
f"'{key} {operator} {list(values)}'"
|
|
955
|
+
for key, operator, values in common_exprs
|
|
956
|
+
)
|
|
957
|
+
conflicting_affinities.append(formatted)
|
|
958
|
+
return conflicting_affinities
|
|
959
|
+
|
|
960
|
+
def raise_preemptible_warning(
|
|
961
|
+
self,
|
|
962
|
+
node_selector: typing.Optional[dict[str, str]],
|
|
963
|
+
tolerations: typing.Optional[list[k8s_client.V1Toleration]],
|
|
964
|
+
affinity: typing.Optional[k8s_client.V1Affinity],
|
|
965
|
+
) -> None:
|
|
966
|
+
"""
|
|
967
|
+
Detect conflicts and emit a single consolidated warning if needed.
|
|
968
|
+
|
|
969
|
+
:param node_selector: User-provided node selector.
|
|
970
|
+
:param tolerations: User-provided tolerations.
|
|
971
|
+
:param affinity: User-provided affinity.
|
|
972
|
+
:warns: PreemptionWarning - Emitted when any of the provided selectors,
|
|
973
|
+
tolerations, or affinity terms match the configured preemptible
|
|
974
|
+
settings. The message lists the conflicting items.
|
|
975
|
+
"""
|
|
976
|
+
conflict_messages = []
|
|
977
|
+
|
|
978
|
+
if node_selector:
|
|
979
|
+
ns_conflicts = ", ".join(
|
|
980
|
+
self.detect_preemptible_node_selector(node_selector)
|
|
981
|
+
)
|
|
982
|
+
if ns_conflicts:
|
|
983
|
+
conflict_messages.append(f"Node selectors: {ns_conflicts}")
|
|
984
|
+
|
|
985
|
+
if tolerations:
|
|
986
|
+
tol_conflicts = ", ".join(self.detect_preemptible_tolerations(tolerations))
|
|
987
|
+
if tol_conflicts:
|
|
988
|
+
conflict_messages.append(f"Tolerations: {tol_conflicts}")
|
|
989
|
+
|
|
990
|
+
if affinity:
|
|
991
|
+
affinity_conflicts = ", ".join(self.detect_preemptible_affinity(affinity))
|
|
992
|
+
if affinity_conflicts:
|
|
993
|
+
conflict_messages.append(f"Affinity: {affinity_conflicts}")
|
|
994
|
+
|
|
995
|
+
if conflict_messages:
|
|
996
|
+
warning_componentes = "; \n".join(conflict_messages)
|
|
997
|
+
warnings.warn(
|
|
998
|
+
f"Warning: based on MLRun's preemptible node configuration, the following components \n"
|
|
999
|
+
f"may be removed or adjusted at runtime:\n"
|
|
1000
|
+
f"{warning_componentes}.\n"
|
|
1001
|
+
"This adjustment depends on the function's preemption mode. \n"
|
|
1002
|
+
"The list of potential adjusted preemptible selectors can be viewed here: "
|
|
1003
|
+
"mlrun.mlconf.get_preemptible_node_selector() and mlrun.mlconf.get_preemptible_tolerations()."
|
|
1004
|
+
)
|
|
1005
|
+
|
|
877
1006
|
def with_node_selection(
|
|
878
1007
|
self,
|
|
879
1008
|
node_name: typing.Optional[str] = None,
|
|
@@ -882,18 +1011,26 @@ class KubeResource(BaseRuntime):
|
|
|
882
1011
|
tolerations: typing.Optional[list[k8s_client.V1Toleration]] = None,
|
|
883
1012
|
):
|
|
884
1013
|
"""
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity
|
|
891
|
-
for details
|
|
892
|
-
:param tolerations: Tolerations are applied to pods, and allow (but do not require) the pods to schedule
|
|
893
|
-
onto nodes with matching taints - see
|
|
894
|
-
https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration
|
|
895
|
-
for details
|
|
1014
|
+
Configure Kubernetes node scheduling for this function.
|
|
1015
|
+
|
|
1016
|
+
Updates one or more scheduling hints: exact node pinning, label-based selection,
|
|
1017
|
+
affinity/anti-affinity rules, and taint tolerations. Passing ``None`` leaves the
|
|
1018
|
+
current value unchanged; pass an empty dict/list (e.g., ``{}``, ``[]``) to clear.
|
|
896
1019
|
|
|
1020
|
+
:param node_name: Exact Kubernetes node name to pin the pod to.
|
|
1021
|
+
:param node_selector: Mapping of label selectors. Use ``{}`` to clear.
|
|
1022
|
+
:param affinity: :class:`kubernetes.client.V1Affinity` constraints.
|
|
1023
|
+
:param tolerations: List of :class:`kubernetes.client.V1Toleration`. Use ``[]`` to clear.
|
|
1024
|
+
:warns: PreemptionWarning - Emitted if provided selectors/tolerations/affinity
|
|
1025
|
+
conflict with the function's preemption mode.
|
|
1026
|
+
|
|
1027
|
+
Example usage:
|
|
1028
|
+
Prefer a GPU pool and allow scheduling on spot nodes::
|
|
1029
|
+
|
|
1030
|
+
job.with_node_selection(
|
|
1031
|
+
node_selector={"nodepool": "gpu"},
|
|
1032
|
+
tolerations=[k8s_client.V1Toleration(key="spot", operator="Exists")],
|
|
1033
|
+
)
|
|
897
1034
|
"""
|
|
898
1035
|
if node_name:
|
|
899
1036
|
self.spec.node_name = node_name
|
|
@@ -904,6 +1041,11 @@ class KubeResource(BaseRuntime):
|
|
|
904
1041
|
self.spec.affinity = affinity
|
|
905
1042
|
if tolerations is not None:
|
|
906
1043
|
self.spec.tolerations = tolerations
|
|
1044
|
+
self.raise_preemptible_warning(
|
|
1045
|
+
node_selector=self.spec.node_selector,
|
|
1046
|
+
tolerations=self.spec.tolerations,
|
|
1047
|
+
affinity=self.spec.affinity,
|
|
1048
|
+
)
|
|
907
1049
|
|
|
908
1050
|
def with_priority_class(self, name: typing.Optional[str] = None):
|
|
909
1051
|
"""
|
mlrun/serving/routers.py
CHANGED
|
@@ -31,6 +31,9 @@ import mlrun.common.model_monitoring
|
|
|
31
31
|
import mlrun.common.schemas.model_monitoring
|
|
32
32
|
from mlrun.utils import logger, now_date
|
|
33
33
|
|
|
34
|
+
from ..common.model_monitoring.helpers import (
|
|
35
|
+
get_model_endpoints_creation_task_status,
|
|
36
|
+
)
|
|
34
37
|
from .utils import RouterToDict, _extract_input_data, _update_result_body
|
|
35
38
|
from .v2_serving import _ModelLogPusher
|
|
36
39
|
|
|
@@ -171,46 +174,6 @@ class BaseModelRouter(RouterToDict):
|
|
|
171
174
|
"""run tasks after processing the event"""
|
|
172
175
|
return event
|
|
173
176
|
|
|
174
|
-
def _get_background_task_status(
|
|
175
|
-
self,
|
|
176
|
-
) -> mlrun.common.schemas.BackgroundTaskState:
|
|
177
|
-
self._background_task_check_timestamp = now_date()
|
|
178
|
-
server: mlrun.serving.GraphServer = getattr(
|
|
179
|
-
self.context, "_server", None
|
|
180
|
-
) or getattr(self.context, "server", None)
|
|
181
|
-
if not self.context.is_mock:
|
|
182
|
-
if server.model_endpoint_creation_task_name:
|
|
183
|
-
background_task = mlrun.get_run_db().get_project_background_task(
|
|
184
|
-
server.project, server.model_endpoint_creation_task_name
|
|
185
|
-
)
|
|
186
|
-
logger.debug(
|
|
187
|
-
"Checking model endpoint creation task status",
|
|
188
|
-
task_name=server.model_endpoint_creation_task_name,
|
|
189
|
-
)
|
|
190
|
-
if (
|
|
191
|
-
background_task.status.state
|
|
192
|
-
in mlrun.common.schemas.BackgroundTaskState.terminal_states()
|
|
193
|
-
):
|
|
194
|
-
logger.info(
|
|
195
|
-
f"Model endpoint creation task completed with state {background_task.status.state}"
|
|
196
|
-
)
|
|
197
|
-
else: # in progress
|
|
198
|
-
logger.info(
|
|
199
|
-
f"Model endpoint creation task is still in progress with the current state: "
|
|
200
|
-
f"{background_task.status.state}. Events will not be monitored for the next "
|
|
201
|
-
f"{mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period} seconds",
|
|
202
|
-
name=self.name,
|
|
203
|
-
background_task_check_timestamp=self._background_task_check_timestamp.isoformat(),
|
|
204
|
-
)
|
|
205
|
-
return background_task.status.state
|
|
206
|
-
else:
|
|
207
|
-
logger.error(
|
|
208
|
-
"Model endpoint creation task name not provided. This function is not being monitored.",
|
|
209
|
-
)
|
|
210
|
-
elif self.context.monitoring_mock:
|
|
211
|
-
return mlrun.common.schemas.BackgroundTaskState.succeeded
|
|
212
|
-
return mlrun.common.schemas.BackgroundTaskState.failed
|
|
213
|
-
|
|
214
177
|
def _update_background_task_state(self, event):
|
|
215
178
|
if not self.background_task_reached_terminal_state and (
|
|
216
179
|
self._background_task_check_timestamp is None
|
|
@@ -219,7 +182,26 @@ class BaseModelRouter(RouterToDict):
|
|
|
219
182
|
seconds=mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period
|
|
220
183
|
)
|
|
221
184
|
):
|
|
222
|
-
|
|
185
|
+
server: mlrun.serving.GraphServer = getattr(
|
|
186
|
+
self.context, "_server", None
|
|
187
|
+
) or getattr(self.context, "server", None)
|
|
188
|
+
if not self.context.is_mock:
|
|
189
|
+
(
|
|
190
|
+
self._background_task_current_state,
|
|
191
|
+
self._background_task_check_timestamp,
|
|
192
|
+
_,
|
|
193
|
+
) = get_model_endpoints_creation_task_status(server)
|
|
194
|
+
elif self.context.monitoring_mock:
|
|
195
|
+
self._background_task_current_state = (
|
|
196
|
+
mlrun.common.schemas.BackgroundTaskState.succeeded
|
|
197
|
+
)
|
|
198
|
+
self._background_task_check_timestamp = mlrun.utils.now_date()
|
|
199
|
+
else:
|
|
200
|
+
self._background_task_current_state = (
|
|
201
|
+
mlrun.common.schemas.BackgroundTaskState.failed
|
|
202
|
+
)
|
|
203
|
+
self._background_task_check_timestamp = mlrun.utils.now_date()
|
|
204
|
+
|
|
223
205
|
if event.body:
|
|
224
206
|
event.body["background_task_state"] = (
|
|
225
207
|
self._background_task_current_state
|
mlrun/serving/server.py
CHANGED
|
@@ -417,6 +417,7 @@ def add_monitoring_general_steps(
|
|
|
417
417
|
"mlrun.serving.system_steps.BackgroundTaskStatus",
|
|
418
418
|
"background_task_status_step",
|
|
419
419
|
model_endpoint_creation_strategy=mlrun.common.schemas.ModelEndpointCreationStrategy.SKIP,
|
|
420
|
+
full_event=True,
|
|
420
421
|
)
|
|
421
422
|
monitor_flow_step = graph.add_step(
|
|
422
423
|
"storey.Filter",
|
mlrun/serving/states.py
CHANGED
|
@@ -39,7 +39,7 @@ import mlrun.common.schemas as schemas
|
|
|
39
39
|
from mlrun.artifacts.llm_prompt import LLMPromptArtifact, PlaceholderDefaultDict
|
|
40
40
|
from mlrun.artifacts.model import ModelArtifact
|
|
41
41
|
from mlrun.datastore.datastore_profile import (
|
|
42
|
-
|
|
42
|
+
DatastoreProfileKafkaStream,
|
|
43
43
|
DatastoreProfileKafkaTarget,
|
|
44
44
|
DatastoreProfileV3io,
|
|
45
45
|
datastore_profile_read,
|
|
@@ -3398,7 +3398,7 @@ def _init_async_objects(context, steps):
|
|
|
3398
3398
|
datastore_profile = datastore_profile_read(stream_path)
|
|
3399
3399
|
if isinstance(
|
|
3400
3400
|
datastore_profile,
|
|
3401
|
-
(DatastoreProfileKafkaTarget,
|
|
3401
|
+
(DatastoreProfileKafkaTarget, DatastoreProfileKafkaStream),
|
|
3402
3402
|
):
|
|
3403
3403
|
step._async_object = KafkaStoreyTarget(
|
|
3404
3404
|
path=stream_path,
|
|
@@ -3414,7 +3414,7 @@ def _init_async_objects(context, steps):
|
|
|
3414
3414
|
else:
|
|
3415
3415
|
raise mlrun.errors.MLRunValueError(
|
|
3416
3416
|
f"Received an unexpected stream profile type: {type(datastore_profile)}\n"
|
|
3417
|
-
"Expects `DatastoreProfileV3io` or `
|
|
3417
|
+
"Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
|
|
3418
3418
|
)
|
|
3419
3419
|
elif stream_path.startswith("kafka://") or kafka_brokers:
|
|
3420
3420
|
topic, brokers = parse_kafka_url(stream_path, kafka_brokers)
|