mlrun 1.10.0rc13__py3-none-any.whl → 1.10.0rc42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of mlrun has been flagged as a potentially problematic release.
- mlrun/__init__.py +22 -2
- mlrun/artifacts/base.py +0 -31
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +123 -25
- mlrun/artifacts/manager.py +0 -5
- mlrun/artifacts/model.py +3 -3
- mlrun/common/constants.py +10 -1
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/common/model_monitoring/helpers.py +86 -0
- mlrun/common/schemas/__init__.py +3 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/function.py +10 -0
- mlrun/common/schemas/hub.py +30 -18
- mlrun/common/schemas/model_monitoring/__init__.py +3 -0
- mlrun/common/schemas/model_monitoring/constants.py +30 -6
- mlrun/common/schemas/model_monitoring/functions.py +14 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +21 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/serving.py +3 -0
- mlrun/common/schemas/workflow.py +3 -1
- mlrun/common/secrets.py +22 -1
- mlrun/config.py +33 -11
- mlrun/datastore/__init__.py +11 -3
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/datastore.py +9 -4
- mlrun/datastore/datastore_profile.py +61 -5
- mlrun/datastore/model_provider/huggingface_provider.py +363 -0
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +230 -65
- mlrun/datastore/model_provider/openai_provider.py +295 -42
- mlrun/datastore/s3.py +24 -2
- mlrun/datastore/storeytargets.py +2 -3
- mlrun/datastore/utils.py +15 -3
- mlrun/db/base.py +47 -19
- mlrun/db/httpdb.py +120 -56
- mlrun/db/nopdb.py +38 -10
- mlrun/execution.py +70 -19
- mlrun/hub/__init__.py +15 -0
- mlrun/hub/module.py +181 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +13 -6
- mlrun/launcher/local.py +15 -0
- mlrun/model.py +24 -3
- mlrun/model_monitoring/__init__.py +1 -0
- mlrun/model_monitoring/api.py +66 -27
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +509 -117
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/applications/results.py +4 -7
- mlrun/model_monitoring/controller.py +239 -101
- mlrun/model_monitoring/db/_schedules.py +116 -33
- mlrun/model_monitoring/db/_stats.py +4 -3
- mlrun/model_monitoring/db/tsdb/base.py +100 -9
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +11 -6
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +191 -50
- mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +259 -40
- mlrun/model_monitoring/helpers.py +54 -9
- mlrun/model_monitoring/stream_processing.py +45 -14
- mlrun/model_monitoring/writer.py +220 -1
- mlrun/platforms/__init__.py +3 -2
- mlrun/platforms/iguazio.py +7 -3
- mlrun/projects/operations.py +6 -1
- mlrun/projects/pipelines.py +46 -26
- mlrun/projects/project.py +166 -58
- mlrun/run.py +94 -17
- mlrun/runtimes/__init__.py +18 -0
- mlrun/runtimes/base.py +14 -6
- mlrun/runtimes/daskjob.py +7 -0
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/mpijob/abstract.py +6 -0
- mlrun/runtimes/mpijob/v1.py +6 -0
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/application/application.py +149 -17
- mlrun/runtimes/nuclio/function.py +76 -27
- mlrun/runtimes/nuclio/serving.py +97 -15
- mlrun/runtimes/pod.py +234 -21
- mlrun/runtimes/remotesparkjob.py +6 -0
- mlrun/runtimes/sparkjob/spark3job.py +6 -0
- mlrun/runtimes/utils.py +49 -11
- mlrun/secrets.py +54 -13
- mlrun/serving/__init__.py +2 -0
- mlrun/serving/remote.py +79 -6
- mlrun/serving/routers.py +23 -41
- mlrun/serving/server.py +320 -80
- mlrun/serving/states.py +725 -157
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +200 -119
- mlrun/serving/v2_serving.py +9 -10
- mlrun/utils/helpers.py +288 -88
- mlrun/utils/logger.py +3 -1
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +2 -4
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/retryer.py +15 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/METADATA +45 -51
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/RECORD +106 -101
- mlrun/api/schemas/__init__.py +0 -259
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/top_level.txt +0 -0
mlrun/runtimes/pod.py
CHANGED

@@ -17,14 +17,17 @@ import os
 import re
 import time
 import typing
+import warnings
 from collections.abc import Iterable
 from enum import Enum
+from typing import Optional
 
 import dotenv
 import kubernetes.client as k8s_client
 from kubernetes.client import V1Volume, V1VolumeMount
 
 import mlrun.common.constants
+import mlrun.common.secrets
 import mlrun.errors
 import mlrun.runtimes.mounts
 import mlrun.utils.regex
@@ -35,6 +38,7 @@ from mlrun.common.schemas import (
 
 from ..config import config as mlconf
 from ..k8s_utils import (
+    generate_preemptible_nodes_affinity_terms,
     validate_node_selectors,
 )
 from ..utils import logger, update_in
@@ -104,6 +108,10 @@ class KubeResourceSpec(FunctionSpec):
         "security_context",
         "state_thresholds",
         "serving_spec",
+        "track_models",
+        "parameters",
+        "graph",
+        "filename",
     ]
     _default_fields_to_strip = FunctionSpec._default_fields_to_strip + [
         "volumes",
@@ -180,6 +188,9 @@ class KubeResourceSpec(FunctionSpec):
         security_context=None,
         state_thresholds=None,
         serving_spec=None,
+        track_models=None,
+        parameters=None,
+        graph=None,
     ):
         super().__init__(
             command=command,
@@ -226,6 +237,10 @@ class KubeResourceSpec(FunctionSpec):
             or mlrun.mlconf.function.spec.state_thresholds.default.to_dict()
         )
         self.serving_spec = serving_spec
+        self.track_models = track_models
+        self.parameters = parameters
+        self._graph = None
+        self.graph = graph
         # Termination grace period is internal for runtimes that have a pod termination hook hence it is not in the
         # _dict_fields and doesn't have a setter.
         self._termination_grace_period_seconds = None
@@ -303,6 +318,17 @@ class KubeResourceSpec(FunctionSpec):
     def termination_grace_period_seconds(self) -> typing.Optional[int]:
         return self._termination_grace_period_seconds
 
+    @property
+    def graph(self):
+        """states graph, holding the serving workflow/DAG topology"""
+        return self._graph
+
+    @graph.setter
+    def graph(self, graph):
+        from ..serving.states import graph_root_setter
+
+        graph_root_setter(self, graph)
+
     def _serialize_field(
         self, struct: dict, field_name: typing.Optional[str] = None, strip: bool = False
     ) -> typing.Any:
@@ -684,19 +710,45 @@ class KubeResource(BaseRuntime):
     def spec(self, spec):
         self._spec = self._verify_dict(spec, "spec", KubeResourceSpec)
 
-    def set_env_from_secret(
-
-
+    def set_env_from_secret(
+        self,
+        name: str,
+        secret: Optional[str] = None,
+        secret_key: Optional[str] = None,
+    ):
+        """
+        Set an environment variable from a Kubernetes Secret.
+        Client-side guard forbids MLRun internal auth/project secrets; no-op on API.
+        """
+        mlrun.common.secrets.validate_not_forbidden_secret(secret)
+        key = secret_key or name
         value_from = k8s_client.V1EnvVarSource(
-            secret_key_ref=k8s_client.V1SecretKeySelector(name=secret, key=
+            secret_key_ref=k8s_client.V1SecretKeySelector(name=secret, key=key)
         )
-        return self._set_env(name, value_from=value_from)
+        return self._set_env(name=name, value_from=value_from)
 
-    def set_env(
-
-
-
-
+    def set_env(
+        self,
+        name: str,
+        value: Optional[str] = None,
+        value_from: Optional[typing.Any] = None,
+    ):
+        """
+        Set an environment variable.
+        If value comes from a Secret, validate on client-side only.
+        """
+        if value_from is not None:
+            secret_name = self._extract_secret_name_from_value_from(
+                value_from=value_from
+            )
+            if secret_name:
+                mlrun.common.secrets.validate_not_forbidden_secret(secret_name)
+            return self._set_env(name=name, value_from=value_from)
+
+        # Plain literal value path
+        return self._set_env(
+            name=name, value=(str(value) if value is not None else None)
+        )
 
     def with_annotations(self, annotations: dict):
         """set a key/value annotations in the metadata of the pod"""
@@ -853,6 +905,133 @@ class KubeResource(BaseRuntime):
         """
         self.spec.with_requests(mem, cpu, patch=patch)
 
+    @staticmethod
+    def detect_preemptible_node_selector(node_selector: dict[str, str]) -> list[str]:
+        """
+        Check whether any provided node selector matches preemptible selectors.
+
+        :param node_selector: User-provided node selector mapping.
+        :return: List of `"key='value'"` strings that match a preemptible selector.
+        """
+        preemptible_node_selector = mlconf.get_preemptible_node_selector()
+
+        return [
+            f"'{key}': '{val}'"
+            for key, val in node_selector.items()
+            if preemptible_node_selector.get(key) == val
+        ]
+
+    def detect_preemptible_tolerations(
+        self, tolerations: list[k8s_client.V1Toleration]
+    ) -> list[str]:
+        """
+        Check whether any provided toleration matches preemptible tolerations.
+
+        :param tolerations: User-provided tolerations.
+        :return: List of formatted toleration strings that are considered preemptible.
+        """
+        preemptible_tolerations = [
+            k8s_client.V1Toleration(
+                key=toleration.get("key"),
+                value=toleration.get("value"),
+                effect=toleration.get("effect"),
+            )
+            for toleration in mlconf.get_preemptible_tolerations()
+        ]
+
+        def _format_toleration(toleration):
+            return f"'{toleration.key}'='{toleration.value}' (effect: '{toleration.effect}')"
+
+        return [
+            _format_toleration(toleration)
+            for toleration in tolerations
+            if toleration in preemptible_tolerations
+        ]
+
+    def detect_preemptible_affinity(self, affinity: k8s_client.V1Affinity) -> list[str]:
+        """
+        Check whether any provided affinity rules match preemptible affinity configs.
+
+        :param affinity: User-provided affinity object.
+        :return: List of formatted expressions that overlap with preemptible terms.
+        """
+        preemptible_affinity_terms = generate_preemptible_nodes_affinity_terms()
+        conflicting_affinities = []
+
+        if (
+            affinity
+            and affinity.node_affinity
+            and affinity.node_affinity.required_during_scheduling_ignored_during_execution
+        ):
+            user_terms = affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms
+            for user_term in user_terms:
+                user_expressions = {
+                    (expr.key, expr.operator, tuple(expr.values or []))
+                    for expr in user_term.match_expressions or []
+                }
+
+                for preemptible_term in preemptible_affinity_terms:
+                    preemptible_expressions = {
+                        (expr.key, expr.operator, tuple(expr.values or []))
+                        for expr in preemptible_term.match_expressions or []
+                    }
+
+                    # Ensure operators match and preemptible expressions are present
+                    common_exprs = user_expressions & preemptible_expressions
+                    if common_exprs:
+                        formatted = ", ".join(
+                            f"'{key} {operator} {list(values)}'"
+                            for key, operator, values in common_exprs
+                        )
+                        conflicting_affinities.append(formatted)
+        return conflicting_affinities
+
+    def raise_preemptible_warning(
+        self,
+        node_selector: typing.Optional[dict[str, str]],
+        tolerations: typing.Optional[list[k8s_client.V1Toleration]],
+        affinity: typing.Optional[k8s_client.V1Affinity],
+    ) -> None:
+        """
+        Detect conflicts and emit a single consolidated warning if needed.
+
+        :param node_selector: User-provided node selector.
+        :param tolerations: User-provided tolerations.
+        :param affinity: User-provided affinity.
+        :warns: PreemptionWarning - Emitted when any of the provided selectors,
+            tolerations, or affinity terms match the configured preemptible
+            settings. The message lists the conflicting items.
+        """
+        conflict_messages = []
+
+        if node_selector:
+            ns_conflicts = ", ".join(
+                self.detect_preemptible_node_selector(node_selector)
+            )
+            if ns_conflicts:
+                conflict_messages.append(f"Node selectors: {ns_conflicts}")
+
+        if tolerations:
+            tol_conflicts = ", ".join(self.detect_preemptible_tolerations(tolerations))
+            if tol_conflicts:
+                conflict_messages.append(f"Tolerations: {tol_conflicts}")
+
+        if affinity:
+            affinity_conflicts = ", ".join(self.detect_preemptible_affinity(affinity))
+            if affinity_conflicts:
+                conflict_messages.append(f"Affinity: {affinity_conflicts}")
+
+        if conflict_messages:
+            warning_componentes = "; \n".join(conflict_messages)
+            warnings.warn(
+                f"Warning: based on MLRun's preemptible node configuration, the following components \n"
+                f"may be removed or adjusted at runtime:\n"
+                f"{warning_componentes}.\n"
+                "This adjustment depends on the function's preemption mode. \n"
+                "The list of potential adjusted preemptible selectors can be viewed here: "
+                "mlrun.mlconf.get_preemptible_node_selector() and mlrun.mlconf.get_preemptible_tolerations()."
+            )
+
     def with_node_selection(
         self,
         node_name: typing.Optional[str] = None,
@@ -861,18 +1040,26 @@ class KubeResource(BaseRuntime):
         tolerations: typing.Optional[list[k8s_client.V1Toleration]] = None,
     ):
         """
-
-
-
-
-
-
-
-        :param
-
-
-
+        Configure Kubernetes node scheduling for this function.
+
+        Updates one or more scheduling hints: exact node pinning, label-based selection,
+        affinity/anti-affinity rules, and taint tolerations. Passing ``None`` leaves the
+        current value unchanged; pass an empty dict/list (e.g., ``{}``, ``[]``) to clear.
+
+        :param node_name: Exact Kubernetes node name to pin the pod to.
+        :param node_selector: Mapping of label selectors. Use ``{}`` to clear.
+        :param affinity: :class:`kubernetes.client.V1Affinity` constraints.
+        :param tolerations: List of :class:`kubernetes.client.V1Toleration`. Use ``[]`` to clear.
+        :warns: PreemptionWarning - Emitted if provided selectors/tolerations/affinity
+            conflict with the function's preemption mode.
 
+        Example usage:
+            Prefer a GPU pool and allow scheduling on spot nodes::
+
+                job.with_node_selection(
+                    node_selector={"nodepool": "gpu"},
+                    tolerations=[k8s_client.V1Toleration(key="spot", operator="Exists")],
+                )
         """
         if node_name:
             self.spec.node_name = node_name
@@ -883,6 +1070,11 @@ class KubeResource(BaseRuntime):
             self.spec.affinity = affinity
         if tolerations is not None:
             self.spec.tolerations = tolerations
+        self.raise_preemptible_warning(
+            node_selector=self.spec.node_selector,
+            tolerations=self.spec.tolerations,
+            affinity=self.spec.affinity,
+        )
 
     def with_priority_class(self, name: typing.Optional[str] = None):
         """
@@ -1202,6 +1394,27 @@ class KubeResource(BaseRuntime):
 
         return self.status.state
 
+    @staticmethod
+    def _extract_secret_name_from_value_from(
+        value_from: typing.Any,
+    ) -> Optional[str]:
+        """Extract secret name from a V1EnvVarSource or dict representation."""
+        if isinstance(value_from, k8s_client.V1EnvVarSource):
+            if value_from.secret_key_ref:
+                return value_from.secret_key_ref.name
+        elif isinstance(value_from, dict):
+            value_from = (
+                value_from.get("valueFrom")
+                or value_from.get("value_from")
+                or value_from
+            )
+            secret_key_ref = (value_from or {}).get("secretKeyRef") or (
+                value_from or {}
+            ).get("secret_key_ref")
+            if isinstance(secret_key_ref, dict):
+                return secret_key_ref.get("name")
+        return None
+
 
     def _resolve_if_type_sanitized(attribute_name, attribute):
         attribute_config = sanitized_attributes[attribute_name]
mlrun/runtimes/remotesparkjob.py
CHANGED

@@ -59,6 +59,9 @@ class RemoteSparkSpec(KubeResourceSpec):
         security_context=None,
         state_thresholds=None,
         serving_spec=None,
+        graph=None,
+        parameters=None,
+        track_models=None,
     ):
         super().__init__(
             command=command,
@@ -89,6 +92,9 @@ class RemoteSparkSpec(KubeResourceSpec):
             security_context=security_context,
             state_thresholds=state_thresholds,
             serving_spec=serving_spec,
+            graph=graph,
+            parameters=parameters,
+            track_models=track_models,
         )
         self.provider = provider
 

mlrun/runtimes/sparkjob/spark3job.py
CHANGED

@@ -169,6 +169,9 @@ class Spark3JobSpec(KubeResourceSpec):
         security_context=None,
         state_thresholds=None,
         serving_spec=None,
+        graph=None,
+        parameters=None,
+        track_models=None,
     ):
         super().__init__(
             command=command,
@@ -199,6 +202,9 @@ class Spark3JobSpec(KubeResourceSpec):
             security_context=security_context,
             state_thresholds=state_thresholds,
             serving_spec=serving_spec,
+            graph=graph,
+            parameters=parameters,
+            track_models=track_models,
        )

        self.driver_resources = driver_resources or {}
mlrun/runtimes/utils.py
CHANGED

@@ -26,6 +26,7 @@ import pandas as pd
 import mlrun
 import mlrun.common.constants
 import mlrun.common.constants as mlrun_constants
+import mlrun.common.runtimes.constants
 import mlrun.common.schemas
 import mlrun.utils.regex
 from mlrun.artifacts import TableArtifact
@@ -153,6 +154,7 @@ def results_to_iter(results, runspec, execution):
 
     iter = []
     failed = 0
+    pending_retry = 0
     running = 0
     for task in results:
         if task:
@@ -164,17 +166,26 @@ def results_to_iter(results, runspec, execution):
                 "state": state,
                 "iter": id,
             }
-            if state ==
+            if state == mlrun.common.runtimes.constants.RunStates.error:
                 failed += 1
                 err = get_in(task, ["status", "error"], "")
-                logger.error(f"error in task
-            elif state
+                logger.error(f"error in task {execution.uid}:{id} - {err_to_str(err)}")
+            elif state == mlrun.common.runtimes.constants.RunStates.pending_retry:
+                pending_retry += 1
+                err = get_in(task, ["status", "error"], "")
+                retry_count = get_in(task, ["status", "retry_count"], 0)
+                logger.warning(
+                    f"pending retry in task {execution.uid}:{id} - {err_to_str(err)}. Retry count: {retry_count}"
+                )
+            elif state != mlrun.common.runtimes.constants.RunStates.completed:
                 running += 1
 
             iter.append(struct)
 
     if not iter:
-        execution.set_state(
+        execution.set_state(
+            mlrun.common.runtimes.constants.RunStates.completed, commit=True
+        )
         logger.warning("Warning!, zero iteration results")
         return
     if hasattr(pd, "json_normalize"):
@@ -204,8 +215,14 @@ def results_to_iter(results, runspec, execution):
             error=f"{failed} of {len(results)} tasks failed, check logs in db for details",
             commit=False,
         )
+    elif pending_retry:
+        execution.set_state(
+            mlrun.common.runtimes.constants.RunStates.pending_retry, commit=False
+        )
     elif running == 0:
-        execution.set_state(
+        execution.set_state(
+            mlrun.common.runtimes.constants.RunStates.completed, commit=False
+        )
     execution.commit()
 
@@ -431,24 +448,45 @@ def enrich_function_from_dict(function, function_dict):
     return function
 
 
+def resolve_owner(
+    labels: dict,
+    owner_to_enrich: Optional[str] = None,
+):
+    """
+    Resolve the owner label value
+    :param labels: The run labels dict
+    :param auth_username: The authenticated username
+    :return: The resolved owner label value
+    """
+
+    if owner_to_enrich and (
+        labels.get("job-type") == mlrun.common.constants.JOB_TYPE_WORKFLOW_RUNNER
+        or labels.get("job-type")
+        == mlrun.common.constants.JOB_TYPE_RERUN_WORKFLOW_RUNNER
+    ):
+        return owner_to_enrich
+    else:
+        return os.environ.get("V3IO_USERNAME") or getpass.getuser()
+
+
 def enrich_run_labels(
     labels: dict,
     labels_to_enrich: Optional[list[mlrun_constants.MLRunInternalLabels]] = None,
+    owner_to_enrich: Optional[str] = None,
 ):
     """
-    Enrich the run labels with the internal labels and the labels enrichment extension
+    Enrich the run labels with the internal labels and the labels enrichment extension.
     :param labels: The run labels dict
     :param labels_to_enrich: The label keys to enrich from MLRunInternalLabels.default_run_labels_to_enrich
+    :param owner_to_enrich: Optional owner to enrich the labels with, if not provided will try to resolve it.
     :return: The enriched labels dict
     """
     # Merge the labels with the labels enrichment extension
     labels_enrichment = {
-        mlrun_constants.MLRunInternalLabels.owner:
-
-
-        mlrun_constants.MLRunInternalLabels.v3io_user: os.environ.get("V3IO_USERNAME"),
+        mlrun_constants.MLRunInternalLabels.owner: resolve_owner(
+            labels, owner_to_enrich
+        ),
     }
-
     # Resolve which label keys to enrich
     if labels_to_enrich is None:
         labels_to_enrich = (
mlrun/secrets.py
CHANGED

@@ -11,9 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import json
 from ast import literal_eval
-from os import environ
+from os import environ
 from typing import Callable, Optional, Union
 
 from .utils import AzureVaultStore, list2dict
@@ -161,6 +161,9 @@ def get_secret_or_env(
     4. An MLRun-generated env. variable, mounted from a project secret (to be used in MLRun runtimes)
     5. The default value
 
+    Also supports discovering the value inside any environment variable that contains a JSON-encoded list
+    of dicts with fields: {'name': 'KEY', 'value': 'VAL', 'value_from': ...}. This fallback is applied
+    after checking normal environment variables and before returning the default.
     Example::
 
         secrets = {"KEY1": "VALUE1"}
@@ -187,18 +190,56 @@ def get_secret_or_env(
     if prefix:
         key = f"{prefix}_{key}"
 
-    value = None
     if secret_provider:
         if isinstance(secret_provider, (dict, SecretsStore)):
-
+            secret_value = secret_provider.get(key)
         else:
-
-            if
-            return
+            secret_value = secret_provider(key)
+        if secret_value:
+            return secret_value
+
+    direct_environment_value = environ.get(key)
+    if direct_environment_value:
+        return direct_environment_value
+
+    json_list_value = _find_value_in_json_env_lists(key)
+    if json_list_value is not None:
+        return json_list_value
+
+    mlrun_env_key = SecretsStore.k8s_env_variable_name_for_secret(key)
+    mlrun_env_value = environ.get(mlrun_env_key)
+    if mlrun_env_value:
+        return mlrun_env_value
 
-    return
-
-
-
-
-
+    return default
+
+
+def _find_value_in_json_env_lists(
+    secret_name: str,
+) -> Optional[str]:
+    """
+    Scan all environment variables. If any env var contains a JSON-encoded list
+    of dicts shaped like {'name': str, 'value': str|None, 'value_from': ...},
+    return the 'value' for the entry whose 'name' matches secret_name.
+    """
+    for environment_variable_value in environ.values():
+        if not environment_variable_value or not isinstance(
+            environment_variable_value, str
+        ):
+            continue
+        # Fast precheck to skip obvious non-JSON strings
+        first_char = environment_variable_value.lstrip()[:1]
+        if first_char not in ("[", "{"):
+            continue
+        try:
+            parsed_value = json.loads(environment_variable_value)
+        except ValueError:
+            continue
+        if isinstance(parsed_value, list):
+            for entry in parsed_value:
+                if isinstance(entry, dict) and entry.get("name") == secret_name:
+                    value_in_entry = entry.get("value")
+                    # Match original semantics: empty string is treated as "not found"
+                    if value_in_entry:
+                        return value_in_entry
+    return None
mlrun/serving/__init__.py
CHANGED

@@ -28,6 +28,7 @@ __all__ = [
     "Model",
     "ModelSelector",
     "MonitoredStep",
+    "LLModel",
 ]
 
 from .routers import ModelRouter, VotingEnsemble  # noqa
@@ -47,6 +48,7 @@ from .states import (
     Model,
     ModelSelector,
     MonitoredStep,
+    LLModel,
 )  # noqa
 from .v1_serving import MLModelServer, new_v1_model_server  # noqa
 from .v2_serving import V2ModelServer  # noqa