mlrun 1.10.0rc16__py3-none-any.whl → 1.10.0rc42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (98)
  1. mlrun/__init__.py +22 -2
  2. mlrun/artifacts/document.py +6 -1
  3. mlrun/artifacts/llm_prompt.py +21 -15
  4. mlrun/artifacts/model.py +3 -3
  5. mlrun/common/constants.py +9 -0
  6. mlrun/common/formatters/artifact.py +1 -0
  7. mlrun/common/model_monitoring/helpers.py +86 -0
  8. mlrun/common/schemas/__init__.py +2 -0
  9. mlrun/common/schemas/auth.py +2 -0
  10. mlrun/common/schemas/function.py +10 -0
  11. mlrun/common/schemas/hub.py +30 -18
  12. mlrun/common/schemas/model_monitoring/__init__.py +2 -0
  13. mlrun/common/schemas/model_monitoring/constants.py +30 -6
  14. mlrun/common/schemas/model_monitoring/functions.py +13 -4
  15. mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
  16. mlrun/common/schemas/pipeline.py +1 -1
  17. mlrun/common/schemas/serving.py +3 -0
  18. mlrun/common/schemas/workflow.py +1 -0
  19. mlrun/common/secrets.py +22 -1
  20. mlrun/config.py +32 -10
  21. mlrun/datastore/__init__.py +11 -3
  22. mlrun/datastore/azure_blob.py +162 -47
  23. mlrun/datastore/datastore.py +9 -4
  24. mlrun/datastore/datastore_profile.py +61 -5
  25. mlrun/datastore/model_provider/huggingface_provider.py +363 -0
  26. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  27. mlrun/datastore/model_provider/model_provider.py +211 -74
  28. mlrun/datastore/model_provider/openai_provider.py +243 -71
  29. mlrun/datastore/s3.py +24 -2
  30. mlrun/datastore/storeytargets.py +2 -3
  31. mlrun/datastore/utils.py +15 -3
  32. mlrun/db/base.py +27 -19
  33. mlrun/db/httpdb.py +57 -48
  34. mlrun/db/nopdb.py +25 -10
  35. mlrun/execution.py +55 -13
  36. mlrun/hub/__init__.py +15 -0
  37. mlrun/hub/module.py +181 -0
  38. mlrun/k8s_utils.py +105 -16
  39. mlrun/launcher/base.py +13 -6
  40. mlrun/launcher/local.py +2 -0
  41. mlrun/model.py +9 -3
  42. mlrun/model_monitoring/api.py +66 -27
  43. mlrun/model_monitoring/applications/__init__.py +1 -1
  44. mlrun/model_monitoring/applications/base.py +372 -136
  45. mlrun/model_monitoring/applications/context.py +2 -4
  46. mlrun/model_monitoring/applications/results.py +4 -7
  47. mlrun/model_monitoring/controller.py +239 -101
  48. mlrun/model_monitoring/db/_schedules.py +36 -13
  49. mlrun/model_monitoring/db/_stats.py +4 -3
  50. mlrun/model_monitoring/db/tsdb/base.py +29 -9
  51. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +4 -5
  52. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +154 -50
  53. mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
  54. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  55. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +245 -51
  56. mlrun/model_monitoring/helpers.py +28 -5
  57. mlrun/model_monitoring/stream_processing.py +45 -14
  58. mlrun/model_monitoring/writer.py +220 -1
  59. mlrun/platforms/__init__.py +3 -2
  60. mlrun/platforms/iguazio.py +7 -3
  61. mlrun/projects/operations.py +6 -1
  62. mlrun/projects/pipelines.py +2 -2
  63. mlrun/projects/project.py +128 -45
  64. mlrun/run.py +94 -17
  65. mlrun/runtimes/__init__.py +18 -0
  66. mlrun/runtimes/base.py +14 -6
  67. mlrun/runtimes/daskjob.py +1 -0
  68. mlrun/runtimes/local.py +5 -2
  69. mlrun/runtimes/mounts.py +20 -2
  70. mlrun/runtimes/nuclio/__init__.py +1 -0
  71. mlrun/runtimes/nuclio/application/application.py +147 -17
  72. mlrun/runtimes/nuclio/function.py +70 -27
  73. mlrun/runtimes/nuclio/serving.py +85 -4
  74. mlrun/runtimes/pod.py +213 -21
  75. mlrun/runtimes/utils.py +49 -9
  76. mlrun/secrets.py +54 -13
  77. mlrun/serving/remote.py +79 -6
  78. mlrun/serving/routers.py +23 -41
  79. mlrun/serving/server.py +211 -40
  80. mlrun/serving/states.py +536 -156
  81. mlrun/serving/steps.py +62 -0
  82. mlrun/serving/system_steps.py +136 -81
  83. mlrun/serving/v2_serving.py +9 -10
  84. mlrun/utils/helpers.py +212 -82
  85. mlrun/utils/logger.py +3 -1
  86. mlrun/utils/notifications/notification/base.py +18 -0
  87. mlrun/utils/notifications/notification/git.py +2 -4
  88. mlrun/utils/notifications/notification/slack.py +2 -4
  89. mlrun/utils/notifications/notification/webhook.py +2 -5
  90. mlrun/utils/notifications/notification_pusher.py +1 -1
  91. mlrun/utils/version/version.json +2 -2
  92. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/METADATA +44 -45
  93. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/RECORD +97 -92
  94. mlrun/api/schemas/__init__.py +0 -259
  95. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/WHEEL +0 -0
  96. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/entry_points.txt +0 -0
  97. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/licenses/LICENSE +0 -0
  98. {mlrun-1.10.0rc16.dist-info → mlrun-1.10.0rc42.dist-info}/top_level.txt +0 -0
@@ -14,6 +14,7 @@
14
14
  import json
15
15
  import os
16
16
  import warnings
17
+ from base64 import b64decode
17
18
  from copy import deepcopy
18
19
  from typing import Optional, Union
19
20
 
@@ -22,6 +23,8 @@ from nuclio import KafkaTrigger
22
23
 
23
24
  import mlrun
24
25
  import mlrun.common.schemas as schemas
26
+ import mlrun.common.secrets
27
+ import mlrun.datastore.datastore_profile as ds_profile
25
28
  from mlrun.datastore import get_kafka_brokers_from_dict, parse_kafka_url
26
29
  from mlrun.model import ObjectList
27
30
  from mlrun.runtimes.function_reference import FunctionReference
@@ -633,7 +636,12 @@ class ServingRuntime(RemoteRuntime):
633
636
 
634
637
  :returns: The Runtime (function) object
635
638
  """
636
-
639
+ if kind == "azure_vault" and isinstance(source, dict):
640
+ candidate_secret_name = (source.get("k8s_secret") or "").strip()
641
+ if candidate_secret_name:
642
+ mlrun.common.secrets.validate_not_forbidden_secret(
643
+ candidate_secret_name
644
+ )
637
645
  if kind == "vault" and isinstance(source, list):
638
646
  source = {"project": self.metadata.project, "secrets": source}
639
647
 
@@ -657,6 +665,7 @@ class ServingRuntime(RemoteRuntime):
657
665
  :param builder_env: env vars dict for source archive config/credentials e.g. builder_env={"GIT_TOKEN": token}
658
666
  :param force_build: set True for force building the image
659
667
  """
668
+
660
669
  load_mode = self.spec.load_mode
661
670
  if load_mode and load_mode not in ["sync", "async"]:
662
671
  raise ValueError(f"illegal model loading mode {load_mode}")
@@ -677,6 +686,21 @@ class ServingRuntime(RemoteRuntime):
677
686
  f"function {function} is used in steps and is not defined, "
678
687
  "use the .add_child_function() to specify child function attributes"
679
688
  )
689
+ if (
690
+ isinstance(self.spec.graph, RootFlowStep)
691
+ and any(
692
+ isinstance(step_type, mlrun.serving.states.ModelRunnerStep)
693
+ for step_type in self.spec.graph.steps.values()
694
+ )
695
+ and self.spec.build.functionSourceCode
696
+ ):
697
+ # Add import for LLModel
698
+ decoded_code = b64decode(self.spec.build.functionSourceCode).decode("utf-8")
699
+ import_llmodel_code = "\nfrom mlrun.serving.states import LLModel\n"
700
+ if import_llmodel_code not in decoded_code:
701
+ decoded_code += import_llmodel_code
702
+ encoded_code = mlrun.utils.helpers.encode_user_code(decoded_code)
703
+ self.spec.build.functionSourceCode = encoded_code
680
704
 
681
705
  # Handle secret processing before handling child functions, since secrets are transferred to them
682
706
  if self.spec.secret_sources:
@@ -740,6 +764,7 @@ class ServingRuntime(RemoteRuntime):
740
764
  current_function="*",
741
765
  track_models=False,
742
766
  workdir=None,
767
+ stream_profile: Optional[ds_profile.DatastoreProfile] = None,
743
768
  **kwargs,
744
769
  ) -> GraphServer:
745
770
  """create mock server object for local testing/emulation
@@ -748,6 +773,7 @@ class ServingRuntime(RemoteRuntime):
748
773
  :param current_function: specify if you want to simulate a child function, * for all functions
749
774
  :param track_models: allow model tracking (disabled by default in the mock server)
750
775
  :param workdir: working directory to locate the source code (if not the current one)
776
+ :param stream_profile: stream profile to use for the mock server output stream.
751
777
  """
752
778
 
753
779
  # set the namespaces/modules to look for the steps code in
@@ -787,6 +813,7 @@ class ServingRuntime(RemoteRuntime):
787
813
  logger=logger,
788
814
  is_mock=True,
789
815
  monitoring_mock=self.spec.track_models,
816
+ stream_profile=stream_profile,
790
817
  )
791
818
 
792
819
  server.graph = add_system_steps_to_graph(
@@ -835,8 +862,20 @@ class ServingRuntime(RemoteRuntime):
835
862
  )
836
863
  self._mock_server = self.to_mock_server()
837
864
 
838
- def to_job(self) -> KubejobRuntime:
839
- """Convert this ServingRuntime to a KubejobRuntime, so that the graph can be run as a standalone job."""
865
+ def to_job(self, func_name: Optional[str] = None) -> KubejobRuntime:
866
+ """Convert this ServingRuntime to a KubejobRuntime, so that the graph can be run as a standalone job.
867
+
868
+ Args:
869
+ func_name: Optional custom name for the job function. If not provided, automatically
870
+ appends '-batch' suffix to the serving function name to prevent database collision.
871
+
872
+ Returns:
873
+ KubejobRuntime configured to execute the serving graph as a batch job.
874
+
875
+ Note:
876
+ The job will have a different name than the serving function to prevent database collision.
877
+ The original serving function remains unchanged and can still be invoked after running the job.
878
+ """
840
879
  if self.spec.function_refs:
841
880
  raise mlrun.errors.MLRunInvalidArgumentError(
842
881
  f"Cannot convert function '{self.metadata.name}' to a job because it has child functions"
@@ -870,8 +909,50 @@ class ServingRuntime(RemoteRuntime):
870
909
  parameters=self.spec.parameters,
871
910
  graph=self.spec.graph,
872
911
  )
912
+
913
+ job_metadata = deepcopy(self.metadata)
914
+ original_name = job_metadata.name
915
+
916
+ if func_name:
917
+ # User provided explicit job name
918
+ job_metadata.name = func_name
919
+ logger.debug(
920
+ "Creating job from serving function with custom name",
921
+ new_name=func_name,
922
+ )
923
+ else:
924
+ job_metadata.name, was_renamed, suffix = (
925
+ mlrun.utils.helpers.ensure_batch_job_suffix(job_metadata.name)
926
+ )
927
+
928
+ # Check if the resulting name exceeds Kubernetes length limit
929
+ if (
930
+ len(job_metadata.name)
931
+ > mlrun.common.constants.K8S_DNS_1123_LABEL_MAX_LENGTH
932
+ ):
933
+ raise mlrun.errors.MLRunInvalidArgumentError(
934
+ f"Cannot convert serving function '{original_name}' to batch job: "
935
+ f"the resulting name '{job_metadata.name}' ({len(job_metadata.name)} characters) "
936
+ f"exceeds Kubernetes limit of {mlrun.common.constants.K8S_DNS_1123_LABEL_MAX_LENGTH} characters. "
937
+ f"Please provide a custom name via the func_name parameter, "
938
+ f"with at most {mlrun.common.constants.K8S_DNS_1123_LABEL_MAX_LENGTH} characters."
939
+ )
940
+
941
+ if was_renamed:
942
+ logger.info(
943
+ "Creating job from serving function (auto-appended suffix to prevent collision)",
944
+ new_name=job_metadata.name,
945
+ suffix=suffix,
946
+ )
947
+ else:
948
+ logger.debug(
949
+ "Creating job from serving function (name already has suffix)",
950
+ name=original_name,
951
+ suffix=suffix,
952
+ )
953
+
873
954
  job = KubejobRuntime(
874
955
  spec=spec,
875
- metadata=self.metadata,
956
+ metadata=job_metadata,
876
957
  )
877
958
  return job
mlrun/runtimes/pod.py CHANGED
@@ -17,14 +17,17 @@ import os
17
17
  import re
18
18
  import time
19
19
  import typing
20
+ import warnings
20
21
  from collections.abc import Iterable
21
22
  from enum import Enum
23
+ from typing import Optional
22
24
 
23
25
  import dotenv
24
26
  import kubernetes.client as k8s_client
25
27
  from kubernetes.client import V1Volume, V1VolumeMount
26
28
 
27
29
  import mlrun.common.constants
30
+ import mlrun.common.secrets
28
31
  import mlrun.errors
29
32
  import mlrun.runtimes.mounts
30
33
  import mlrun.utils.regex
@@ -35,6 +38,7 @@ from mlrun.common.schemas import (
35
38
 
36
39
  from ..config import config as mlconf
37
40
  from ..k8s_utils import (
41
+ generate_preemptible_nodes_affinity_terms,
38
42
  validate_node_selectors,
39
43
  )
40
44
  from ..utils import logger, update_in
@@ -107,6 +111,7 @@ class KubeResourceSpec(FunctionSpec):
107
111
  "track_models",
108
112
  "parameters",
109
113
  "graph",
114
+ "filename",
110
115
  ]
111
116
  _default_fields_to_strip = FunctionSpec._default_fields_to_strip + [
112
117
  "volumes",
@@ -705,19 +710,45 @@ class KubeResource(BaseRuntime):
705
710
  def spec(self, spec):
706
711
  self._spec = self._verify_dict(spec, "spec", KubeResourceSpec)
707
712
 
708
- def set_env_from_secret(self, name, secret=None, secret_key=None):
709
- """set pod environment var from secret"""
710
- secret_key = secret_key or name
713
+ def set_env_from_secret(
714
+ self,
715
+ name: str,
716
+ secret: Optional[str] = None,
717
+ secret_key: Optional[str] = None,
718
+ ):
719
+ """
720
+ Set an environment variable from a Kubernetes Secret.
721
+ Client-side guard forbids MLRun internal auth/project secrets; no-op on API.
722
+ """
723
+ mlrun.common.secrets.validate_not_forbidden_secret(secret)
724
+ key = secret_key or name
711
725
  value_from = k8s_client.V1EnvVarSource(
712
- secret_key_ref=k8s_client.V1SecretKeySelector(name=secret, key=secret_key)
726
+ secret_key_ref=k8s_client.V1SecretKeySelector(name=secret, key=key)
713
727
  )
714
- return self._set_env(name, value_from=value_from)
728
+ return self._set_env(name=name, value_from=value_from)
729
+
730
+ def set_env(
731
+ self,
732
+ name: str,
733
+ value: Optional[str] = None,
734
+ value_from: Optional[typing.Any] = None,
735
+ ):
736
+ """
737
+ Set an environment variable.
738
+ If value comes from a Secret, validate on client-side only.
739
+ """
740
+ if value_from is not None:
741
+ secret_name = self._extract_secret_name_from_value_from(
742
+ value_from=value_from
743
+ )
744
+ if secret_name:
745
+ mlrun.common.secrets.validate_not_forbidden_secret(secret_name)
746
+ return self._set_env(name=name, value_from=value_from)
715
747
 
716
- def set_env(self, name, value=None, value_from=None):
717
- """set pod environment var from value"""
718
- if value is not None:
719
- return self._set_env(name, value=str(value))
720
- return self._set_env(name, value_from=value_from)
748
+ # Plain literal value path
749
+ return self._set_env(
750
+ name=name, value=(str(value) if value is not None else None)
751
+ )
721
752
 
722
753
  def with_annotations(self, annotations: dict):
723
754
  """set a key/value annotations in the metadata of the pod"""
@@ -874,6 +905,133 @@ class KubeResource(BaseRuntime):
874
905
  """
875
906
  self.spec.with_requests(mem, cpu, patch=patch)
876
907
 
908
+ @staticmethod
909
+ def detect_preemptible_node_selector(node_selector: dict[str, str]) -> list[str]:
910
+ """
911
+ Check whether any provided node selector matches preemptible selectors.
912
+
913
+ :param node_selector: User-provided node selector mapping.
914
+ :return: List of `"'key': 'value'"` strings that match a preemptible selector.
915
+ """
916
+ preemptible_node_selector = mlconf.get_preemptible_node_selector()
917
+
918
+ return [
919
+ f"'{key}': '{val}'"
920
+ for key, val in node_selector.items()
921
+ if preemptible_node_selector.get(key) == val
922
+ ]
923
+
924
+ def detect_preemptible_tolerations(
925
+ self, tolerations: list[k8s_client.V1Toleration]
926
+ ) -> list[str]:
927
+ """
928
+ Check whether any provided toleration matches preemptible tolerations.
929
+
930
+ :param tolerations: User-provided tolerations.
931
+ :return: List of formatted toleration strings that are considered preemptible.
932
+ """
933
+ preemptible_tolerations = [
934
+ k8s_client.V1Toleration(
935
+ key=toleration.get("key"),
936
+ value=toleration.get("value"),
937
+ effect=toleration.get("effect"),
938
+ )
939
+ for toleration in mlconf.get_preemptible_tolerations()
940
+ ]
941
+
942
+ def _format_toleration(toleration):
943
+ return f"'{toleration.key}'='{toleration.value}' (effect: '{toleration.effect}')"
944
+
945
+ return [
946
+ _format_toleration(toleration)
947
+ for toleration in tolerations
948
+ if toleration in preemptible_tolerations
949
+ ]
950
+
951
+ def detect_preemptible_affinity(self, affinity: k8s_client.V1Affinity) -> list[str]:
952
+ """
953
+ Check whether any provided affinity rules match preemptible affinity configs.
954
+
955
+ :param affinity: User-provided affinity object.
956
+ :return: List of formatted expressions that overlap with preemptible terms.
957
+ """
958
+ preemptible_affinity_terms = generate_preemptible_nodes_affinity_terms()
959
+ conflicting_affinities = []
960
+
961
+ if (
962
+ affinity
963
+ and affinity.node_affinity
964
+ and affinity.node_affinity.required_during_scheduling_ignored_during_execution
965
+ ):
966
+ user_terms = affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms
967
+ for user_term in user_terms:
968
+ user_expressions = {
969
+ (expr.key, expr.operator, tuple(expr.values or []))
970
+ for expr in user_term.match_expressions or []
971
+ }
972
+
973
+ for preemptible_term in preemptible_affinity_terms:
974
+ preemptible_expressions = {
975
+ (expr.key, expr.operator, tuple(expr.values or []))
976
+ for expr in preemptible_term.match_expressions or []
977
+ }
978
+
979
+ # Ensure operators match and preemptible expressions are present
980
+ common_exprs = user_expressions & preemptible_expressions
981
+ if common_exprs:
982
+ formatted = ", ".join(
983
+ f"'{key} {operator} {list(values)}'"
984
+ for key, operator, values in common_exprs
985
+ )
986
+ conflicting_affinities.append(formatted)
987
+ return conflicting_affinities
988
+
989
+ def raise_preemptible_warning(
990
+ self,
991
+ node_selector: typing.Optional[dict[str, str]],
992
+ tolerations: typing.Optional[list[k8s_client.V1Toleration]],
993
+ affinity: typing.Optional[k8s_client.V1Affinity],
994
+ ) -> None:
995
+ """
996
+ Detect conflicts and emit a single consolidated warning if needed.
997
+
998
+ :param node_selector: User-provided node selector.
999
+ :param tolerations: User-provided tolerations.
1000
+ :param affinity: User-provided affinity.
1001
+ :warns: UserWarning - Emitted when any of the provided selectors,
1002
+ tolerations, or affinity terms match the configured preemptible
1003
+ settings. The message lists the conflicting items.
1004
+ """
1005
+ conflict_messages = []
1006
+
1007
+ if node_selector:
1008
+ ns_conflicts = ", ".join(
1009
+ self.detect_preemptible_node_selector(node_selector)
1010
+ )
1011
+ if ns_conflicts:
1012
+ conflict_messages.append(f"Node selectors: {ns_conflicts}")
1013
+
1014
+ if tolerations:
1015
+ tol_conflicts = ", ".join(self.detect_preemptible_tolerations(tolerations))
1016
+ if tol_conflicts:
1017
+ conflict_messages.append(f"Tolerations: {tol_conflicts}")
1018
+
1019
+ if affinity:
1020
+ affinity_conflicts = ", ".join(self.detect_preemptible_affinity(affinity))
1021
+ if affinity_conflicts:
1022
+ conflict_messages.append(f"Affinity: {affinity_conflicts}")
1023
+
1024
+ if conflict_messages:
1025
+ warning_componentes = "; \n".join(conflict_messages)
1026
+ warnings.warn(
1027
+ f"Warning: based on MLRun's preemptible node configuration, the following components \n"
1028
+ f"may be removed or adjusted at runtime:\n"
1029
+ f"{warning_componentes}.\n"
1030
+ "This adjustment depends on the function's preemption mode. \n"
1031
+ "The list of potential adjusted preemptible selectors can be viewed here: "
1032
+ "mlrun.mlconf.get_preemptible_node_selector() and mlrun.mlconf.get_preemptible_tolerations()."
1033
+ )
1034
+
877
1035
  def with_node_selection(
878
1036
  self,
879
1037
  node_name: typing.Optional[str] = None,
@@ -882,18 +1040,26 @@ class KubeResource(BaseRuntime):
882
1040
  tolerations: typing.Optional[list[k8s_client.V1Toleration]] = None,
883
1041
  ):
884
1042
  """
885
- Enables to control on which k8s node the job will run
886
-
887
- :param node_name: The name of the k8s node
888
- :param node_selector: Label selector, only nodes with matching labels will be eligible to be picked
889
- :param affinity: Expands the types of constraints you can express - see
890
- https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity
891
- for details
892
- :param tolerations: Tolerations are applied to pods, and allow (but do not require) the pods to schedule
893
- onto nodes with matching taints - see
894
- https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration
895
- for details
1043
+ Configure Kubernetes node scheduling for this function.
1044
+
1045
+ Updates one or more scheduling hints: exact node pinning, label-based selection,
1046
+ affinity/anti-affinity rules, and taint tolerations. Passing ``None`` leaves the
1047
+ current value unchanged; pass an empty dict/list (e.g., ``{}``, ``[]``) to clear.
1048
+
1049
+ :param node_name: Exact Kubernetes node name to pin the pod to.
1050
+ :param node_selector: Mapping of label selectors. Use ``{}`` to clear.
1051
+ :param affinity: :class:`kubernetes.client.V1Affinity` constraints.
1052
+ :param tolerations: List of :class:`kubernetes.client.V1Toleration`. Use ``[]`` to clear.
1053
+ :warns: UserWarning - Emitted if provided selectors/tolerations/affinity
1054
+ conflict with the function's preemption mode.
1055
+
1056
+ Example usage:
1057
+ Prefer a GPU pool and allow scheduling on spot nodes::
896
1058
 
1059
+ job.with_node_selection(
1060
+ node_selector={"nodepool": "gpu"},
1061
+ tolerations=[k8s_client.V1Toleration(key="spot", operator="Exists")],
1062
+ )
897
1063
  """
898
1064
  if node_name:
899
1065
  self.spec.node_name = node_name
@@ -904,6 +1070,11 @@ class KubeResource(BaseRuntime):
904
1070
  self.spec.affinity = affinity
905
1071
  if tolerations is not None:
906
1072
  self.spec.tolerations = tolerations
1073
+ self.raise_preemptible_warning(
1074
+ node_selector=self.spec.node_selector,
1075
+ tolerations=self.spec.tolerations,
1076
+ affinity=self.spec.affinity,
1077
+ )
907
1078
 
908
1079
  def with_priority_class(self, name: typing.Optional[str] = None):
909
1080
  """
@@ -1223,6 +1394,27 @@ class KubeResource(BaseRuntime):
1223
1394
 
1224
1395
  return self.status.state
1225
1396
 
1397
+ @staticmethod
1398
+ def _extract_secret_name_from_value_from(
1399
+ value_from: typing.Any,
1400
+ ) -> Optional[str]:
1401
+ """Extract secret name from a V1EnvVarSource or dict representation."""
1402
+ if isinstance(value_from, k8s_client.V1EnvVarSource):
1403
+ if value_from.secret_key_ref:
1404
+ return value_from.secret_key_ref.name
1405
+ elif isinstance(value_from, dict):
1406
+ value_from = (
1407
+ value_from.get("valueFrom")
1408
+ or value_from.get("value_from")
1409
+ or value_from
1410
+ )
1411
+ secret_key_ref = (value_from or {}).get("secretKeyRef") or (
1412
+ value_from or {}
1413
+ ).get("secret_key_ref")
1414
+ if isinstance(secret_key_ref, dict):
1415
+ return secret_key_ref.get("name")
1416
+ return None
1417
+
1226
1418
 
1227
1419
  def _resolve_if_type_sanitized(attribute_name, attribute):
1228
1420
  attribute_config = sanitized_attributes[attribute_name]
mlrun/runtimes/utils.py CHANGED
@@ -26,6 +26,7 @@ import pandas as pd
26
26
  import mlrun
27
27
  import mlrun.common.constants
28
28
  import mlrun.common.constants as mlrun_constants
29
+ import mlrun.common.runtimes.constants
29
30
  import mlrun.common.schemas
30
31
  import mlrun.utils.regex
31
32
  from mlrun.artifacts import TableArtifact
@@ -153,6 +154,7 @@ def results_to_iter(results, runspec, execution):
153
154
 
154
155
  iter = []
155
156
  failed = 0
157
+ pending_retry = 0
156
158
  running = 0
157
159
  for task in results:
158
160
  if task:
@@ -164,17 +166,26 @@ def results_to_iter(results, runspec, execution):
164
166
  "state": state,
165
167
  "iter": id,
166
168
  }
167
- if state == "error":
169
+ if state == mlrun.common.runtimes.constants.RunStates.error:
168
170
  failed += 1
169
171
  err = get_in(task, ["status", "error"], "")
170
- logger.error(f"error in task {execution.uid}:{id} - {err_to_str(err)}")
171
- elif state != "completed":
172
+ logger.error(f"error in task {execution.uid}:{id} - {err_to_str(err)}")
173
+ elif state == mlrun.common.runtimes.constants.RunStates.pending_retry:
174
+ pending_retry += 1
175
+ err = get_in(task, ["status", "error"], "")
176
+ retry_count = get_in(task, ["status", "retry_count"], 0)
177
+ logger.warning(
178
+ f"pending retry in task {execution.uid}:{id} - {err_to_str(err)}. Retry count: {retry_count}"
179
+ )
180
+ elif state != mlrun.common.runtimes.constants.RunStates.completed:
172
181
  running += 1
173
182
 
174
183
  iter.append(struct)
175
184
 
176
185
  if not iter:
177
- execution.set_state("completed", commit=True)
186
+ execution.set_state(
187
+ mlrun.common.runtimes.constants.RunStates.completed, commit=True
188
+ )
178
189
  logger.warning("Warning!, zero iteration results")
179
190
  return
180
191
  if hasattr(pd, "json_normalize"):
@@ -204,8 +215,14 @@ def results_to_iter(results, runspec, execution):
204
215
  error=f"{failed} of {len(results)} tasks failed, check logs in db for details",
205
216
  commit=False,
206
217
  )
218
+ elif pending_retry:
219
+ execution.set_state(
220
+ mlrun.common.runtimes.constants.RunStates.pending_retry, commit=False
221
+ )
207
222
  elif running == 0:
208
- execution.set_state("completed", commit=False)
223
+ execution.set_state(
224
+ mlrun.common.runtimes.constants.RunStates.completed, commit=False
225
+ )
209
226
  execution.commit()
210
227
 
211
228
 
@@ -431,22 +448,45 @@ def enrich_function_from_dict(function, function_dict):
431
448
  return function
432
449
 
433
450
 
451
+ def resolve_owner(
452
+ labels: dict,
453
+ owner_to_enrich: Optional[str] = None,
454
+ ):
455
+ """
456
+ Resolve the owner label value
457
+ :param labels: The run labels dict
458
+ :param owner_to_enrich: Owner to return when the "job-type" label marks a workflow-runner or rerun-workflow-runner job
459
+ :return: The resolved owner label value
460
+ """
461
+
462
+ if owner_to_enrich and (
463
+ labels.get("job-type") == mlrun.common.constants.JOB_TYPE_WORKFLOW_RUNNER
464
+ or labels.get("job-type")
465
+ == mlrun.common.constants.JOB_TYPE_RERUN_WORKFLOW_RUNNER
466
+ ):
467
+ return owner_to_enrich
468
+ else:
469
+ return os.environ.get("V3IO_USERNAME") or getpass.getuser()
470
+
471
+
434
472
  def enrich_run_labels(
435
473
  labels: dict,
436
474
  labels_to_enrich: Optional[list[mlrun_constants.MLRunInternalLabels]] = None,
475
+ owner_to_enrich: Optional[str] = None,
437
476
  ):
438
477
  """
439
- Enrich the run labels with the internal labels and the labels enrichment extension
478
+ Enrich the run labels with the internal labels and the labels enrichment extension.
440
479
  :param labels: The run labels dict
441
480
  :param labels_to_enrich: The label keys to enrich from MLRunInternalLabels.default_run_labels_to_enrich
481
+ :param owner_to_enrich: Optional owner to enrich the labels with, if not provided will try to resolve it.
442
482
  :return: The enriched labels dict
443
483
  """
444
484
  # Merge the labels with the labels enrichment extension
445
485
  labels_enrichment = {
446
- mlrun_constants.MLRunInternalLabels.owner: os.environ.get("V3IO_USERNAME")
447
- or getpass.getuser(),
486
+ mlrun_constants.MLRunInternalLabels.owner: resolve_owner(
487
+ labels, owner_to_enrich
488
+ ),
448
489
  }
449
-
450
490
  # Resolve which label keys to enrich
451
491
  if labels_to_enrich is None:
452
492
  labels_to_enrich = (
mlrun/secrets.py CHANGED
@@ -11,9 +11,9 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
14
+ import json
15
15
  from ast import literal_eval
16
- from os import environ, getenv
16
+ from os import environ
17
17
  from typing import Callable, Optional, Union
18
18
 
19
19
  from .utils import AzureVaultStore, list2dict
@@ -161,6 +161,9 @@ def get_secret_or_env(
161
161
  4. An MLRun-generated env. variable, mounted from a project secret (to be used in MLRun runtimes)
162
162
  5. The default value
163
163
 
164
+ Also supports discovering the value inside any environment variable that contains a JSON-encoded list
165
+ of dicts with fields: {'name': 'KEY', 'value': 'VAL', 'value_from': ...}. This fallback is applied
166
+ after checking the plain environment variable and before checking the MLRun-generated env. variable.
164
167
  Example::
165
168
 
166
169
  secrets = {"KEY1": "VALUE1"}
@@ -187,18 +190,56 @@ def get_secret_or_env(
187
190
  if prefix:
188
191
  key = f"{prefix}_{key}"
189
192
 
190
- value = None
191
193
  if secret_provider:
192
194
  if isinstance(secret_provider, (dict, SecretsStore)):
193
- value = secret_provider.get(key)
195
+ secret_value = secret_provider.get(key)
194
196
  else:
195
- value = secret_provider(key)
196
- if value:
197
- return value
197
+ secret_value = secret_provider(key)
198
+ if secret_value:
199
+ return secret_value
200
+
201
+ direct_environment_value = environ.get(key)
202
+ if direct_environment_value:
203
+ return direct_environment_value
204
+
205
+ json_list_value = _find_value_in_json_env_lists(key)
206
+ if json_list_value is not None:
207
+ return json_list_value
208
+
209
+ mlrun_env_key = SecretsStore.k8s_env_variable_name_for_secret(key)
210
+ mlrun_env_value = environ.get(mlrun_env_key)
211
+ if mlrun_env_value:
212
+ return mlrun_env_value
198
213
 
199
- return (
200
- value
201
- or getenv(key)
202
- or getenv(SecretsStore.k8s_env_variable_name_for_secret(key))
203
- or default
204
- )
214
+ return default
215
+
216
+
217
+ def _find_value_in_json_env_lists(
218
+ secret_name: str,
219
+ ) -> Optional[str]:
220
+ """
221
+ Scan all environment variables. If any env var contains a JSON-encoded list
222
+ of dicts shaped like {'name': str, 'value': str|None, 'value_from': ...},
223
+ return the 'value' for the entry whose 'name' matches secret_name.
224
+ """
225
+ for environment_variable_value in environ.values():
226
+ if not environment_variable_value or not isinstance(
227
+ environment_variable_value, str
228
+ ):
229
+ continue
230
+ # Fast precheck to skip obvious non-JSON strings
231
+ first_char = environment_variable_value.lstrip()[:1]
232
+ if first_char not in ("[", "{"):
233
+ continue
234
+ try:
235
+ parsed_value = json.loads(environment_variable_value)
236
+ except ValueError:
237
+ continue
238
+ if isinstance(parsed_value, list):
239
+ for entry in parsed_value:
240
+ if isinstance(entry, dict) and entry.get("name") == secret_name:
241
+ value_in_entry = entry.get("value")
242
+ # Match original semantics: empty string is treated as "not found"
243
+ if value_in_entry:
244
+ return value_in_entry
245
+ return None