mlrun 1.8.0rc38__py3-none-any.whl → 1.8.0rc39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

mlrun/runtimes/pod.py CHANGED
@@ -17,7 +17,6 @@ import os
 import re
 import time
 import typing
-import warnings
 from collections.abc import Iterable
 from enum import Enum
 
@@ -704,7 +703,29 @@ class KubeResourceSpec(FunctionSpec):
                 ),
                 affinity_field_name=affinity_field_name,
             )
+        # purge any affinity / anti-affinity preemption related configuration and enrich with preemptible tolerations
         elif self_preemption_mode == PreemptionModes.allow.value:
+            # remove preemptible anti-affinity
+            self._prune_affinity_node_selector_requirement(
+                generate_preemptible_node_selector_requirements(
+                    NodeSelectorOperator.node_selector_op_not_in.value
+                ),
+                affinity_field_name=affinity_field_name,
+            )
+            # remove preemptible affinity
+            self._prune_affinity_node_selector_requirement(
+                generate_preemptible_node_selector_requirements(
+                    NodeSelectorOperator.node_selector_op_in.value
+                ),
+                affinity_field_name=affinity_field_name,
+            )
+
+            # remove preemptible nodes constrain
+            self._prune_node_selector(
+                mlconf.get_preemptible_node_selector(),
+                node_selector_field_name=node_selector_field_name,
+            )
+
         # enrich with tolerations
         self._merge_tolerations(
             generate_preemptible_tolerations(),
@@ -1180,132 +1201,6 @@ class KubeResource(BaseRuntime):
         """
         self.spec.with_requests(mem, cpu, patch=patch)
 
-    def detect_preemptible_node_selector(
-        self, node_selector: dict[str, str]
-    ) -> list[str]:
-        """
-        Checks if any provided node selector matches the preemptible node selectors.
-        Issues a warning if a selector may be pruned at runtime depending on preemption mode.
-
-        :param node_selector: The user-provided node selector dictionary.
-        """
-        preemptible_node_selector = mlconf.get_preemptible_node_selector()
-
-        return [
-            f"'{key}': '{val}'"
-            for key, val in node_selector.items()
-            if preemptible_node_selector.get(key) == val
-        ]
-
-    def detect_preemptible_tolerations(
-        self, tolerations: list[k8s_client.V1Toleration]
-    ) -> list[str]:
-        """
-        Checks if any provided toleration matches preemptible tolerations.
-        Issues a warning if a toleration may be pruned at runtime depending on preemption mode.
-
-        :param tolerations: The user-provided list of tolerations.
-        """
-        preemptible_tolerations = [
-            k8s_client.V1Toleration(
-                key=toleration.get("key"),
-                value=toleration.get("value"),
-                effect=toleration.get("effect"),
-            )
-            for toleration in mlconf.get_preemptible_tolerations()
-        ]
-
-        def _format_toleration(toleration):
-            return f"'{toleration.key}'='{toleration.value}' (effect: '{toleration.effect}')"
-
-        return [
-            _format_toleration(toleration)
-            for toleration in tolerations
-            if toleration in preemptible_tolerations
-        ]
-
-    def detect_preemptible_affinity(self, affinity: k8s_client.V1Affinity) -> list[str]:
-        """
-        Checks if any provided affinity rules match preemptible affinity configurations.
-        Issues a warning if an affinity rule may be pruned at runtime depending on preemption mode.
-
-        :param affinity: The user-provided affinity object.
-        """
-
-        preemptible_affinity_terms = generate_preemptible_nodes_affinity_terms()
-        conflicting_affinities = []
-
-        if (
-            affinity
-            and affinity.node_affinity
-            and affinity.node_affinity.required_during_scheduling_ignored_during_execution
-        ):
-            user_terms = affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms
-            for user_term in user_terms:
-                user_expressions = {
-                    (expr.key, expr.operator, tuple(expr.values or []))
-                    for expr in user_term.match_expressions or []
-                }
-
-                for preemptible_term in preemptible_affinity_terms:
-                    preemptible_expressions = {
-                        (expr.key, expr.operator, tuple(expr.values or []))
-                        for expr in preemptible_term.match_expressions or []
-                    }
-
-                    # Ensure operators match and preemptible expressions are present
-                    common_exprs = user_expressions & preemptible_expressions
-                    if common_exprs:
-                        formatted = ", ".join(
-                            f"'{key} {operator} {list(values)}'"
-                            for key, operator, values in common_exprs
-                        )
-                        conflicting_affinities.append(formatted)
-        return conflicting_affinities
-
-    def raise_preemptible_warning(
-        self,
-        node_selector: typing.Optional[dict[str, str]],
-        tolerations: typing.Optional[list[k8s_client.V1Toleration]],
-        affinity: typing.Optional[k8s_client.V1Affinity],
-    ) -> None:
-        """
-        Detects conflicts and issues a single warning if necessary.
-
-        :param node_selector: The user-provided node selector dictionary.
-        :param tolerations: The user-provided list of tolerations.
-        :param affinity: The user-provided affinity object.
-        """
-        conflict_messages = []
-
-        if node_selector:
-            ns_conflicts = ", ".join(
-                self.detect_preemptible_node_selector(node_selector)
-            )
-            if ns_conflicts:
-                conflict_messages.append(f"Node selectors: {ns_conflicts}")
-
-        if tolerations:
-            tol_conflicts = ", ".join(self.detect_preemptible_tolerations(tolerations))
-            if tol_conflicts:
-                conflict_messages.append(f"Tolerations: {tol_conflicts}")
-
-        if affinity:
-            affinity_conflicts = ", ".join(self.detect_preemptible_affinity(affinity))
-            if affinity_conflicts:
-                conflict_messages.append(f"Affinity: {affinity_conflicts}")
-
-        if conflict_messages:
-            warning_componentes = "; \n".join(conflict_messages)
-            warnings.warn(
-                f"Warning: based on the preemptible node settings configured in your MLRun configuration,\n"
-                f"{warning_componentes}\n"
-                f" may be removed or adjusted at runtime.\n"
-                "This adjustment depends on the function's preemption mode. \n"
-                "The list of potential adjusted preemptible selectors can be viewed here: "
-                "mlrun.mlconf.get_preemptible_node_selector() and mlrun.mlconf.get_preemptible_tolerations()."
-            )
-
     def with_node_selection(
         self,
         node_name: typing.Optional[str] = None,
@@ -1314,14 +1209,19 @@ class KubeResource(BaseRuntime):
         tolerations: typing.Optional[list[k8s_client.V1Toleration]] = None,
     ):
         """
-        Enables control over which Kubernetes node the job will run on.
+        Enables to control on which k8s node the job will run
+
+        :param node_name:     The name of the k8s node
+        :param node_selector: Label selector, only nodes with matching labels will be eligible to be picked
+        :param affinity:      Expands the types of constraints you can express - see
+                              https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity
+                              for details
+        :param tolerations:   Tolerations are applied to pods, and allow (but do not require) the pods to schedule
+                              onto nodes with matching taints - see
+                              https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration
+                              for details
 
-        :param node_name: The name of the Kubernetes node.
-        :param node_selector: Label selector, only nodes with matching labels will be eligible.
-        :param affinity: Defines scheduling constraints.
-        :param tolerations: Allows scheduling onto nodes with matching taints.
         """
-        # Apply values as before
         if node_name:
             self.spec.node_name = node_name
         if node_selector is not None:
@@ -1332,12 +1232,6 @@ class KubeResource(BaseRuntime):
         if tolerations is not None:
             self.spec.tolerations = tolerations
 
-        self.raise_preemptible_warning(
-            node_selector=self.spec.node_selector,
-            tolerations=self.spec.tolerations,
-            affinity=self.spec.affinity,
-        )
-
     def with_priority_class(self, name: typing.Optional[str] = None):
         """
         Enables to control the priority of the pod
@@ -1578,15 +1472,13 @@ class KubeResource(BaseRuntime):
                 f"Started building image: {data.get('data', {}).get('spec', {}).get('build', {}).get('image')}"
             )
         if watch and not ready:
-            state = self._build_watch(
+            self.status.state = self._build_watch(
                 watch=watch,
                 show_on_failure=show_on_failure,
            )
-            ready = state == "ready"
-            self.status.state = state
-
-        if watch and not ready:
-            raise mlrun.errors.MLRunRuntimeError("Deploy failed")
+            ready = self.status.state == "ready"
+            if not ready:
+                raise mlrun.errors.MLRunRuntimeError("Deploy failed")
         return ready
 
     def _build_watch(
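
For orientation, a minimal usage sketch of the scheduling APIs touched above; the function name, image, and node-selector label are illustrative only, and the actual preemptible selectors and tolerations come from the MLRun service configuration:

import mlrun

# Hypothetical job function (names and labels are examples, not part of this diff)
fn = mlrun.new_function("trainer", kind="job", image="mlrun/mlrun")

# Constrain scheduling with a label selector
fn.with_node_selection(node_selector={"node-group": "workers"})

# With the "allow" mode, the branch added above prunes preemptible
# affinity/anti-affinity and node-selector constraints and merges in the
# preemptible tolerations when the pod spec is enriched
fn.with_preemption_mode("allow")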
mlrun/serving/routers.py CHANGED
@@ -18,6 +18,7 @@ import copy
 import json
 import traceback
 import typing
+from datetime import timedelta
 from enum import Enum
 from io import BytesIO
 from typing import Union
@@ -78,6 +79,9 @@ class BaseModelRouter(RouterToDict):
         self.inputs_key = "instances" if self.protocol == "v1" else "inputs"
         self._input_path = input_path
         self._result_path = result_path
+        self._background_task_check_timestamp = None
+        self._background_task_terminate = False
+        self._background_task_current_state = None
         self.kwargs = kwargs
 
     def parse_event(self, event):
@@ -135,6 +139,7 @@ class BaseModelRouter(RouterToDict):
             raise ValueError(
                 f"illegal path prefix {urlpath}, must start with {self.url_prefix}"
             )
+        self._update_background_task_state(event)
         return event
 
     def do_event(self, event, *args, **kwargs):
@@ -160,6 +165,63 @@ class BaseModelRouter(RouterToDict):
         """run tasks after processing the event"""
         return event
 
+    def _get_background_task_status(
+        self,
+    ) -> mlrun.common.schemas.BackgroundTaskState:
+        self._background_task_check_timestamp = now_date()
+        server: mlrun.serving.GraphServer = getattr(
+            self.context, "_server", None
+        ) or getattr(self.context, "server", None)
+        if not self.context.is_mock:
+            if server.model_endpoint_creation_task_name:
+                background_task = mlrun.get_run_db().get_project_background_task(
+                    server.project, server.model_endpoint_creation_task_name
+                )
+                logger.debug(
+                    "Checking model endpoint creation task status",
+                    task_name=server.model_endpoint_creation_task_name,
+                )
+                if (
+                    background_task.status.state
+                    in mlrun.common.schemas.BackgroundTaskState.terminal_states()
+                ):
+                    logger.debug(
+                        f"Model endpoint creation task completed with state {background_task.status.state}"
+                    )
+                    self._background_task_terminate = True
+                else:  # in progress
+                    logger.debug(
+                        f"Model endpoint creation task is still in progress with the current state: "
+                        f"{background_task.status.state}. Events will not be monitored for the next 15 seconds",
+                        name=self.name,
+                        background_task_check_timestamp=self._background_task_check_timestamp.isoformat(),
+                    )
+                return background_task.status.state
+            else:
+                logger.debug(
+                    "Model endpoint creation task name not provided",
+                )
+        elif self.context.monitoring_mock:
+            self._background_task_terminate = (
+                True  # If mock monitoring we return success and terminate task check.
+            )
+            return mlrun.common.schemas.BackgroundTaskState.succeeded
+        self._background_task_terminate = True  # If mock without monitoring we return failed and terminate task check.
+        return mlrun.common.schemas.BackgroundTaskState.failed
+
+    def _update_background_task_state(self, event):
+        if not self._background_task_terminate and (
+            self._background_task_check_timestamp is None
+            or now_date() - self._background_task_check_timestamp
+            >= timedelta(seconds=15)
+        ):
+            self._background_task_current_state = self._get_background_task_status()
+        if event.body:
+            event.body["background_task_state"] = (
+                self._background_task_current_state
+                or mlrun.common.schemas.BackgroundTaskState.running
+            )
+
 
 class ModelRouter(BaseModelRouter):
     def _resolve_route(self, body, urlpath):
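
A small sketch of the hand-off introduced here, assuming BackgroundTaskState is importable from mlrun.common.schemas as referenced in this diff (the event body below is illustrative): the router stamps background_task_state into the body, re-checking the task at most every 15 seconds, and downstream steps read that key during lazy initialization:

from mlrun.common.schemas import BackgroundTaskState

# What a stamped event body might look like after preprocess()
event_body = {"inputs": [[1.0, 2.0, 3.0]]}
event_body["background_task_state"] = BackgroundTaskState.running

# Downstream consumption mirrors the _lazy_init logic shown later in this diff:
# only a succeeded state enables the model logger / monitoring for the step
if event_body.get("background_task_state") == BackgroundTaskState.succeeded:
    enable_monitoring = True  # placeholder for creating the step's _ModelLogPusher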
@@ -599,75 +661,29 @@ class VotingEnsemble(ParallelRun):
         self.log_router = True
         self.prediction_col_name = prediction_col_name or "prediction"
         self.format_response_with_col_name_flag = format_response_with_col_name_flag
-        self.model_endpoint_uid = None
-        self.model_endpoint = None
+        self.model_endpoint_uid = kwargs.get("model_endpoint_uid", None)
         self.shard_by_endpoint = shard_by_endpoint
+        self._model_logger = None
         self.initialized = False
 
     def post_init(self, mode="sync", **kwargs):
         self._update_weights(self.weights)
 
-    def _lazy_init(self, event_id):
-        server: mlrun.serving.GraphServer = getattr(
-            self.context, "_server", None
-        ) or getattr(self.context, "server", None)
-        if not server:
-            logger.warn("GraphServer not initialized for VotingEnsemble instance")
-            return
-        if not self.context.is_mock or self.context.monitoring_mock:
-            if server.model_endpoint_creation_task_name:
-                background_task = mlrun.get_run_db().get_project_background_task(
-                    server.project, server.model_endpoint_creation_task_name
-                )
-                logger.info(
-                    "Checking model endpoint creation task status",
-                    task_name=server.model_endpoint_creation_task_name,
-                )
-                if (
-                    background_task.status.state
-                    in mlrun.common.schemas.BackgroundTaskState.terminal_states()
-                ):
-                    logger.info(
-                        f"Model endpoint creation task completed with state {background_task.status.state}"
-                    )
-                else:  # in progress
-                    logger.debug(
-                        f"Model endpoint creation task is still in progress with the current state: "
-                        f"{background_task.status.state}. This event will not be monitored.",
-                        name=self.name,
-                        event_id=event_id,
-                    )
-                    self.initialized = False
-                    return
-            else:
-                logger.info(
-                    "Model endpoint creation task name not provided",
-                )
-            try:
-                self.model_endpoint_uid = (
-                    mlrun.get_run_db()
-                    .get_model_endpoint(
-                        project=server.project,
-                        name=self.name,
-                        function_name=server.function_name,
-                        function_tag=server.function_tag or "latest",
-                        tsdb_metrics=False,
-                    )
-                    .metadata.uid
-                )
-            except mlrun.errors.MLRunNotFoundError:
-                logger.info(
-                    "Model endpoint not found for this step; monitoring for this model will not be performed",
-                    function_name=server.function_name,
-                    name=self.name,
+    def _lazy_init(self, event):
+        if event and isinstance(event, dict):
+            background_task_state = event.get("background_task_state", None)
+            if (
+                background_task_state
+                == mlrun.common.schemas.BackgroundTaskState.succeeded
+            ):
+                self._model_logger = (
+                    _ModelLogPusher(self, self.context)
+                    if self.context
+                    and self.context.stream.enabled
+                    and self.model_endpoint_uid
+                    else None
                 )
-                self.model_endpoint_uid = None
-        self._model_logger = (
-            _ModelLogPusher(self, self.context)
-            if self.context and self.context.stream.enabled and self.model_endpoint_uid
-            else None
-        )
-        self.initialized = True
+        self.initialized = True
 
     def _resolve_route(self, body, urlpath):
         """Resolves the appropriate model to send the event to.
@@ -872,14 +888,14 @@
         Response
             Event response after running the requested logic
         """
-        if not self.initialized:
-            self._lazy_init(event.id)
         start = now_date()
         # Handle and verify the request
         original_body = event.body
         event.body = _extract_input_data(self._input_path, event.body)
         event = self.preprocess(event)
         event = self._pre_handle_event(event)
+        if not self.initialized:
+            self._lazy_init(event.body)
 
         # Should we terminate the event?
         if hasattr(event, "terminated") and event.terminated:
@@ -111,11 +111,11 @@ class V2ModelServer(StepToDict):
         if model:
             self.model = model
             self.ready = True
-        self.model_endpoint_uid = None
+        self.model_endpoint_uid = kwargs.get("model_endpoint_uid", None)
         self.shard_by_endpoint = shard_by_endpoint
         self._model_logger = None
         self.initialized = False
-        self.output_schema = []
+        self.output_schema = kwargs.get("outputs", [])
 
     def _load_and_update_state(self):
         try:
@@ -137,67 +137,29 @@ class V2ModelServer(StepToDict):
         else:
             self._load_and_update_state()
 
-    def _lazy_init(self, event_id):
-        server: mlrun.serving.GraphServer = getattr(
-            self.context, "_server", None
-        ) or getattr(self.context, "server", None)
-        if not server:
-            logger.warn("GraphServer not initialized for VotingEnsemble instance")
-            return
-        if not self.context.is_mock and not self.model_spec:
+        if self.ready and not self.context.is_mock and not self.model_spec:
             self.get_model()
-        if not self.context.is_mock or self.context.monitoring_mock:
-            if server.model_endpoint_creation_task_name:
-                background_task = mlrun.get_run_db().get_project_background_task(
-                    server.project, server.model_endpoint_creation_task_name
-                )
-                logger.debug(
-                    "Checking model endpoint creation task status",
-                    task_name=server.model_endpoint_creation_task_name,
-                )
-                if (
-                    background_task.status.state
-                    in mlrun.common.schemas.BackgroundTaskState.terminal_states()
-                ):
-                    logger.debug(
-                        f"Model endpoint creation task completed with state {background_task.status.state}"
-                    )
-                else:  # in progress
-                    logger.debug(
-                        f"Model endpoint creation task is still in progress with the current state: "
-                        f"{background_task.status.state}. This event will not be monitored.",
-                        name=self.name,
-                        event_id=event_id,
-                    )
-                    self.initialized = False
-                    return
-            else:
-                logger.debug(
-                    "Model endpoint creation task name not provided",
-                )
-            try:
-                model_endpoint = mlrun.get_run_db().get_model_endpoint(
-                    project=server.project,
-                    name=self.name,
-                    function_name=server.function_name,
-                    function_tag=server.function_tag or "latest",
-                    tsdb_metrics=False,
-                )
-                self.model_endpoint_uid = model_endpoint.metadata.uid
-                self.output_schema = model_endpoint.spec.label_names
-            except mlrun.errors.MLRunNotFoundError:
-                logger.info(
-                    "Model endpoint not found for this step; monitoring for this model will not be performed",
-                    function_name=server.function_name,
-                    name=self.name,
+
+        if self.model_spec:
+            self.output_schema = self.output_schema or [
+                feature.name for feature in self.model_spec.outputs
+            ]
+
+    def _lazy_init(self, event):
+        if event and isinstance(event, dict):
+            background_task_state = event.get("background_task_state", None)
+            if (
+                background_task_state
+                == mlrun.common.schemas.BackgroundTaskState.succeeded
+            ):
+                self._model_logger = (
+                    _ModelLogPusher(self, self.context)
+                    if self.context
+                    and self.context.stream.enabled
+                    and self.model_endpoint_uid
+                    else None
                 )
-                self.model_endpoint_uid = None
-        self._model_logger = (
-            _ModelLogPusher(self, self.context)
-            if self.context and self.context.stream.enabled and self.model_endpoint_uid
-            else None
-        )
-        self.initialized = True
+        self.initialized = True
 
     def get_param(self, key: str, default=None):
         """get param by key (specified in the model or the function)"""
@@ -276,7 +238,7 @@ class V2ModelServer(StepToDict):
     def do_event(self, event, *args, **kwargs):
         """main model event handler method"""
         if not self.initialized:
-            self._lazy_init(event.id)
+            self._lazy_init(event.body)
         start = now_date()
         original_body = event.body
         event_body = _extract_input_data(self._input_path, event.body)
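
Both V2ModelServer and VotingEnsemble now take model_endpoint_uid and outputs from their constructor kwargs instead of resolving a model endpoint from the DB during lazy init. A minimal sketch of how such kwargs reach a step when composing a serving graph; the model name, path, and output label are hypothetical, and model_endpoint_uid is presumably injected by the platform rather than set by users:

import mlrun

serving_fn = mlrun.new_function("serving", kind="serving", image="mlrun/mlrun")
serving_fn.add_model(
    "my-model",
    model_path="store://models/my-project/my-model:latest",
    class_name="mlrun.serving.V2ModelServer",
    outputs=["prediction"],  # forwarded as class kwargs, read via kwargs.get("outputs", [])
)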
mlrun/utils/async_http.py CHANGED
@@ -26,8 +26,7 @@ from aiohttp_retry.client import _RequestContext
 from mlrun.config import config
 from mlrun.errors import err_to_str
 from mlrun.errors import raise_for_status as ml_raise_for_status
-
-from .helpers import logger as mlrun_logger
+from mlrun.utils.helpers import logger as mlrun_logger
 
 DEFAULT_BLACKLISTED_METHODS = [
     "POST",
mlrun/utils/helpers.py CHANGED
@@ -146,7 +146,7 @@ def get_artifact_target(item: dict, project=None):
     return item["spec"].get("target_path")
 
 
-# TODO: left for migrations testing purposes. Remove in 1.8.0.
+# TODO: Remove once data migration v5 is obsolete
 def is_legacy_artifact(artifact):
     if isinstance(artifact, dict):
         return "metadata" not in artifact
@@ -498,7 +498,6 @@ def get_in(obj, keys, default=None):
     """
    if isinstance(keys, str):
        keys = keys.split(".")
-
    for key in keys:
        if not obj or key not in obj:
            return default
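
The get_in change above only drops a blank line; the dotted-key lookup behavior is unchanged. For reference, a small usage sketch (values are illustrative):

from mlrun.utils.helpers import get_in

cfg = {"spec": {"image": "mlrun/mlrun"}}
print(get_in(cfg, "spec.image"))        # -> "mlrun/mlrun"
print(get_in(cfg, "spec.replicas", 1))  # -> 1 (missing key falls back to the default)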
@@ -1,4 +1,4 @@
 {
-  "git_commit": "239429702facc9fa9c6667d4a9db952f1940e598",
-  "version": "1.8.0-rc38"
+  "git_commit": "c4d1cedcb732b6108ad1b9a2e33df82ba9114fa1",
+  "version": "1.8.0-rc39"
 }
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mlrun
-Version: 1.8.0rc38
+Version: 1.8.0rc39
 Summary: Tracking and config of machine learning runs
 Home-page: https://github.com/mlrun/mlrun
 Author: Yaron Haviv