mlrun 1.10.0rc18__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic; see the registry's release advisory for more details.

Files changed (167)
  1. mlrun/__init__.py +24 -3
  2. mlrun/__main__.py +0 -4
  3. mlrun/artifacts/dataset.py +2 -2
  4. mlrun/artifacts/document.py +6 -1
  5. mlrun/artifacts/llm_prompt.py +21 -15
  6. mlrun/artifacts/model.py +3 -3
  7. mlrun/artifacts/plots.py +1 -1
  8. mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
  9. mlrun/auth/nuclio.py +89 -0
  10. mlrun/auth/providers.py +429 -0
  11. mlrun/auth/utils.py +415 -0
  12. mlrun/common/constants.py +14 -0
  13. mlrun/common/model_monitoring/helpers.py +123 -0
  14. mlrun/common/runtimes/constants.py +28 -0
  15. mlrun/common/schemas/__init__.py +14 -3
  16. mlrun/common/schemas/alert.py +2 -2
  17. mlrun/common/schemas/api_gateway.py +3 -0
  18. mlrun/common/schemas/auth.py +12 -10
  19. mlrun/common/schemas/client_spec.py +4 -0
  20. mlrun/common/schemas/constants.py +25 -0
  21. mlrun/common/schemas/frontend_spec.py +1 -8
  22. mlrun/common/schemas/function.py +34 -0
  23. mlrun/common/schemas/hub.py +33 -20
  24. mlrun/common/schemas/model_monitoring/__init__.py +2 -1
  25. mlrun/common/schemas/model_monitoring/constants.py +12 -15
  26. mlrun/common/schemas/model_monitoring/functions.py +13 -4
  27. mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
  28. mlrun/common/schemas/pipeline.py +1 -1
  29. mlrun/common/schemas/secret.py +17 -2
  30. mlrun/common/secrets.py +95 -1
  31. mlrun/common/types.py +10 -10
  32. mlrun/config.py +69 -19
  33. mlrun/data_types/infer.py +2 -2
  34. mlrun/datastore/__init__.py +12 -5
  35. mlrun/datastore/azure_blob.py +162 -47
  36. mlrun/datastore/base.py +274 -10
  37. mlrun/datastore/datastore.py +7 -2
  38. mlrun/datastore/datastore_profile.py +84 -22
  39. mlrun/datastore/model_provider/huggingface_provider.py +225 -41
  40. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  41. mlrun/datastore/model_provider/model_provider.py +206 -74
  42. mlrun/datastore/model_provider/openai_provider.py +226 -66
  43. mlrun/datastore/s3.py +39 -18
  44. mlrun/datastore/sources.py +1 -1
  45. mlrun/datastore/store_resources.py +4 -4
  46. mlrun/datastore/storeytargets.py +17 -12
  47. mlrun/datastore/targets.py +1 -1
  48. mlrun/datastore/utils.py +25 -6
  49. mlrun/datastore/v3io.py +1 -1
  50. mlrun/db/base.py +63 -32
  51. mlrun/db/httpdb.py +373 -153
  52. mlrun/db/nopdb.py +54 -21
  53. mlrun/errors.py +4 -2
  54. mlrun/execution.py +66 -25
  55. mlrun/feature_store/api.py +1 -1
  56. mlrun/feature_store/common.py +1 -1
  57. mlrun/feature_store/feature_vector_utils.py +1 -1
  58. mlrun/feature_store/steps.py +8 -6
  59. mlrun/frameworks/_common/utils.py +3 -3
  60. mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
  61. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
  62. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
  63. mlrun/frameworks/_ml_common/utils.py +2 -1
  64. mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
  65. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
  66. mlrun/frameworks/onnx/dataset.py +2 -1
  67. mlrun/frameworks/onnx/mlrun_interface.py +2 -1
  68. mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
  69. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
  70. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
  71. mlrun/frameworks/pytorch/utils.py +2 -1
  72. mlrun/frameworks/sklearn/metric.py +2 -1
  73. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
  74. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
  75. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
  76. mlrun/hub/__init__.py +52 -0
  77. mlrun/hub/base.py +142 -0
  78. mlrun/hub/module.py +172 -0
  79. mlrun/hub/step.py +113 -0
  80. mlrun/k8s_utils.py +105 -16
  81. mlrun/launcher/base.py +15 -7
  82. mlrun/launcher/local.py +4 -1
  83. mlrun/model.py +14 -4
  84. mlrun/model_monitoring/__init__.py +0 -1
  85. mlrun/model_monitoring/api.py +65 -28
  86. mlrun/model_monitoring/applications/__init__.py +1 -1
  87. mlrun/model_monitoring/applications/base.py +299 -128
  88. mlrun/model_monitoring/applications/context.py +2 -4
  89. mlrun/model_monitoring/controller.py +132 -58
  90. mlrun/model_monitoring/db/_schedules.py +38 -29
  91. mlrun/model_monitoring/db/_stats.py +6 -16
  92. mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
  93. mlrun/model_monitoring/db/tsdb/base.py +29 -9
  94. mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
  95. mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
  96. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
  97. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
  98. mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
  99. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
  100. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
  101. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
  102. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
  103. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
  104. mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
  105. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
  106. mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
  107. mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
  108. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +20 -9
  109. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +235 -51
  110. mlrun/model_monitoring/features_drift_table.py +2 -1
  111. mlrun/model_monitoring/helpers.py +30 -6
  112. mlrun/model_monitoring/stream_processing.py +34 -28
  113. mlrun/model_monitoring/writer.py +224 -4
  114. mlrun/package/__init__.py +2 -1
  115. mlrun/platforms/__init__.py +0 -43
  116. mlrun/platforms/iguazio.py +8 -4
  117. mlrun/projects/operations.py +17 -11
  118. mlrun/projects/pipelines.py +2 -2
  119. mlrun/projects/project.py +187 -123
  120. mlrun/run.py +95 -21
  121. mlrun/runtimes/__init__.py +2 -186
  122. mlrun/runtimes/base.py +103 -25
  123. mlrun/runtimes/constants.py +225 -0
  124. mlrun/runtimes/daskjob.py +5 -2
  125. mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
  126. mlrun/runtimes/local.py +5 -2
  127. mlrun/runtimes/mounts.py +20 -2
  128. mlrun/runtimes/nuclio/__init__.py +12 -7
  129. mlrun/runtimes/nuclio/api_gateway.py +36 -6
  130. mlrun/runtimes/nuclio/application/application.py +339 -40
  131. mlrun/runtimes/nuclio/function.py +222 -72
  132. mlrun/runtimes/nuclio/serving.py +132 -42
  133. mlrun/runtimes/pod.py +213 -21
  134. mlrun/runtimes/utils.py +49 -9
  135. mlrun/secrets.py +99 -14
  136. mlrun/serving/__init__.py +2 -0
  137. mlrun/serving/remote.py +84 -11
  138. mlrun/serving/routers.py +26 -44
  139. mlrun/serving/server.py +138 -51
  140. mlrun/serving/serving_wrapper.py +6 -2
  141. mlrun/serving/states.py +997 -283
  142. mlrun/serving/steps.py +62 -0
  143. mlrun/serving/system_steps.py +149 -95
  144. mlrun/serving/v2_serving.py +9 -10
  145. mlrun/track/trackers/mlflow_tracker.py +29 -31
  146. mlrun/utils/helpers.py +292 -94
  147. mlrun/utils/http.py +9 -2
  148. mlrun/utils/notifications/notification/base.py +18 -0
  149. mlrun/utils/notifications/notification/git.py +3 -5
  150. mlrun/utils/notifications/notification/mail.py +39 -16
  151. mlrun/utils/notifications/notification/slack.py +2 -4
  152. mlrun/utils/notifications/notification/webhook.py +2 -5
  153. mlrun/utils/notifications/notification_pusher.py +3 -3
  154. mlrun/utils/version/version.json +2 -2
  155. mlrun/utils/version/version.py +3 -4
  156. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +63 -74
  157. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +161 -143
  158. mlrun/api/schemas/__init__.py +0 -259
  159. mlrun/db/auth_utils.py +0 -152
  160. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -344
  161. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
  162. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
  163. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1266
  164. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
  165. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
  166. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
  167. {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
@@ -13,7 +13,7 @@
13
13
  # limitations under the License.
14
14
  import json
15
15
  import os
16
- import warnings
16
+ from base64 import b64decode
17
17
  from copy import deepcopy
18
18
  from typing import Optional, Union
19
19
 
@@ -22,6 +22,11 @@ from nuclio import KafkaTrigger
22
22
 
23
23
  import mlrun
24
24
  import mlrun.common.schemas as schemas
25
+ import mlrun.common.secrets
26
+ import mlrun.datastore.datastore_profile as ds_profile
27
+ import mlrun.runtimes.kubejob as kubejob_runtime
28
+ import mlrun.runtimes.nuclio.function as nuclio_function
29
+ import mlrun.runtimes.pod as pod_runtime
25
30
  from mlrun.datastore import get_kafka_brokers_from_dict, parse_kafka_url
26
31
  from mlrun.model import ObjectList
27
32
  from mlrun.runtimes.function_reference import FunctionReference
@@ -42,10 +47,6 @@ from mlrun.serving.states import (
42
47
  )
43
48
  from mlrun.utils import get_caller_globals, logger, set_paths
44
49
 
45
- from .. import KubejobRuntime
46
- from ..pod import KubeResourceSpec
47
- from .function import NuclioSpec, RemoteRuntime, min_nuclio_versions
48
-
49
50
  serving_subkind = "serving_v2"
50
51
 
51
52
 
@@ -83,8 +84,8 @@ def new_v2_model_server(
83
84
  return f
84
85
 
85
86
 
86
- class ServingSpec(NuclioSpec):
87
- _dict_fields = NuclioSpec._dict_fields + [
87
+ class ServingSpec(nuclio_function.NuclioSpec):
88
+ _dict_fields = nuclio_function.NuclioSpec._dict_fields + [
88
89
  "graph",
89
90
  "load_mode",
90
91
  "graph_initializer",
@@ -152,6 +153,7 @@ class ServingSpec(NuclioSpec):
152
153
  disable_default_http_trigger=None,
153
154
  model_endpoint_creation_task_name=None,
154
155
  serving_spec=None,
156
+ auth=None,
155
157
  ):
156
158
  super().__init__(
157
159
  command=command,
@@ -193,6 +195,7 @@ class ServingSpec(NuclioSpec):
193
195
  add_templated_ingress_host_mode=add_templated_ingress_host_mode,
194
196
  disable_default_http_trigger=disable_default_http_trigger,
195
197
  serving_spec=serving_spec,
198
+ auth=auth,
196
199
  )
197
200
 
198
201
  self.models = models or {}
@@ -229,7 +232,7 @@ class ServingSpec(NuclioSpec):
229
232
  self._function_refs = ObjectList.from_list(FunctionReference, function_refs)
230
233
 
231
234
 
232
- class ServingRuntime(RemoteRuntime):
235
+ class ServingRuntime(nuclio_function.RemoteRuntime):
233
236
  """MLRun Serving Runtime"""
234
237
 
235
238
  kind = "serving"
@@ -248,6 +251,8 @@ class ServingRuntime(RemoteRuntime):
248
251
  class_name=None,
249
252
  engine=None,
250
253
  exist_ok=False,
254
+ allow_cyclic: bool = False,
255
+ max_iterations: Optional[int] = None,
251
256
  **class_args,
252
257
  ) -> Union[RootFlowStep, RouterStep]:
253
258
  """set the serving graph topology (router/flow) and root class or params
@@ -278,14 +283,23 @@ class ServingRuntime(RemoteRuntime):
278
283
  :param class_name: - optional for router, router class name/path or router object
279
284
  :param engine: - optional for flow, sync or async engine
280
285
  :param exist_ok: - allow overriding existing topology
286
+ :param allow_cyclic: - allow cyclic graphs (only for async flow)
287
+ :param max_iterations: - optional, max iterations for cyclic graphs (only for async flow)
281
288
  :param class_args: - optional, router/flow class init args
282
289
 
283
- :return graph object (fn.spec.graph)
290
+ :return: graph object (fn.spec.graph)
284
291
  """
285
292
  topology = topology or StepKinds.router
286
293
  if self.spec.graph and not exist_ok:
287
294
  raise mlrun.errors.MLRunInvalidArgumentError(
288
- "graph topology is already set, cannot be overwritten"
295
+ "graph topology is already set, graph was initialized, use exist_ok=True to override"
296
+ )
297
+ if allow_cyclic and (
298
+ topology == StepKinds.router
299
+ or (topology == StepKinds.flow and engine == "sync")
300
+ ):
301
+ raise mlrun.errors.MLRunInvalidArgumentError(
302
+ "cyclic graphs are only supported in flow topology with async engine"
289
303
  )
290
304
 
291
305
  if topology == StepKinds.router:
@@ -299,7 +313,11 @@ class ServingRuntime(RemoteRuntime):
299
313
  step = RouterStep(class_name=class_name, class_args=class_args)
300
314
  self.spec.graph = step
301
315
  elif topology == StepKinds.flow:
302
- self.spec.graph = RootFlowStep(engine=engine or "async")
316
+ self.spec.graph = RootFlowStep(
317
+ engine=engine or "async",
318
+ allow_cyclic=allow_cyclic,
319
+ max_iterations=max_iterations,
320
+ )
303
321
  self.spec.graph.track_models = self.spec.track_models
304
322
  else:
305
323
  raise mlrun.errors.MLRunInvalidArgumentError(
@@ -310,7 +328,6 @@ class ServingRuntime(RemoteRuntime):
310
328
  def set_tracking(
311
329
  self,
312
330
  stream_path: Optional[str] = None,
313
- batch: Optional[int] = None,
314
331
  sampling_percentage: float = 100,
315
332
  stream_args: Optional[dict] = None,
316
333
  enable_tracking: bool = True,
@@ -320,7 +337,6 @@ class ServingRuntime(RemoteRuntime):
320
337
 
321
338
  :param stream_path: Path/url of the tracking stream e.g. v3io:///users/mike/mystream
322
339
  you can use the "dummy://" path for test/simulation.
323
- :param batch: Deprecated. Micro batch size (send micro batches of N records at a time).
324
340
  :param sampling_percentage: Down sampling events that will be pushed to the monitoring stream based on
325
341
  a specified percentage. e.g. 50 for 50%. By default, all events are pushed.
326
342
  :param stream_args: Stream initialization parameters, e.g. shards, retention_in_hours, ..
@@ -368,13 +384,6 @@ class ServingRuntime(RemoteRuntime):
368
384
 
369
385
  if stream_path:
370
386
  self.spec.parameters["log_stream"] = stream_path
371
- if batch:
372
- warnings.warn(
373
- "The `batch` size parameter was deprecated in version 1.8.0 and is no longer used. "
374
- "It will be removed in 1.11.",
375
- # TODO: Remove this in 1.11
376
- FutureWarning,
377
- )
378
387
  if stream_args:
379
388
  self.spec.parameters["stream_args"] = stream_args
380
389
 
@@ -393,7 +402,7 @@ class ServingRuntime(RemoteRuntime):
393
402
  outputs: Optional[list[str]] = None,
394
403
  **class_args,
395
404
  ):
396
- """add ml model and/or route to the function.
405
+ """Add ml model and/or route to the function.
397
406
 
398
407
  Example, create a function (from the notebook), add a model class, and deploy::
399
408
 
@@ -401,7 +410,7 @@ class ServingRuntime(RemoteRuntime):
401
410
  fn.add_model("boost", model_path, model_class="MyClass", my_arg=5)
402
411
  fn.deploy()
403
412
 
404
- only works with router topology, for nested topologies (model under router under flow)
413
+ Only works with router topology. For nested topologies (model under router under flow)
405
414
  need to add router to flow and use router.add_route()
406
415
 
407
416
  :param key: model api key (or name:version), will determine the relative url/path
@@ -414,18 +423,19 @@ class ServingRuntime(RemoteRuntime):
414
423
  with multiple router steps)
415
424
  :param child_function: child function name, when the model runs in a child function
416
425
  :param creation_strategy: Strategy for creating or updating the model endpoint:
417
- * **overwrite**:
418
- 1. If model endpoints with the same name exist, delete the `latest` one.
419
- 2. Create a new model endpoint entry and set it as `latest`.
420
- * **inplace** (default):
421
- 1. If model endpoints with the same name exist, update the `latest` entry.
422
- 2. Otherwise, create a new entry.
423
- * **archive**:
424
- 1. If model endpoints with the same name exist, preserve them.
425
- 2. Create a new model endpoint with the same name and set it to `latest`.
426
- :param outputs: list of the model outputs (e.g. labels) ,if provided will override the outputs that been
427
- configured in the model artifact, please note that those outputs need to be equal to the
428
- model serving function outputs (length, and order)
426
+
427
+ * **overwrite**: If model endpoints with the same name exist, delete the `latest`
428
+ one. Create a new model endpoint entry and set it as `latest`.
429
+
430
+ * **inplace** (default): If model endpoints with the same name exist, update the
431
+ `latest` entry. Otherwise, create a new entry.
432
+
433
+ * **archive**: If model endpoints with the same name exist, preserve them.
434
+ Create a new model endpoint with the same name and set it to `latest`.
435
+
436
+ :param outputs: list of the model outputs (e.g. labels), if provided will override the outputs that were
437
+ configured in the model artifact. Note that those outputs need to be equal to the
438
+ model serving function outputs (length, and order).
429
439
  :param class_args: extra kwargs to pass to the model serving class __init__
430
440
  (can be read in the model using .get_param(key) method)
431
441
  """
@@ -518,7 +528,7 @@ class ServingRuntime(RemoteRuntime):
518
528
  :param requirements: py package requirements file path OR list of packages
519
529
  :param kind: mlrun function/runtime kind
520
530
 
521
- :return function object
531
+ :return: function object
522
532
  """
523
533
  function_reference = FunctionReference(
524
534
  url,
@@ -633,14 +643,19 @@ class ServingRuntime(RemoteRuntime):
633
643
 
634
644
  :returns: The Runtime (function) object
635
645
  """
636
-
646
+ if kind == "azure_vault" and isinstance(source, dict):
647
+ candidate_secret_name = (source.get("k8s_secret") or "").strip()
648
+ if candidate_secret_name:
649
+ mlrun.common.secrets.validate_not_forbidden_secret(
650
+ candidate_secret_name
651
+ )
637
652
  if kind == "vault" and isinstance(source, list):
638
653
  source = {"project": self.metadata.project, "secrets": source}
639
654
 
640
655
  self.spec.secret_sources.append({"kind": kind, "source": source})
641
656
  return self
642
657
 
643
- @min_nuclio_versions("1.12.10")
658
+ @nuclio_function.min_nuclio_versions("1.12.10")
644
659
  def deploy(
645
660
  self,
646
661
  project="",
@@ -657,6 +672,7 @@ class ServingRuntime(RemoteRuntime):
657
672
  :param builder_env: env vars dict for source archive config/credentials e.g. builder_env={"GIT_TOKEN": token}
658
673
  :param force_build: set True for force building the image
659
674
  """
675
+
660
676
  load_mode = self.spec.load_mode
661
677
  if load_mode and load_mode not in ["sync", "async"]:
662
678
  raise ValueError(f"illegal model loading mode {load_mode}")
@@ -677,6 +693,21 @@ class ServingRuntime(RemoteRuntime):
677
693
  f"function {function} is used in steps and is not defined, "
678
694
  "use the .add_child_function() to specify child function attributes"
679
695
  )
696
+ if (
697
+ isinstance(self.spec.graph, RootFlowStep)
698
+ and any(
699
+ isinstance(step_type, mlrun.serving.states.ModelRunnerStep)
700
+ for step_type in self.spec.graph.steps.values()
701
+ )
702
+ and self.spec.build.functionSourceCode
703
+ ):
704
+ # Add import for LLModel
705
+ decoded_code = b64decode(self.spec.build.functionSourceCode).decode("utf-8")
706
+ import_llmodel_code = "\nfrom mlrun.serving.states import LLModel\n"
707
+ if import_llmodel_code not in decoded_code:
708
+ decoded_code += import_llmodel_code
709
+ encoded_code = mlrun.utils.helpers.encode_user_code(decoded_code)
710
+ self.spec.build.functionSourceCode = encoded_code
680
711
 
681
712
  # Handle secret processing before handling child functions, since secrets are transferred to them
682
713
  if self.spec.secret_sources:
@@ -740,6 +771,7 @@ class ServingRuntime(RemoteRuntime):
740
771
  current_function="*",
741
772
  track_models=False,
742
773
  workdir=None,
774
+ stream_profile: Optional[ds_profile.DatastoreProfile] = None,
743
775
  **kwargs,
744
776
  ) -> GraphServer:
745
777
  """create mock server object for local testing/emulation
@@ -748,6 +780,7 @@ class ServingRuntime(RemoteRuntime):
748
780
  :param current_function: specify if you want to simulate a child function, * for all functions
749
781
  :param track_models: allow model tracking (disabled by default in the mock server)
750
782
  :param workdir: working directory to locate the source code (if not the current one)
783
+ :param stream_profile: stream profile to use for the mock server output stream.
751
784
  """
752
785
 
753
786
  # set the namespaces/modules to look for the steps code in
@@ -787,6 +820,7 @@ class ServingRuntime(RemoteRuntime):
787
820
  logger=logger,
788
821
  is_mock=True,
789
822
  monitoring_mock=self.spec.track_models,
823
+ stream_profile=stream_profile,
790
824
  )
791
825
 
792
826
  server.graph = add_system_steps_to_graph(
@@ -835,14 +869,28 @@ class ServingRuntime(RemoteRuntime):
835
869
  )
836
870
  self._mock_server = self.to_mock_server()
837
871
 
838
- def to_job(self) -> KubejobRuntime:
839
- """Convert this ServingRuntime to a KubejobRuntime, so that the graph can be run as a standalone job."""
872
+ def to_job(
873
+ self, func_name: Optional[str] = None
874
+ ) -> "kubejob_runtime.KubejobRuntime":
875
+ """Convert this ServingRuntime to a KubejobRuntime, so that the graph can be run as a standalone job.
876
+
877
+ Args:
878
+ func_name: Optional custom name for the job function. If not provided, automatically
879
+ appends '-batch' suffix to the serving function name to prevent database collision.
880
+
881
+ Returns:
882
+ KubejobRuntime configured to execute the serving graph as a batch job.
883
+
884
+ Note:
885
+ The job will have a different name than the serving function to prevent database collision.
886
+ The original serving function remains unchanged and can still be invoked after running the job.
887
+ """
840
888
  if self.spec.function_refs:
841
889
  raise mlrun.errors.MLRunInvalidArgumentError(
842
890
  f"Cannot convert function '{self.metadata.name}' to a job because it has child functions"
843
891
  )
844
892
 
845
- spec = KubeResourceSpec(
893
+ spec = pod_runtime.KubeResourceSpec(
846
894
  image=self.spec.image,
847
895
  mode=self.spec.mode,
848
896
  volumes=self.spec.volumes,
@@ -870,8 +918,50 @@ class ServingRuntime(RemoteRuntime):
870
918
  parameters=self.spec.parameters,
871
919
  graph=self.spec.graph,
872
920
  )
873
- job = KubejobRuntime(
921
+
922
+ job_metadata = deepcopy(self.metadata)
923
+ original_name = job_metadata.name
924
+
925
+ if func_name:
926
+ # User provided explicit job name
927
+ job_metadata.name = func_name
928
+ logger.debug(
929
+ "Creating job from serving function with custom name",
930
+ new_name=func_name,
931
+ )
932
+ else:
933
+ job_metadata.name, was_renamed, suffix = (
934
+ mlrun.utils.helpers.ensure_batch_job_suffix(job_metadata.name)
935
+ )
936
+
937
+ # Check if the resulting name exceeds Kubernetes length limit
938
+ if (
939
+ len(job_metadata.name)
940
+ > mlrun.common.constants.K8S_DNS_1123_LABEL_MAX_LENGTH
941
+ ):
942
+ raise mlrun.errors.MLRunInvalidArgumentError(
943
+ f"Cannot convert serving function '{original_name}' to batch job: "
944
+ f"the resulting name '{job_metadata.name}' ({len(job_metadata.name)} characters) "
945
+ f"exceeds Kubernetes limit of {mlrun.common.constants.K8S_DNS_1123_LABEL_MAX_LENGTH} characters. "
946
+ f"Please provide a custom name via the func_name parameter, "
947
+ f"with at most {mlrun.common.constants.K8S_DNS_1123_LABEL_MAX_LENGTH} characters."
948
+ )
949
+
950
+ if was_renamed:
951
+ logger.info(
952
+ "Creating job from serving function (auto-appended suffix to prevent collision)",
953
+ new_name=job_metadata.name,
954
+ suffix=suffix,
955
+ )
956
+ else:
957
+ logger.debug(
958
+ "Creating job from serving function (name already has suffix)",
959
+ name=original_name,
960
+ suffix=suffix,
961
+ )
962
+
963
+ job = kubejob_runtime.KubejobRuntime(
874
964
  spec=spec,
875
- metadata=self.metadata,
965
+ metadata=job_metadata,
876
966
  )
877
967
  return job
mlrun/runtimes/pod.py CHANGED
@@ -17,14 +17,17 @@ import os
17
17
  import re
18
18
  import time
19
19
  import typing
20
+ import warnings
20
21
  from collections.abc import Iterable
21
22
  from enum import Enum
23
+ from typing import Optional
22
24
 
23
25
  import dotenv
24
26
  import kubernetes.client as k8s_client
25
27
  from kubernetes.client import V1Volume, V1VolumeMount
26
28
 
27
29
  import mlrun.common.constants
30
+ import mlrun.common.secrets
28
31
  import mlrun.errors
29
32
  import mlrun.runtimes.mounts
30
33
  import mlrun.utils.regex
@@ -35,6 +38,7 @@ from mlrun.common.schemas import (
35
38
 
36
39
  from ..config import config as mlconf
37
40
  from ..k8s_utils import (
41
+ generate_preemptible_nodes_affinity_terms,
38
42
  validate_node_selectors,
39
43
  )
40
44
  from ..utils import logger, update_in
@@ -107,6 +111,7 @@ class KubeResourceSpec(FunctionSpec):
107
111
  "track_models",
108
112
  "parameters",
109
113
  "graph",
114
+ "filename",
110
115
  ]
111
116
  _default_fields_to_strip = FunctionSpec._default_fields_to_strip + [
112
117
  "volumes",
@@ -705,19 +710,45 @@ class KubeResource(BaseRuntime):
705
710
  def spec(self, spec):
706
711
  self._spec = self._verify_dict(spec, "spec", KubeResourceSpec)
707
712
 
708
- def set_env_from_secret(self, name, secret=None, secret_key=None):
709
- """set pod environment var from secret"""
710
- secret_key = secret_key or name
713
+ def set_env_from_secret(
714
+ self,
715
+ name: str,
716
+ secret: Optional[str] = None,
717
+ secret_key: Optional[str] = None,
718
+ ):
719
+ """
720
+ Set an environment variable from a Kubernetes Secret.
721
+ Client-side guard forbids MLRun internal auth/project secrets; no-op on API.
722
+ """
723
+ mlrun.common.secrets.validate_not_forbidden_secret(secret)
724
+ key = secret_key or name
711
725
  value_from = k8s_client.V1EnvVarSource(
712
- secret_key_ref=k8s_client.V1SecretKeySelector(name=secret, key=secret_key)
726
+ secret_key_ref=k8s_client.V1SecretKeySelector(name=secret, key=key)
713
727
  )
714
- return self._set_env(name, value_from=value_from)
728
+ return self._set_env(name=name, value_from=value_from)
729
+
730
+ def set_env(
731
+ self,
732
+ name: str,
733
+ value: Optional[str] = None,
734
+ value_from: Optional[typing.Any] = None,
735
+ ):
736
+ """
737
+ Set an environment variable.
738
+ If value comes from a Secret, validate on client-side only.
739
+ """
740
+ if value_from is not None:
741
+ secret_name = self._extract_secret_name_from_value_from(
742
+ value_from=value_from
743
+ )
744
+ if secret_name:
745
+ mlrun.common.secrets.validate_not_forbidden_secret(secret_name)
746
+ return self._set_env(name=name, value_from=value_from)
715
747
 
716
- def set_env(self, name, value=None, value_from=None):
717
- """set pod environment var from value"""
718
- if value is not None:
719
- return self._set_env(name, value=str(value))
720
- return self._set_env(name, value_from=value_from)
748
+ # Plain literal value path
749
+ return self._set_env(
750
+ name=name, value=(str(value) if value is not None else None)
751
+ )
721
752
 
722
753
  def with_annotations(self, annotations: dict):
723
754
  """set a key/value annotations in the metadata of the pod"""
@@ -874,6 +905,133 @@ class KubeResource(BaseRuntime):
874
905
  """
875
906
  self.spec.with_requests(mem, cpu, patch=patch)
876
907
 
908
+ @staticmethod
909
+ def detect_preemptible_node_selector(node_selector: dict[str, str]) -> list[str]:
910
+ """
911
+ Check whether any provided node selector matches preemptible selectors.
912
+
913
+ :param node_selector: User-provided node selector mapping.
914
+ :return: List of `"key='value'"` strings that match a preemptible selector.
915
+ """
916
+ preemptible_node_selector = mlconf.get_preemptible_node_selector()
917
+
918
+ return [
919
+ f"'{key}': '{val}'"
920
+ for key, val in node_selector.items()
921
+ if preemptible_node_selector.get(key) == val
922
+ ]
923
+
924
+ def detect_preemptible_tolerations(
925
+ self, tolerations: list[k8s_client.V1Toleration]
926
+ ) -> list[str]:
927
+ """
928
+ Check whether any provided toleration matches preemptible tolerations.
929
+
930
+ :param tolerations: User-provided tolerations.
931
+ :return: List of formatted toleration strings that are considered preemptible.
932
+ """
933
+ preemptible_tolerations = [
934
+ k8s_client.V1Toleration(
935
+ key=toleration.get("key"),
936
+ value=toleration.get("value"),
937
+ effect=toleration.get("effect"),
938
+ )
939
+ for toleration in mlconf.get_preemptible_tolerations()
940
+ ]
941
+
942
+ def _format_toleration(toleration):
943
+ return f"'{toleration.key}'='{toleration.value}' (effect: '{toleration.effect}')"
944
+
945
+ return [
946
+ _format_toleration(toleration)
947
+ for toleration in tolerations
948
+ if toleration in preemptible_tolerations
949
+ ]
950
+
951
+ def detect_preemptible_affinity(self, affinity: k8s_client.V1Affinity) -> list[str]:
952
+ """
953
+ Check whether any provided affinity rules match preemptible affinity configs.
954
+
955
+ :param affinity: User-provided affinity object.
956
+ :return: List of formatted expressions that overlap with preemptible terms.
957
+ """
958
+ preemptible_affinity_terms = generate_preemptible_nodes_affinity_terms()
959
+ conflicting_affinities = []
960
+
961
+ if (
962
+ affinity
963
+ and affinity.node_affinity
964
+ and affinity.node_affinity.required_during_scheduling_ignored_during_execution
965
+ ):
966
+ user_terms = affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms
967
+ for user_term in user_terms:
968
+ user_expressions = {
969
+ (expr.key, expr.operator, tuple(expr.values or []))
970
+ for expr in user_term.match_expressions or []
971
+ }
972
+
973
+ for preemptible_term in preemptible_affinity_terms:
974
+ preemptible_expressions = {
975
+ (expr.key, expr.operator, tuple(expr.values or []))
976
+ for expr in preemptible_term.match_expressions or []
977
+ }
978
+
979
+ # Ensure operators match and preemptible expressions are present
980
+ common_exprs = user_expressions & preemptible_expressions
981
+ if common_exprs:
982
+ formatted = ", ".join(
983
+ f"'{key} {operator} {list(values)}'"
984
+ for key, operator, values in common_exprs
985
+ )
986
+ conflicting_affinities.append(formatted)
987
+ return conflicting_affinities
988
+
989
+ def raise_preemptible_warning(
990
+ self,
991
+ node_selector: typing.Optional[dict[str, str]],
992
+ tolerations: typing.Optional[list[k8s_client.V1Toleration]],
993
+ affinity: typing.Optional[k8s_client.V1Affinity],
994
+ ) -> None:
995
+ """
996
+ Detect conflicts and emit a single consolidated warning if needed.
997
+
998
+ :param node_selector: User-provided node selector.
999
+ :param tolerations: User-provided tolerations.
1000
+ :param affinity: User-provided affinity.
1001
+ :warns: PreemptionWarning - Emitted when any of the provided selectors,
1002
+ tolerations, or affinity terms match the configured preemptible
1003
+ settings. The message lists the conflicting items.
1004
+ """
1005
+ conflict_messages = []
1006
+
1007
+ if node_selector:
1008
+ ns_conflicts = ", ".join(
1009
+ self.detect_preemptible_node_selector(node_selector)
1010
+ )
1011
+ if ns_conflicts:
1012
+ conflict_messages.append(f"Node selectors: {ns_conflicts}")
1013
+
1014
+ if tolerations:
1015
+ tol_conflicts = ", ".join(self.detect_preemptible_tolerations(tolerations))
1016
+ if tol_conflicts:
1017
+ conflict_messages.append(f"Tolerations: {tol_conflicts}")
1018
+
1019
+ if affinity:
1020
+ affinity_conflicts = ", ".join(self.detect_preemptible_affinity(affinity))
1021
+ if affinity_conflicts:
1022
+ conflict_messages.append(f"Affinity: {affinity_conflicts}")
1023
+
1024
+ if conflict_messages:
1025
+ warning_componentes = "; \n".join(conflict_messages)
1026
+ warnings.warn(
1027
+ f"Warning: based on MLRun's preemptible node configuration, the following components \n"
1028
+ f"may be removed or adjusted at runtime:\n"
1029
+ f"{warning_componentes}.\n"
1030
+ "This adjustment depends on the function's preemption mode. \n"
1031
+ "The list of potential adjusted preemptible selectors can be viewed here: "
1032
+ "mlrun.mlconf.get_preemptible_node_selector() and mlrun.mlconf.get_preemptible_tolerations()."
1033
+ )
1034
+
877
1035
  def with_node_selection(
878
1036
  self,
879
1037
  node_name: typing.Optional[str] = None,
@@ -882,18 +1040,26 @@ class KubeResource(BaseRuntime):
882
1040
  tolerations: typing.Optional[list[k8s_client.V1Toleration]] = None,
883
1041
  ):
884
1042
  """
885
- Enables to control on which k8s node the job will run
886
-
887
- :param node_name: The name of the k8s node
888
- :param node_selector: Label selector, only nodes with matching labels will be eligible to be picked
889
- :param affinity: Expands the types of constraints you can express - see
890
- https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity
891
- for details
892
- :param tolerations: Tolerations are applied to pods, and allow (but do not require) the pods to schedule
893
- onto nodes with matching taints - see
894
- https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration
895
- for details
1043
+ Configure Kubernetes node scheduling for this function.
1044
+
1045
+ Updates one or more scheduling hints: exact node pinning, label-based selection,
1046
+ affinity/anti-affinity rules, and taint tolerations. Passing ``None`` leaves the
1047
+ current value unchanged; pass an empty dict/list (e.g., ``{}``, ``[]``) to clear.
1048
+
1049
+ :param node_name: Exact Kubernetes node name to pin the pod to.
1050
+ :param node_selector: Mapping of label selectors. Use ``{}`` to clear.
1051
+ :param affinity: :class:`kubernetes.client.V1Affinity` constraints.
1052
+ :param tolerations: List of :class:`kubernetes.client.V1Toleration`. Use ``[]`` to clear.
1053
+ :warns: PreemptionWarning - Emitted if provided selectors/tolerations/affinity
1054
+ conflict with the function's preemption mode.
1055
+
1056
+ Example usage:
1057
+ Prefer a GPU pool and allow scheduling on spot nodes::
896
1058
 
1059
+ job.with_node_selection(
1060
+ node_selector={"nodepool": "gpu"},
1061
+ tolerations=[k8s_client.V1Toleration(key="spot", operator="Exists")],
1062
+ )
897
1063
  """
898
1064
  if node_name:
899
1065
  self.spec.node_name = node_name
@@ -904,6 +1070,11 @@ class KubeResource(BaseRuntime):
904
1070
  self.spec.affinity = affinity
905
1071
  if tolerations is not None:
906
1072
  self.spec.tolerations = tolerations
1073
+ self.raise_preemptible_warning(
1074
+ node_selector=self.spec.node_selector,
1075
+ tolerations=self.spec.tolerations,
1076
+ affinity=self.spec.affinity,
1077
+ )
907
1078
 
908
1079
  def with_priority_class(self, name: typing.Optional[str] = None):
909
1080
  """
@@ -1223,6 +1394,27 @@ class KubeResource(BaseRuntime):
1223
1394
 
1224
1395
  return self.status.state
1225
1396
 
1397
+ @staticmethod
1398
+ def _extract_secret_name_from_value_from(
1399
+ value_from: typing.Any,
1400
+ ) -> Optional[str]:
1401
+ """Extract secret name from a V1EnvVarSource or dict representation."""
1402
+ if isinstance(value_from, k8s_client.V1EnvVarSource):
1403
+ if value_from.secret_key_ref:
1404
+ return value_from.secret_key_ref.name
1405
+ elif isinstance(value_from, dict):
1406
+ value_from = (
1407
+ value_from.get("valueFrom")
1408
+ or value_from.get("value_from")
1409
+ or value_from
1410
+ )
1411
+ secret_key_ref = (value_from or {}).get("secretKeyRef") or (
1412
+ value_from or {}
1413
+ ).get("secret_key_ref")
1414
+ if isinstance(secret_key_ref, dict):
1415
+ return secret_key_ref.get("name")
1416
+ return None
1417
+
1226
1418
 
1227
1419
  def _resolve_if_type_sanitized(attribute_name, attribute):
1228
1420
  attribute_config = sanitized_attributes[attribute_name]