mlrun 1.10.0rc18__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +24 -3
- mlrun/__main__.py +0 -4
- mlrun/artifacts/dataset.py +2 -2
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +21 -15
- mlrun/artifacts/model.py +3 -3
- mlrun/artifacts/plots.py +1 -1
- mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
- mlrun/auth/nuclio.py +89 -0
- mlrun/auth/providers.py +429 -0
- mlrun/auth/utils.py +415 -0
- mlrun/common/constants.py +14 -0
- mlrun/common/model_monitoring/helpers.py +123 -0
- mlrun/common/runtimes/constants.py +28 -0
- mlrun/common/schemas/__init__.py +14 -3
- mlrun/common/schemas/alert.py +2 -2
- mlrun/common/schemas/api_gateway.py +3 -0
- mlrun/common/schemas/auth.py +12 -10
- mlrun/common/schemas/client_spec.py +4 -0
- mlrun/common/schemas/constants.py +25 -0
- mlrun/common/schemas/frontend_spec.py +1 -8
- mlrun/common/schemas/function.py +34 -0
- mlrun/common/schemas/hub.py +33 -20
- mlrun/common/schemas/model_monitoring/__init__.py +2 -1
- mlrun/common/schemas/model_monitoring/constants.py +12 -15
- mlrun/common/schemas/model_monitoring/functions.py +13 -4
- mlrun/common/schemas/model_monitoring/model_endpoints.py +11 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/secret.py +17 -2
- mlrun/common/secrets.py +95 -1
- mlrun/common/types.py +10 -10
- mlrun/config.py +69 -19
- mlrun/data_types/infer.py +2 -2
- mlrun/datastore/__init__.py +12 -5
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/base.py +274 -10
- mlrun/datastore/datastore.py +7 -2
- mlrun/datastore/datastore_profile.py +84 -22
- mlrun/datastore/model_provider/huggingface_provider.py +225 -41
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +206 -74
- mlrun/datastore/model_provider/openai_provider.py +226 -66
- mlrun/datastore/s3.py +39 -18
- mlrun/datastore/sources.py +1 -1
- mlrun/datastore/store_resources.py +4 -4
- mlrun/datastore/storeytargets.py +17 -12
- mlrun/datastore/targets.py +1 -1
- mlrun/datastore/utils.py +25 -6
- mlrun/datastore/v3io.py +1 -1
- mlrun/db/base.py +63 -32
- mlrun/db/httpdb.py +373 -153
- mlrun/db/nopdb.py +54 -21
- mlrun/errors.py +4 -2
- mlrun/execution.py +66 -25
- mlrun/feature_store/api.py +1 -1
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_vector_utils.py +1 -1
- mlrun/feature_store/steps.py +8 -6
- mlrun/frameworks/_common/utils.py +3 -3
- mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
- mlrun/frameworks/_ml_common/utils.py +2 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
- mlrun/frameworks/onnx/dataset.py +2 -1
- mlrun/frameworks/onnx/mlrun_interface.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/utils.py +2 -1
- mlrun/frameworks/sklearn/metric.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/hub/__init__.py +52 -0
- mlrun/hub/base.py +142 -0
- mlrun/hub/module.py +172 -0
- mlrun/hub/step.py +113 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +15 -7
- mlrun/launcher/local.py +4 -1
- mlrun/model.py +14 -4
- mlrun/model_monitoring/__init__.py +0 -1
- mlrun/model_monitoring/api.py +65 -28
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +299 -128
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/controller.py +132 -58
- mlrun/model_monitoring/db/_schedules.py +38 -29
- mlrun/model_monitoring/db/_stats.py +6 -16
- mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
- mlrun/model_monitoring/db/tsdb/base.py +29 -9
- mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
- mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +20 -9
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +235 -51
- mlrun/model_monitoring/features_drift_table.py +2 -1
- mlrun/model_monitoring/helpers.py +30 -6
- mlrun/model_monitoring/stream_processing.py +34 -28
- mlrun/model_monitoring/writer.py +224 -4
- mlrun/package/__init__.py +2 -1
- mlrun/platforms/__init__.py +0 -43
- mlrun/platforms/iguazio.py +8 -4
- mlrun/projects/operations.py +17 -11
- mlrun/projects/pipelines.py +2 -2
- mlrun/projects/project.py +187 -123
- mlrun/run.py +95 -21
- mlrun/runtimes/__init__.py +2 -186
- mlrun/runtimes/base.py +103 -25
- mlrun/runtimes/constants.py +225 -0
- mlrun/runtimes/daskjob.py +5 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/nuclio/__init__.py +12 -7
- mlrun/runtimes/nuclio/api_gateway.py +36 -6
- mlrun/runtimes/nuclio/application/application.py +339 -40
- mlrun/runtimes/nuclio/function.py +222 -72
- mlrun/runtimes/nuclio/serving.py +132 -42
- mlrun/runtimes/pod.py +213 -21
- mlrun/runtimes/utils.py +49 -9
- mlrun/secrets.py +99 -14
- mlrun/serving/__init__.py +2 -0
- mlrun/serving/remote.py +84 -11
- mlrun/serving/routers.py +26 -44
- mlrun/serving/server.py +138 -51
- mlrun/serving/serving_wrapper.py +6 -2
- mlrun/serving/states.py +997 -283
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +149 -95
- mlrun/serving/v2_serving.py +9 -10
- mlrun/track/trackers/mlflow_tracker.py +29 -31
- mlrun/utils/helpers.py +292 -94
- mlrun/utils/http.py +9 -2
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +3 -5
- mlrun/utils/notifications/notification/mail.py +39 -16
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +3 -3
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +3 -4
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +63 -74
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +161 -143
- mlrun/api/schemas/__init__.py +0 -259
- mlrun/db/auth_utils.py +0 -152
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -344
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1266
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc18.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
mlrun/runtimes/nuclio/serving.py
CHANGED
@@ -13,7 +13,7 @@
 # limitations under the License.
 import json
 import os
-import
+from base64 import b64decode
 from copy import deepcopy
 from typing import Optional, Union

@@ -22,6 +22,11 @@ from nuclio import KafkaTrigger

 import mlrun
 import mlrun.common.schemas as schemas
+import mlrun.common.secrets
+import mlrun.datastore.datastore_profile as ds_profile
+import mlrun.runtimes.kubejob as kubejob_runtime
+import mlrun.runtimes.nuclio.function as nuclio_function
+import mlrun.runtimes.pod as pod_runtime
 from mlrun.datastore import get_kafka_brokers_from_dict, parse_kafka_url
 from mlrun.model import ObjectList
 from mlrun.runtimes.function_reference import FunctionReference
@@ -42,10 +47,6 @@ from mlrun.serving.states import (
 )
 from mlrun.utils import get_caller_globals, logger, set_paths

-from .. import KubejobRuntime
-from ..pod import KubeResourceSpec
-from .function import NuclioSpec, RemoteRuntime, min_nuclio_versions
-
 serving_subkind = "serving_v2"


@@ -83,8 +84,8 @@ def new_v2_model_server(
     return f


-class ServingSpec(NuclioSpec):
-    _dict_fields = NuclioSpec._dict_fields + [
+class ServingSpec(nuclio_function.NuclioSpec):
+    _dict_fields = nuclio_function.NuclioSpec._dict_fields + [
         "graph",
         "load_mode",
         "graph_initializer",
@@ -152,6 +153,7 @@ class ServingSpec(NuclioSpec):
         disable_default_http_trigger=None,
         model_endpoint_creation_task_name=None,
         serving_spec=None,
+        auth=None,
     ):
         super().__init__(
             command=command,
@@ -193,6 +195,7 @@ class ServingSpec(NuclioSpec):
             add_templated_ingress_host_mode=add_templated_ingress_host_mode,
             disable_default_http_trigger=disable_default_http_trigger,
             serving_spec=serving_spec,
+            auth=auth,
         )

         self.models = models or {}
@@ -229,7 +232,7 @@ class ServingSpec(NuclioSpec):
         self._function_refs = ObjectList.from_list(FunctionReference, function_refs)


-class ServingRuntime(RemoteRuntime):
+class ServingRuntime(nuclio_function.RemoteRuntime):
     """MLRun Serving Runtime"""

     kind = "serving"
@@ -248,6 +251,8 @@ class ServingRuntime(RemoteRuntime):
         class_name=None,
         engine=None,
         exist_ok=False,
+        allow_cyclic: bool = False,
+        max_iterations: Optional[int] = None,
         **class_args,
     ) -> Union[RootFlowStep, RouterStep]:
         """set the serving graph topology (router/flow) and root class or params
@@ -278,14 +283,23 @@ class ServingRuntime(RemoteRuntime):
         :param class_name:      - optional for router, router class name/path or router object
         :param engine:          - optional for flow, sync or async engine
         :param exist_ok:        - allow overriding existing topology
+        :param allow_cyclic:    - allow cyclic graphs (only for async flow)
+        :param max_iterations:  - optional, max iterations for cyclic graphs (only for async flow)
         :param class_args:      - optional, router/flow class init args

-        :return graph object (fn.spec.graph)
+        :return: graph object (fn.spec.graph)
         """
         topology = topology or StepKinds.router
         if self.spec.graph and not exist_ok:
             raise mlrun.errors.MLRunInvalidArgumentError(
-                "graph topology is already set,
+                "graph topology is already set, graph was initialized, use exist_ok=True to override"
+            )
+        if allow_cyclic and (
+            topology == StepKinds.router
+            or (topology == StepKinds.flow and engine == "sync")
+        ):
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "cyclic graphs are only supported in flow topology with async engine"
             )

         if topology == StepKinds.router:
@@ -299,7 +313,11 @@ class ServingRuntime(RemoteRuntime):
             step = RouterStep(class_name=class_name, class_args=class_args)
             self.spec.graph = step
         elif topology == StepKinds.flow:
-            self.spec.graph = RootFlowStep(
+            self.spec.graph = RootFlowStep(
+                engine=engine or "async",
+                allow_cyclic=allow_cyclic,
+                max_iterations=max_iterations,
+            )
             self.spec.graph.track_models = self.spec.track_models
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(
@@ -310,7 +328,6 @@ class ServingRuntime(RemoteRuntime):
     def set_tracking(
         self,
         stream_path: Optional[str] = None,
-        batch: Optional[int] = None,
         sampling_percentage: float = 100,
         stream_args: Optional[dict] = None,
         enable_tracking: bool = True,
@@ -320,7 +337,6 @@ class ServingRuntime(RemoteRuntime):

         :param stream_path:          Path/url of the tracking stream e.g. v3io:///users/mike/mystream
                                      you can use the "dummy://" path for test/simulation.
-        :param batch:                Deprecated. Micro batch size (send micro batches of N records at a time).
         :param sampling_percentage:  Down sampling events that will be pushed to the monitoring stream based on
                                      a specified percentage. e.g. 50 for 50%. By default, all events are pushed.
         :param stream_args:          Stream initialization parameters, e.g. shards, retention_in_hours, ..
@@ -368,13 +384,6 @@ class ServingRuntime(RemoteRuntime):

         if stream_path:
             self.spec.parameters["log_stream"] = stream_path
-        if batch:
-            warnings.warn(
-                "The `batch` size parameter was deprecated in version 1.8.0 and is no longer used. "
-                "It will be removed in 1.11.",
-                # TODO: Remove this in 1.11
-                FutureWarning,
-            )
         if stream_args:
             self.spec.parameters["stream_args"] = stream_args

@@ -393,7 +402,7 @@ class ServingRuntime(RemoteRuntime):
         outputs: Optional[list[str]] = None,
         **class_args,
     ):
-        """
+        """Add ml model and/or route to the function.

         Example, create a function (from the notebook), add a model class, and deploy::

@@ -401,7 +410,7 @@ class ServingRuntime(RemoteRuntime):
             fn.add_model("boost", model_path, model_class="MyClass", my_arg=5)
             fn.deploy()

-
+        Only works with router topology. For nested topologies (model under router under flow)
         need to add router to flow and use router.add_route()

         :param key:             model api key (or name:version), will determine the relative url/path
@@ -414,18 +423,19 @@ class ServingRuntime(RemoteRuntime):
                                 with multiple router steps)
         :param child_function:  child function name, when the model runs in a child function
         :param creation_strategy: Strategy for creating or updating the model endpoint:
-
-
-
-
-
-
-
-
-
-
-
-                                model
+
+            * **overwrite**: If model endpoints with the same name exist, delete the `latest`
+              one. Create a new model endpoint entry and set it as `latest`.
+
+            * **inplace** (default): If model endpoints with the same name exist, update the
+              `latest` entry. Otherwise, create a new entry.
+
+            * **archive**: If model endpoints with the same name exist, preserve them.
+              Create a new model endpoint with the same name and set it to `latest`.
+
+        :param outputs:         list of the model outputs (e.g. labels), if provided will override the outputs that were
+                                configured in the model artifact. Note that those outputs need to be equal to the
+                                model serving function outputs (length, and order).
         :param class_args:      extra kwargs to pass to the model serving class __init__
                                 (can be read in the model using .get_param(key) method)
         """
@@ -518,7 +528,7 @@ class ServingRuntime(RemoteRuntime):
         :param requirements: py package requirements file path OR list of packages
         :param kind:         mlrun function/runtime kind

-        :return function object
+        :return: function object
         """
         function_reference = FunctionReference(
             url,
@@ -633,14 +643,19 @@ class ServingRuntime(RemoteRuntime):

         :returns: The Runtime (function) object
         """
-
+        if kind == "azure_vault" and isinstance(source, dict):
+            candidate_secret_name = (source.get("k8s_secret") or "").strip()
+            if candidate_secret_name:
+                mlrun.common.secrets.validate_not_forbidden_secret(
+                    candidate_secret_name
+                )
         if kind == "vault" and isinstance(source, list):
             source = {"project": self.metadata.project, "secrets": source}

         self.spec.secret_sources.append({"kind": kind, "source": source})
         return self

-    @min_nuclio_versions("1.12.10")
+    @nuclio_function.min_nuclio_versions("1.12.10")
     def deploy(
         self,
         project="",
@@ -657,6 +672,7 @@ class ServingRuntime(RemoteRuntime):
         :param builder_env:  env vars dict for source archive config/credentials e.g. builder_env={"GIT_TOKEN": token}
         :param force_build:  set True for force building the image
         """
+
         load_mode = self.spec.load_mode
         if load_mode and load_mode not in ["sync", "async"]:
             raise ValueError(f"illegal model loading mode {load_mode}")
@@ -677,6 +693,21 @@ class ServingRuntime(RemoteRuntime):
                     f"function {function} is used in steps and is not defined, "
                     "use the .add_child_function() to specify child function attributes"
                 )
+        if (
+            isinstance(self.spec.graph, RootFlowStep)
+            and any(
+                isinstance(step_type, mlrun.serving.states.ModelRunnerStep)
+                for step_type in self.spec.graph.steps.values()
+            )
+            and self.spec.build.functionSourceCode
+        ):
+            # Add import for LLModel
+            decoded_code = b64decode(self.spec.build.functionSourceCode).decode("utf-8")
+            import_llmodel_code = "\nfrom mlrun.serving.states import LLModel\n"
+            if import_llmodel_code not in decoded_code:
+                decoded_code += import_llmodel_code
+                encoded_code = mlrun.utils.helpers.encode_user_code(decoded_code)
+                self.spec.build.functionSourceCode = encoded_code

         # Handle secret processing before handling child functions, since secrets are transferred to them
         if self.spec.secret_sources:
@@ -740,6 +771,7 @@ class ServingRuntime(RemoteRuntime):
         current_function="*",
         track_models=False,
         workdir=None,
+        stream_profile: Optional[ds_profile.DatastoreProfile] = None,
         **kwargs,
     ) -> GraphServer:
         """create mock server object for local testing/emulation
@@ -748,6 +780,7 @@ class ServingRuntime(RemoteRuntime):
         :param current_function: specify if you want to simulate a child function, * for all functions
         :param track_models:     allow model tracking (disabled by default in the mock server)
         :param workdir:          working directory to locate the source code (if not the current one)
+        :param stream_profile:   stream profile to use for the mock server output stream.
         """

         # set the namespaces/modules to look for the steps code in
@@ -787,6 +820,7 @@ class ServingRuntime(RemoteRuntime):
             logger=logger,
             is_mock=True,
             monitoring_mock=self.spec.track_models,
+            stream_profile=stream_profile,
         )

         server.graph = add_system_steps_to_graph(
@@ -835,14 +869,28 @@ class ServingRuntime(RemoteRuntime):
         )
         self._mock_server = self.to_mock_server()

-    def to_job(
-
+    def to_job(
+        self, func_name: Optional[str] = None
+    ) -> "kubejob_runtime.KubejobRuntime":
+        """Convert this ServingRuntime to a KubejobRuntime, so that the graph can be run as a standalone job.
+
+        Args:
+            func_name: Optional custom name for the job function. If not provided, automatically
+                appends '-batch' suffix to the serving function name to prevent database collision.
+
+        Returns:
+            KubejobRuntime configured to execute the serving graph as a batch job.
+
+        Note:
+            The job will have a different name than the serving function to prevent database collision.
+            The original serving function remains unchanged and can still be invoked after running the job.
+        """
         if self.spec.function_refs:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 f"Cannot convert function '{self.metadata.name}' to a job because it has child functions"
             )

-        spec = KubeResourceSpec(
+        spec = pod_runtime.KubeResourceSpec(
             image=self.spec.image,
             mode=self.spec.mode,
             volumes=self.spec.volumes,
@@ -870,8 +918,50 @@ class ServingRuntime(RemoteRuntime):
             parameters=self.spec.parameters,
             graph=self.spec.graph,
         )
-
+
+        job_metadata = deepcopy(self.metadata)
+        original_name = job_metadata.name
+
+        if func_name:
+            # User provided explicit job name
+            job_metadata.name = func_name
+            logger.debug(
+                "Creating job from serving function with custom name",
+                new_name=func_name,
+            )
+        else:
+            job_metadata.name, was_renamed, suffix = (
+                mlrun.utils.helpers.ensure_batch_job_suffix(job_metadata.name)
+            )
+
+            # Check if the resulting name exceeds Kubernetes length limit
+            if (
+                len(job_metadata.name)
+                > mlrun.common.constants.K8S_DNS_1123_LABEL_MAX_LENGTH
+            ):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"Cannot convert serving function '{original_name}' to batch job: "
+                    f"the resulting name '{job_metadata.name}' ({len(job_metadata.name)} characters) "
+                    f"exceeds Kubernetes limit of {mlrun.common.constants.K8S_DNS_1123_LABEL_MAX_LENGTH} characters. "
+                    f"Please provide a custom name via the func_name parameter, "
+                    f"with at most {mlrun.common.constants.K8S_DNS_1123_LABEL_MAX_LENGTH} characters."
+                )
+
+            if was_renamed:
+                logger.info(
+                    "Creating job from serving function (auto-appended suffix to prevent collision)",
+                    new_name=job_metadata.name,
+                    suffix=suffix,
+                )
+            else:
+                logger.debug(
+                    "Creating job from serving function (name already has suffix)",
+                    name=original_name,
+                    suffix=suffix,
+                )
+
+        job = kubejob_runtime.KubejobRuntime(
             spec=spec,
-            metadata=
+            metadata=job_metadata,
         )
         return job
mlrun/runtimes/pod.py
CHANGED
@@ -17,14 +17,17 @@ import os
 import re
 import time
 import typing
+import warnings
 from collections.abc import Iterable
 from enum import Enum
+from typing import Optional

 import dotenv
 import kubernetes.client as k8s_client
 from kubernetes.client import V1Volume, V1VolumeMount

 import mlrun.common.constants
+import mlrun.common.secrets
 import mlrun.errors
 import mlrun.runtimes.mounts
 import mlrun.utils.regex
@@ -35,6 +38,7 @@ from mlrun.common.schemas import (

 from ..config import config as mlconf
 from ..k8s_utils import (
+    generate_preemptible_nodes_affinity_terms,
     validate_node_selectors,
 )
 from ..utils import logger, update_in
@@ -107,6 +111,7 @@ class KubeResourceSpec(FunctionSpec):
         "track_models",
         "parameters",
         "graph",
+        "filename",
     ]
     _default_fields_to_strip = FunctionSpec._default_fields_to_strip + [
         "volumes",
@@ -705,19 +710,45 @@ class KubeResource(BaseRuntime):
     def spec(self, spec):
         self._spec = self._verify_dict(spec, "spec", KubeResourceSpec)

-    def set_env_from_secret(
-
-
+    def set_env_from_secret(
+        self,
+        name: str,
+        secret: Optional[str] = None,
+        secret_key: Optional[str] = None,
+    ):
+        """
+        Set an environment variable from a Kubernetes Secret.
+        Client-side guard forbids MLRun internal auth/project secrets; no-op on API.
+        """
+        mlrun.common.secrets.validate_not_forbidden_secret(secret)
+        key = secret_key or name
         value_from = k8s_client.V1EnvVarSource(
-            secret_key_ref=k8s_client.V1SecretKeySelector(name=secret, key=
+            secret_key_ref=k8s_client.V1SecretKeySelector(name=secret, key=key)
         )
-        return self._set_env(name, value_from=value_from)
+        return self._set_env(name=name, value_from=value_from)
+
+    def set_env(
+        self,
+        name: str,
+        value: Optional[str] = None,
+        value_from: Optional[typing.Any] = None,
+    ):
+        """
+        Set an environment variable.
+        If value comes from a Secret, validate on client-side only.
+        """
+        if value_from is not None:
+            secret_name = self._extract_secret_name_from_value_from(
+                value_from=value_from
+            )
+            if secret_name:
+                mlrun.common.secrets.validate_not_forbidden_secret(secret_name)
+            return self._set_env(name=name, value_from=value_from)

-
-
-
-
-        return self._set_env(name, value_from=value_from)
+        # Plain literal value path
+        return self._set_env(
+            name=name, value=(str(value) if value is not None else None)
+        )

     def with_annotations(self, annotations: dict):
         """set a key/value annotations in the metadata of the pod"""
@@ -874,6 +905,133 @@ class KubeResource(BaseRuntime):
         """
         self.spec.with_requests(mem, cpu, patch=patch)

+    @staticmethod
+    def detect_preemptible_node_selector(node_selector: dict[str, str]) -> list[str]:
+        """
+        Check whether any provided node selector matches preemptible selectors.
+
+        :param node_selector: User-provided node selector mapping.
+        :return: List of `"key='value'"` strings that match a preemptible selector.
+        """
+        preemptible_node_selector = mlconf.get_preemptible_node_selector()
+
+        return [
+            f"'{key}': '{val}'"
+            for key, val in node_selector.items()
+            if preemptible_node_selector.get(key) == val
+        ]
+
+    def detect_preemptible_tolerations(
+        self, tolerations: list[k8s_client.V1Toleration]
+    ) -> list[str]:
+        """
+        Check whether any provided toleration matches preemptible tolerations.
+
+        :param tolerations: User-provided tolerations.
+        :return: List of formatted toleration strings that are considered preemptible.
+        """
+        preemptible_tolerations = [
+            k8s_client.V1Toleration(
+                key=toleration.get("key"),
+                value=toleration.get("value"),
+                effect=toleration.get("effect"),
+            )
+            for toleration in mlconf.get_preemptible_tolerations()
+        ]
+
+        def _format_toleration(toleration):
+            return f"'{toleration.key}'='{toleration.value}' (effect: '{toleration.effect}')"
+
+        return [
+            _format_toleration(toleration)
+            for toleration in tolerations
+            if toleration in preemptible_tolerations
+        ]
+
+    def detect_preemptible_affinity(self, affinity: k8s_client.V1Affinity) -> list[str]:
+        """
+        Check whether any provided affinity rules match preemptible affinity configs.
+
+        :param affinity: User-provided affinity object.
+        :return: List of formatted expressions that overlap with preemptible terms.
+        """
+        preemptible_affinity_terms = generate_preemptible_nodes_affinity_terms()
+        conflicting_affinities = []
+
+        if (
+            affinity
+            and affinity.node_affinity
+            and affinity.node_affinity.required_during_scheduling_ignored_during_execution
+        ):
+            user_terms = affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms
+            for user_term in user_terms:
+                user_expressions = {
+                    (expr.key, expr.operator, tuple(expr.values or []))
+                    for expr in user_term.match_expressions or []
+                }
+
+                for preemptible_term in preemptible_affinity_terms:
+                    preemptible_expressions = {
+                        (expr.key, expr.operator, tuple(expr.values or []))
+                        for expr in preemptible_term.match_expressions or []
+                    }
+
+                    # Ensure operators match and preemptible expressions are present
+                    common_exprs = user_expressions & preemptible_expressions
+                    if common_exprs:
+                        formatted = ", ".join(
+                            f"'{key} {operator} {list(values)}'"
+                            for key, operator, values in common_exprs
+                        )
+                        conflicting_affinities.append(formatted)
+        return conflicting_affinities
+
+    def raise_preemptible_warning(
+        self,
+        node_selector: typing.Optional[dict[str, str]],
+        tolerations: typing.Optional[list[k8s_client.V1Toleration]],
+        affinity: typing.Optional[k8s_client.V1Affinity],
+    ) -> None:
+        """
+        Detect conflicts and emit a single consolidated warning if needed.
+
+        :param node_selector: User-provided node selector.
+        :param tolerations:   User-provided tolerations.
+        :param affinity:      User-provided affinity.
+        :warns: PreemptionWarning - Emitted when any of the provided selectors,
+            tolerations, or affinity terms match the configured preemptible
+            settings. The message lists the conflicting items.
+        """
+        conflict_messages = []
+
+        if node_selector:
+            ns_conflicts = ", ".join(
+                self.detect_preemptible_node_selector(node_selector)
+            )
+            if ns_conflicts:
+                conflict_messages.append(f"Node selectors: {ns_conflicts}")
+
+        if tolerations:
+            tol_conflicts = ", ".join(self.detect_preemptible_tolerations(tolerations))
+            if tol_conflicts:
+                conflict_messages.append(f"Tolerations: {tol_conflicts}")
+
+        if affinity:
+            affinity_conflicts = ", ".join(self.detect_preemptible_affinity(affinity))
+            if affinity_conflicts:
+                conflict_messages.append(f"Affinity: {affinity_conflicts}")
+
+        if conflict_messages:
+            warning_componentes = "; \n".join(conflict_messages)
+            warnings.warn(
+                f"Warning: based on MLRun's preemptible node configuration, the following components \n"
+                f"may be removed or adjusted at runtime:\n"
+                f"{warning_componentes}.\n"
+                "This adjustment depends on the function's preemption mode. \n"
+                "The list of potential adjusted preemptible selectors can be viewed here: "
+                "mlrun.mlconf.get_preemptible_node_selector() and mlrun.mlconf.get_preemptible_tolerations()."
+            )
+
     def with_node_selection(
         self,
         node_name: typing.Optional[str] = None,
@@ -882,18 +1040,26 @@ class KubeResource(BaseRuntime):
         tolerations: typing.Optional[list[k8s_client.V1Toleration]] = None,
     ):
         """
-
-
-
-
-
-
-
-        :param
-
-
-
+        Configure Kubernetes node scheduling for this function.
+
+        Updates one or more scheduling hints: exact node pinning, label-based selection,
+        affinity/anti-affinity rules, and taint tolerations. Passing ``None`` leaves the
+        current value unchanged; pass an empty dict/list (e.g., ``{}``, ``[]``) to clear.
+
+        :param node_name:     Exact Kubernetes node name to pin the pod to.
+        :param node_selector: Mapping of label selectors. Use ``{}`` to clear.
+        :param affinity:      :class:`kubernetes.client.V1Affinity` constraints.
+        :param tolerations:   List of :class:`kubernetes.client.V1Toleration`. Use ``[]`` to clear.
+        :warns: PreemptionWarning - Emitted if provided selectors/tolerations/affinity
+            conflict with the function's preemption mode.
+
+        Example usage:
+            Prefer a GPU pool and allow scheduling on spot nodes::

+                job.with_node_selection(
+                    node_selector={"nodepool": "gpu"},
+                    tolerations=[k8s_client.V1Toleration(key="spot", operator="Exists")],
+                )
         """
         if node_name:
             self.spec.node_name = node_name
@@ -904,6 +1070,11 @@ class KubeResource(BaseRuntime):
             self.spec.affinity = affinity
         if tolerations is not None:
             self.spec.tolerations = tolerations
+        self.raise_preemptible_warning(
+            node_selector=self.spec.node_selector,
+            tolerations=self.spec.tolerations,
+            affinity=self.spec.affinity,
+        )

     def with_priority_class(self, name: typing.Optional[str] = None):
         """
@@ -1223,6 +1394,27 @@ class KubeResource(BaseRuntime):

         return self.status.state

+    @staticmethod
+    def _extract_secret_name_from_value_from(
+        value_from: typing.Any,
+    ) -> Optional[str]:
+        """Extract secret name from a V1EnvVarSource or dict representation."""
+        if isinstance(value_from, k8s_client.V1EnvVarSource):
+            if value_from.secret_key_ref:
+                return value_from.secret_key_ref.name
+        elif isinstance(value_from, dict):
+            value_from = (
+                value_from.get("valueFrom")
+                or value_from.get("value_from")
+                or value_from
+            )
+            secret_key_ref = (value_from or {}).get("secretKeyRef") or (
+                value_from or {}
+            ).get("secret_key_ref")
+            if isinstance(secret_key_ref, dict):
+                return secret_key_ref.get("name")
+        return None
+

 def _resolve_if_type_sanitized(attribute_name, attribute):
     attribute_config = sanitized_attributes[attribute_name]