mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__main__.py +4 -2
- mlrun/alerts/alert.py +75 -8
- mlrun/artifacts/base.py +1 -0
- mlrun/artifacts/manager.py +9 -2
- mlrun/common/constants.py +4 -1
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
- mlrun/common/formatters/run.py +3 -0
- mlrun/common/helpers.py +0 -1
- mlrun/common/schemas/__init__.py +3 -1
- mlrun/common/schemas/alert.py +15 -12
- mlrun/common/schemas/api_gateway.py +6 -6
- mlrun/common/schemas/auth.py +5 -0
- mlrun/common/schemas/client_spec.py +0 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/frontend_spec.py +7 -0
- mlrun/common/schemas/function.py +7 -0
- mlrun/common/schemas/model_monitoring/__init__.py +4 -3
- mlrun/common/schemas/model_monitoring/constants.py +41 -26
- mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
- mlrun/common/schemas/notification.py +69 -12
- mlrun/common/schemas/project.py +45 -12
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +1 -0
- mlrun/config.py +91 -35
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +57 -25
- mlrun/datastore/__init__.py +1 -0
- mlrun/datastore/alibaba_oss.py +3 -2
- mlrun/datastore/azure_blob.py +125 -37
- mlrun/datastore/base.py +42 -21
- mlrun/datastore/datastore.py +4 -2
- mlrun/datastore/datastore_profile.py +1 -1
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +85 -29
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +1 -0
- mlrun/datastore/s3.py +25 -12
- mlrun/datastore/sources.py +76 -4
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +102 -131
- mlrun/datastore/v3io.py +1 -0
- mlrun/db/base.py +15 -6
- mlrun/db/httpdb.py +57 -28
- mlrun/db/nopdb.py +29 -5
- mlrun/errors.py +20 -3
- mlrun/execution.py +46 -5
- mlrun/feature_store/api.py +25 -1
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_vector.py +3 -1
- mlrun/feature_store/retrieval/job.py +4 -1
- mlrun/feature_store/retrieval/spark_merger.py +10 -39
- mlrun/feature_store/steps.py +8 -0
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +2 -3
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/k8s_utils.py +48 -2
- mlrun/launcher/client.py +6 -6
- mlrun/launcher/local.py +2 -2
- mlrun/model.py +215 -34
- mlrun/model_monitoring/api.py +38 -24
- mlrun/model_monitoring/applications/__init__.py +1 -2
- mlrun/model_monitoring/applications/_application_steps.py +60 -29
- mlrun/model_monitoring/applications/base.py +2 -174
- mlrun/model_monitoring/applications/context.py +197 -70
- mlrun/model_monitoring/applications/evidently_base.py +11 -85
- mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
- mlrun/model_monitoring/applications/results.py +4 -4
- mlrun/model_monitoring/controller.py +110 -282
- mlrun/model_monitoring/db/stores/__init__.py +8 -3
- mlrun/model_monitoring/db/stores/base/store.py +3 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
- mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
- mlrun/model_monitoring/db/tsdb/base.py +147 -15
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
- mlrun/model_monitoring/helpers.py +70 -50
- mlrun/model_monitoring/stream_processing.py +96 -195
- mlrun/model_monitoring/writer.py +13 -5
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/projects/operations.py +16 -8
- mlrun/projects/pipelines.py +126 -115
- mlrun/projects/project.py +286 -129
- mlrun/render.py +3 -3
- mlrun/run.py +38 -19
- mlrun/runtimes/__init__.py +19 -8
- mlrun/runtimes/base.py +4 -1
- mlrun/runtimes/daskjob.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -1
- mlrun/runtimes/kubejob.py +6 -6
- mlrun/runtimes/local.py +12 -5
- mlrun/runtimes/nuclio/api_gateway.py +68 -8
- mlrun/runtimes/nuclio/application/application.py +307 -70
- mlrun/runtimes/nuclio/function.py +63 -14
- mlrun/runtimes/nuclio/serving.py +10 -10
- mlrun/runtimes/pod.py +25 -19
- mlrun/runtimes/remotesparkjob.py +2 -5
- mlrun/runtimes/sparkjob/spark3job.py +16 -17
- mlrun/runtimes/utils.py +34 -0
- mlrun/serving/routers.py +2 -5
- mlrun/serving/server.py +37 -19
- mlrun/serving/states.py +30 -3
- mlrun/serving/v2_serving.py +44 -35
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +1 -1
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +150 -36
- mlrun/utils/http.py +1 -1
- mlrun/utils/notifications/notification/__init__.py +0 -1
- mlrun/utils/notifications/notification/webhook.py +8 -1
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/v3io_clients.py +2 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/evidently_application.py +0 -20
- mlrun/model_monitoring/prometheus.py +0 -216
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
|
@@ -23,6 +23,7 @@ import inflection
|
|
|
23
23
|
import nuclio
|
|
24
24
|
import nuclio.utils
|
|
25
25
|
import requests
|
|
26
|
+
import semver
|
|
26
27
|
from aiohttp.client import ClientSession
|
|
27
28
|
from kubernetes import client
|
|
28
29
|
from mlrun_pipelines.common.mounts import VolumeMount
|
|
@@ -296,10 +297,37 @@ class RemoteRuntime(KubeResource):
|
|
|
296
297
|
"""
|
|
297
298
|
if hasattr(spec, "to_dict"):
|
|
298
299
|
spec = spec.to_dict()
|
|
300
|
+
|
|
301
|
+
self._validate_triggers(spec)
|
|
302
|
+
|
|
299
303
|
spec["name"] = name
|
|
300
304
|
self.spec.config[f"spec.triggers.{name}"] = spec
|
|
301
305
|
return self
|
|
302
306
|
|
|
307
|
+
def _validate_triggers(self, spec):
|
|
308
|
+
# ML-7763 / NUC-233
|
|
309
|
+
min_nuclio_version = "1.13.12"
|
|
310
|
+
if mlconf.nuclio_version and semver.VersionInfo.parse(
|
|
311
|
+
mlconf.nuclio_version
|
|
312
|
+
) < semver.VersionInfo.parse(min_nuclio_version):
|
|
313
|
+
explicit_ack_enabled = False
|
|
314
|
+
num_triggers = 0
|
|
315
|
+
trigger_name = spec.get("name", "UNKNOWN")
|
|
316
|
+
for key, config in [(f"spec.triggers.{trigger_name}", spec)] + list(
|
|
317
|
+
self.spec.config.items()
|
|
318
|
+
):
|
|
319
|
+
if key.startswith("spec.triggers."):
|
|
320
|
+
num_triggers += 1
|
|
321
|
+
explicit_ack_enabled = (
|
|
322
|
+
config.get("explicitAckMode", "disable") != "disable"
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
if num_triggers > 1 and explicit_ack_enabled:
|
|
326
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
327
|
+
"Multiple triggers cannot be used in conjunction with explicit ack. "
|
|
328
|
+
f"Please upgrade to nuclio {min_nuclio_version} or newer."
|
|
329
|
+
)
|
|
330
|
+
|
|
303
331
|
def with_source_archive(
|
|
304
332
|
self,
|
|
305
333
|
source,
|
|
@@ -418,14 +446,8 @@ class RemoteRuntime(KubeResource):
|
|
|
418
446
|
raise ValueError(
|
|
419
447
|
"gateway timeout must be greater than the worker timeout"
|
|
420
448
|
)
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
)
|
|
424
|
-
annotations["nginx.ingress.kubernetes.io/proxy-read-timeout"] = (
|
|
425
|
-
f"{gateway_timeout}"
|
|
426
|
-
)
|
|
427
|
-
annotations["nginx.ingress.kubernetes.io/proxy-send-timeout"] = (
|
|
428
|
-
f"{gateway_timeout}"
|
|
449
|
+
mlrun.runtimes.utils.enrich_gateway_timeout_annotations(
|
|
450
|
+
annotations, gateway_timeout
|
|
429
451
|
)
|
|
430
452
|
|
|
431
453
|
trigger = nuclio.HttpTrigger(
|
|
@@ -446,6 +468,11 @@ class RemoteRuntime(KubeResource):
|
|
|
446
468
|
return self
|
|
447
469
|
|
|
448
470
|
def from_image(self, image):
|
|
471
|
+
"""
|
|
472
|
+
Deploy the function with an existing nuclio processor image.
|
|
473
|
+
|
|
474
|
+
:param image: image name
|
|
475
|
+
"""
|
|
449
476
|
config = nuclio.config.new_config()
|
|
450
477
|
update_in(
|
|
451
478
|
config,
|
|
@@ -496,6 +523,11 @@ class RemoteRuntime(KubeResource):
|
|
|
496
523
|
extra_attributes = extra_attributes or {}
|
|
497
524
|
if ack_window_size:
|
|
498
525
|
extra_attributes["ackWindowSize"] = ack_window_size
|
|
526
|
+
|
|
527
|
+
access_key = kwargs.pop("access_key", None)
|
|
528
|
+
if not access_key:
|
|
529
|
+
access_key = self._resolve_v3io_access_key()
|
|
530
|
+
|
|
499
531
|
self.add_trigger(
|
|
500
532
|
name,
|
|
501
533
|
V3IOStreamTrigger(
|
|
@@ -507,11 +539,14 @@ class RemoteRuntime(KubeResource):
|
|
|
507
539
|
webapi=endpoint or "http://v3io-webapi:8081",
|
|
508
540
|
extra_attributes=extra_attributes,
|
|
509
541
|
read_batch_size=256,
|
|
542
|
+
access_key=access_key,
|
|
510
543
|
**kwargs,
|
|
511
544
|
),
|
|
512
545
|
)
|
|
513
|
-
self.spec.min_replicas
|
|
514
|
-
|
|
546
|
+
if self.spec.min_replicas != shards or self.spec.max_replicas != shards:
|
|
547
|
+
logger.warning(f"Setting function replicas to {shards}")
|
|
548
|
+
self.spec.min_replicas = shards
|
|
549
|
+
self.spec.max_replicas = shards
|
|
515
550
|
|
|
516
551
|
def deploy(
|
|
517
552
|
self,
|
|
@@ -566,6 +601,9 @@ class RemoteRuntime(KubeResource):
|
|
|
566
601
|
# this also means that the function object will be updated with the function status
|
|
567
602
|
self._wait_for_function_deployment(db, verbose=verbose)
|
|
568
603
|
|
|
604
|
+
return self._enrich_command_from_status()
|
|
605
|
+
|
|
606
|
+
def _enrich_command_from_status(self):
|
|
569
607
|
# NOTE: on older mlrun versions & nuclio versions, function are exposed via NodePort
|
|
570
608
|
# now, functions can be not exposed (using service type ClusterIP) and hence
|
|
571
609
|
# for BC we first try to populate the external invocation url, and then
|
|
@@ -679,7 +717,7 @@ class RemoteRuntime(KubeResource):
|
|
|
679
717
|
"State thresholds do not apply for nuclio as it has its own function pods healthiness monitoring"
|
|
680
718
|
)
|
|
681
719
|
|
|
682
|
-
@min_nuclio_versions("1.
|
|
720
|
+
@min_nuclio_versions("1.13.1")
|
|
683
721
|
def disable_default_http_trigger(
|
|
684
722
|
self,
|
|
685
723
|
):
|
|
@@ -688,7 +726,7 @@ class RemoteRuntime(KubeResource):
|
|
|
688
726
|
"""
|
|
689
727
|
self.spec.disable_default_http_trigger = True
|
|
690
728
|
|
|
691
|
-
@min_nuclio_versions("1.
|
|
729
|
+
@min_nuclio_versions("1.13.1")
|
|
692
730
|
def enable_default_http_trigger(
|
|
693
731
|
self,
|
|
694
732
|
):
|
|
@@ -697,6 +735,10 @@ class RemoteRuntime(KubeResource):
|
|
|
697
735
|
"""
|
|
698
736
|
self.spec.disable_default_http_trigger = False
|
|
699
737
|
|
|
738
|
+
def skip_image_enrichment(self):
|
|
739
|
+
# make sure the API does not enrich the base image if the function is not a python function
|
|
740
|
+
return self.spec.nuclio_runtime and "python" not in self.spec.nuclio_runtime
|
|
741
|
+
|
|
700
742
|
def _get_state(
|
|
701
743
|
self,
|
|
702
744
|
dashboard="",
|
|
@@ -739,7 +781,7 @@ class RemoteRuntime(KubeResource):
|
|
|
739
781
|
return state, text, last_log_timestamp
|
|
740
782
|
|
|
741
783
|
try:
|
|
742
|
-
text, last_log_timestamp = self._get_db().
|
|
784
|
+
text, last_log_timestamp = self._get_db().get_nuclio_deploy_status(
|
|
743
785
|
self, last_log_timestamp=last_log_timestamp, verbose=verbose
|
|
744
786
|
)
|
|
745
787
|
except mlrun.db.RunDBError:
|
|
@@ -990,7 +1032,7 @@ class RemoteRuntime(KubeResource):
|
|
|
990
1032
|
if command and not command.startswith("http"):
|
|
991
1033
|
sidecar["command"] = mlrun.utils.helpers.as_list(command)
|
|
992
1034
|
|
|
993
|
-
if args and sidecar
|
|
1035
|
+
if args and sidecar.get("command"):
|
|
994
1036
|
sidecar["args"] = mlrun.utils.helpers.as_list(args)
|
|
995
1037
|
|
|
996
1038
|
# populate the sidecar resources from the function spec
|
|
@@ -1233,6 +1275,13 @@ class RemoteRuntime(KubeResource):
|
|
|
1233
1275
|
|
|
1234
1276
|
return self._resolve_invocation_url("", force_external_address)
|
|
1235
1277
|
|
|
1278
|
+
@staticmethod
|
|
1279
|
+
def _resolve_v3io_access_key():
|
|
1280
|
+
# Nuclio supports generating access key for v3io stream trigger only from version 1.13.11
|
|
1281
|
+
if validate_nuclio_version_compatibility("1.13.11"):
|
|
1282
|
+
return mlrun.model.Credentials.generate_access_key
|
|
1283
|
+
return None
|
|
1284
|
+
|
|
1236
1285
|
|
|
1237
1286
|
def parse_logs(logs):
|
|
1238
1287
|
logs = json.loads(logs)
|
mlrun/runtimes/nuclio/serving.py
CHANGED
|
@@ -314,8 +314,8 @@ class ServingRuntime(RemoteRuntime):
|
|
|
314
314
|
tracking_policy: Optional[Union["TrackingPolicy", dict]] = None,
|
|
315
315
|
enable_tracking: bool = True,
|
|
316
316
|
) -> None:
|
|
317
|
-
"""
|
|
318
|
-
|
|
317
|
+
"""Apply on your serving function to monitor a deployed model, including real-time dashboards to detect drift
|
|
318
|
+
and analyze performance.
|
|
319
319
|
|
|
320
320
|
:param stream_path: Path/url of the tracking stream e.g. v3io:///users/mike/mystream
|
|
321
321
|
you can use the "dummy://" path for test/simulation.
|
|
@@ -325,12 +325,12 @@ class ServingRuntime(RemoteRuntime):
|
|
|
325
325
|
:param enable_tracking: Enabled/Disable model-monitoring tracking.
|
|
326
326
|
Default True (tracking enabled).
|
|
327
327
|
|
|
328
|
-
|
|
328
|
+
Example::
|
|
329
329
|
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
330
|
+
# initialize a new serving function
|
|
331
|
+
serving_fn = mlrun.import_function("hub://v2-model-server", new_name="serving")
|
|
332
|
+
# apply model monitoring
|
|
333
|
+
serving_fn.set_tracking()
|
|
334
334
|
|
|
335
335
|
"""
|
|
336
336
|
# Applying model monitoring configurations
|
|
@@ -480,7 +480,7 @@ class ServingRuntime(RemoteRuntime):
|
|
|
480
480
|
trigger_args = stream.trigger_args or {}
|
|
481
481
|
|
|
482
482
|
engine = self.spec.graph.engine or "async"
|
|
483
|
-
if mlrun.mlconf.
|
|
483
|
+
if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
|
|
484
484
|
trigger_args["explicit_ack_mode"] = trigger_args.get(
|
|
485
485
|
"explicit_ack_mode", "explicitOnly"
|
|
486
486
|
)
|
|
@@ -676,7 +676,6 @@ class ServingRuntime(RemoteRuntime):
|
|
|
676
676
|
"""create mock server object for local testing/emulation
|
|
677
677
|
|
|
678
678
|
:param namespace: one or list of namespaces/modules to search the steps classes/functions in
|
|
679
|
-
:param log_level: log level (error | info | debug)
|
|
680
679
|
:param current_function: specify if you want to simulate a child function, * for all functions
|
|
681
680
|
:param track_models: allow model tracking (disabled by default in the mock server)
|
|
682
681
|
:param workdir: working directory to locate the source code (if not the current one)
|
|
@@ -704,7 +703,7 @@ class ServingRuntime(RemoteRuntime):
|
|
|
704
703
|
verbose=self.verbose,
|
|
705
704
|
current_function=current_function,
|
|
706
705
|
graph_initializer=self.spec.graph_initializer,
|
|
707
|
-
track_models=
|
|
706
|
+
track_models=self.spec.track_models,
|
|
708
707
|
function_uri=self._function_uri(),
|
|
709
708
|
secret_sources=self.spec.secret_sources,
|
|
710
709
|
default_content_type=self.spec.default_content_type,
|
|
@@ -715,6 +714,7 @@ class ServingRuntime(RemoteRuntime):
|
|
|
715
714
|
namespace=namespace,
|
|
716
715
|
logger=logger,
|
|
717
716
|
is_mock=True,
|
|
717
|
+
monitoring_mock=track_models,
|
|
718
718
|
)
|
|
719
719
|
|
|
720
720
|
if workdir:
|
mlrun/runtimes/pod.py
CHANGED
|
@@ -38,6 +38,7 @@ from ..k8s_utils import (
|
|
|
38
38
|
generate_preemptible_nodes_affinity_terms,
|
|
39
39
|
generate_preemptible_nodes_anti_affinity_terms,
|
|
40
40
|
generate_preemptible_tolerations,
|
|
41
|
+
validate_node_selectors,
|
|
41
42
|
)
|
|
42
43
|
from ..utils import logger, update_in
|
|
43
44
|
from .base import BaseRuntime, FunctionSpec, spec_fields
|
|
@@ -215,9 +216,7 @@ class KubeResourceSpec(FunctionSpec):
|
|
|
215
216
|
image_pull_secret or mlrun.mlconf.function.spec.image_pull_secret.default
|
|
216
217
|
)
|
|
217
218
|
self.node_name = node_name
|
|
218
|
-
self.node_selector =
|
|
219
|
-
node_selector or mlrun.mlconf.get_default_function_node_selector()
|
|
220
|
-
)
|
|
219
|
+
self.node_selector = node_selector or {}
|
|
221
220
|
self._affinity = affinity
|
|
222
221
|
self.priority_class_name = (
|
|
223
222
|
priority_class_name or mlrun.mlconf.default_function_priority_class_name
|
|
@@ -532,7 +531,7 @@ class KubeResourceSpec(FunctionSpec):
|
|
|
532
531
|
return
|
|
533
532
|
|
|
534
533
|
# merge node selectors - precedence to existing node selector
|
|
535
|
-
self.node_selector = mlrun.utils.helpers.
|
|
534
|
+
self.node_selector = mlrun.utils.helpers.merge_dicts_with_precedence(
|
|
536
535
|
node_selector, self.node_selector
|
|
537
536
|
)
|
|
538
537
|
|
|
@@ -1108,12 +1107,12 @@ class KubeResource(BaseRuntime, KfpAdapterMixin):
|
|
|
1108
1107
|
|
|
1109
1108
|
:param state_thresholds: A dictionary of state to threshold. The supported states are:
|
|
1110
1109
|
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
See mlrun.mlconf.function.spec.state_thresholds for the default thresholds.
|
|
1110
|
+
* pending_scheduled - The pod/crd is scheduled on a node but not yet running
|
|
1111
|
+
* pending_not_scheduled - The pod/crd is not yet scheduled on a node
|
|
1112
|
+
* executing - The pod/crd started and is running
|
|
1113
|
+
* image_pull_backoff - The pod/crd is in image pull backoff
|
|
1116
1114
|
|
|
1115
|
+
See :code:`mlrun.mlconf.function.spec.state_thresholds` for the default thresholds.
|
|
1117
1116
|
:param patch: Whether to merge the given thresholds with the existing thresholds (True, default)
|
|
1118
1117
|
or override them (False)
|
|
1119
1118
|
"""
|
|
@@ -1176,9 +1175,10 @@ class KubeResource(BaseRuntime, KfpAdapterMixin):
|
|
|
1176
1175
|
"""
|
|
1177
1176
|
if node_name:
|
|
1178
1177
|
self.spec.node_name = node_name
|
|
1179
|
-
if node_selector:
|
|
1178
|
+
if node_selector is not None:
|
|
1179
|
+
validate_node_selectors(node_selectors=node_selector, raise_on_error=False)
|
|
1180
1180
|
self.spec.node_selector = node_selector
|
|
1181
|
-
if affinity:
|
|
1181
|
+
if affinity is not None:
|
|
1182
1182
|
self.spec.affinity = affinity
|
|
1183
1183
|
if tolerations is not None:
|
|
1184
1184
|
self.spec.tolerations = tolerations
|
|
@@ -1347,20 +1347,26 @@ class KubeResource(BaseRuntime, KfpAdapterMixin):
|
|
|
1347
1347
|
|
|
1348
1348
|
def _build_image(
|
|
1349
1349
|
self,
|
|
1350
|
-
builder_env,
|
|
1351
|
-
force_build,
|
|
1352
|
-
mlrun_version_specifier,
|
|
1353
|
-
show_on_failure,
|
|
1354
|
-
skip_deployed,
|
|
1355
|
-
watch,
|
|
1356
|
-
is_kfp,
|
|
1357
|
-
with_mlrun,
|
|
1350
|
+
builder_env: dict,
|
|
1351
|
+
force_build: bool,
|
|
1352
|
+
mlrun_version_specifier: typing.Optional[bool],
|
|
1353
|
+
show_on_failure: bool,
|
|
1354
|
+
skip_deployed: bool,
|
|
1355
|
+
watch: bool,
|
|
1356
|
+
is_kfp: bool,
|
|
1357
|
+
with_mlrun: typing.Optional[bool],
|
|
1358
1358
|
):
|
|
1359
1359
|
# When we're in pipelines context we must watch otherwise the pipelines pod will exit before the operation
|
|
1360
1360
|
# is actually done. (when a pipelines pod exits, the pipeline step marked as done)
|
|
1361
1361
|
if is_kfp:
|
|
1362
1362
|
watch = True
|
|
1363
1363
|
|
|
1364
|
+
if skip_deployed and self.requires_build() and not self.is_deployed():
|
|
1365
|
+
logger.warning(
|
|
1366
|
+
f"Even though {skip_deployed=}, the build might be triggered due to the function's configuration. "
|
|
1367
|
+
"See requires_build() and is_deployed() for reasoning."
|
|
1368
|
+
)
|
|
1369
|
+
|
|
1364
1370
|
db = self._get_db()
|
|
1365
1371
|
data = db.remote_builder(
|
|
1366
1372
|
self,
|
mlrun/runtimes/remotesparkjob.py
CHANGED
|
@@ -102,16 +102,13 @@ class RemoteSparkRuntime(KubejobRuntime):
|
|
|
102
102
|
|
|
103
103
|
@classmethod
|
|
104
104
|
def deploy_default_image(cls):
|
|
105
|
-
|
|
106
|
-
from mlrun.run import new_function
|
|
107
|
-
|
|
108
|
-
sj = new_function(
|
|
105
|
+
sj = mlrun.new_function(
|
|
109
106
|
kind="remote-spark", name="remote-spark-default-image-deploy-temp"
|
|
110
107
|
)
|
|
111
108
|
sj.spec.build.image = cls.default_image
|
|
112
109
|
sj.with_spark_service(spark_service="dummy-spark")
|
|
113
110
|
sj.deploy()
|
|
114
|
-
get_run_db().delete_function(name=sj.metadata.name)
|
|
111
|
+
mlrun.get_run_db().delete_function(name=sj.metadata.name)
|
|
115
112
|
|
|
116
113
|
def is_deployed(self):
|
|
117
114
|
if (
|
|
@@ -18,6 +18,7 @@ from mlrun_pipelines.mounts import mount_v3io, mount_v3iod
|
|
|
18
18
|
|
|
19
19
|
import mlrun.common.schemas.function
|
|
20
20
|
import mlrun.errors
|
|
21
|
+
import mlrun.k8s_utils
|
|
21
22
|
import mlrun.runtimes.pod
|
|
22
23
|
from mlrun.config import config
|
|
23
24
|
|
|
@@ -451,7 +452,7 @@ class Spark3JobSpec(KubeResourceSpec):
|
|
|
451
452
|
class Spark3Runtime(KubejobRuntime):
|
|
452
453
|
group = "sparkoperator.k8s.io"
|
|
453
454
|
version = "v1beta2"
|
|
454
|
-
apiVersion = group + "/" + version
|
|
455
|
+
apiVersion = group + "/" + version # noqa: N815
|
|
455
456
|
kind = "spark"
|
|
456
457
|
plural = "sparkapplications"
|
|
457
458
|
|
|
@@ -505,13 +506,11 @@ class Spark3Runtime(KubejobRuntime):
|
|
|
505
506
|
raise NotImplementedError(
|
|
506
507
|
"Setting node name is not supported for spark runtime"
|
|
507
508
|
)
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
)
|
|
514
|
-
super().with_node_selection(node_name, node_selector, affinity, tolerations)
|
|
509
|
+
mlrun.k8s_utils.validate_node_selectors(node_selector, raise_on_error=False)
|
|
510
|
+
self.with_driver_node_selection(node_name, node_selector, affinity, tolerations)
|
|
511
|
+
self.with_executor_node_selection(
|
|
512
|
+
node_name, node_selector, affinity, tolerations
|
|
513
|
+
)
|
|
515
514
|
|
|
516
515
|
def with_driver_node_selection(
|
|
517
516
|
self,
|
|
@@ -537,11 +536,12 @@ class Spark3Runtime(KubejobRuntime):
|
|
|
537
536
|
raise NotImplementedError(
|
|
538
537
|
"Setting node name is not supported for spark runtime"
|
|
539
538
|
)
|
|
540
|
-
if affinity:
|
|
539
|
+
if affinity is not None:
|
|
541
540
|
self.spec.driver_affinity = affinity
|
|
542
|
-
if node_selector:
|
|
541
|
+
if node_selector is not None:
|
|
542
|
+
mlrun.k8s_utils.validate_node_selectors(node_selector, raise_on_error=False)
|
|
543
543
|
self.spec.driver_node_selector = node_selector
|
|
544
|
-
if tolerations:
|
|
544
|
+
if tolerations is not None:
|
|
545
545
|
self.spec.driver_tolerations = tolerations
|
|
546
546
|
|
|
547
547
|
def with_executor_node_selection(
|
|
@@ -568,11 +568,12 @@ class Spark3Runtime(KubejobRuntime):
|
|
|
568
568
|
raise NotImplementedError(
|
|
569
569
|
"Setting node name is not supported for spark runtime"
|
|
570
570
|
)
|
|
571
|
-
if affinity:
|
|
571
|
+
if affinity is not None:
|
|
572
572
|
self.spec.executor_affinity = affinity
|
|
573
|
-
if node_selector:
|
|
573
|
+
if node_selector is not None:
|
|
574
|
+
mlrun.k8s_utils.validate_node_selectors(node_selector, raise_on_error=False)
|
|
574
575
|
self.spec.executor_node_selector = node_selector
|
|
575
|
-
if tolerations:
|
|
576
|
+
if tolerations is not None:
|
|
576
577
|
self.spec.executor_tolerations = tolerations
|
|
577
578
|
|
|
578
579
|
def with_preemption_mode(
|
|
@@ -811,9 +812,7 @@ class Spark3Runtime(KubejobRuntime):
|
|
|
811
812
|
|
|
812
813
|
@classmethod
|
|
813
814
|
def deploy_default_image(cls, with_gpu=False):
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
sj = new_function(kind=cls.kind, name="spark-default-image-deploy-temp")
|
|
815
|
+
sj = mlrun.new_function(kind=cls.kind, name="spark-default-image-deploy-temp")
|
|
817
816
|
sj.spec.build.image = cls._get_default_deployed_mlrun_image_name(with_gpu)
|
|
818
817
|
|
|
819
818
|
# setting required resources
|
mlrun/runtimes/utils.py
CHANGED
|
@@ -445,3 +445,37 @@ def enrich_run_labels(
|
|
|
445
445
|
if label.value not in labels and enrichment:
|
|
446
446
|
labels[label.value] = enrichment
|
|
447
447
|
return labels
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
def resolve_node_selectors(
|
|
451
|
+
project_node_selector: dict, instance_node_selector: dict
|
|
452
|
+
) -> dict:
|
|
453
|
+
config_node_selector = mlrun.mlconf.get_default_function_node_selector()
|
|
454
|
+
if project_node_selector or config_node_selector:
|
|
455
|
+
mlrun.utils.logger.debug(
|
|
456
|
+
"Enriching node selector from project and mlrun config",
|
|
457
|
+
project_node_selector=project_node_selector,
|
|
458
|
+
config_node_selector=config_node_selector,
|
|
459
|
+
)
|
|
460
|
+
return mlrun.utils.helpers.merge_dicts_with_precedence(
|
|
461
|
+
config_node_selector,
|
|
462
|
+
project_node_selector,
|
|
463
|
+
instance_node_selector,
|
|
464
|
+
)
|
|
465
|
+
return instance_node_selector
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def enrich_gateway_timeout_annotations(annotations: dict, gateway_timeout: int):
|
|
469
|
+
"""
|
|
470
|
+
Set gateway proxy connect/read/send timeout annotations
|
|
471
|
+
:param annotations: The annotations to enrich
|
|
472
|
+
:param gateway_timeout: The timeout to set
|
|
473
|
+
"""
|
|
474
|
+
if not gateway_timeout:
|
|
475
|
+
return
|
|
476
|
+
gateway_timeout_str = str(gateway_timeout)
|
|
477
|
+
annotations["nginx.ingress.kubernetes.io/proxy-connect-timeout"] = (
|
|
478
|
+
gateway_timeout_str
|
|
479
|
+
)
|
|
480
|
+
annotations["nginx.ingress.kubernetes.io/proxy-read-timeout"] = gateway_timeout_str
|
|
481
|
+
annotations["nginx.ingress.kubernetes.io/proxy-send-timeout"] = gateway_timeout_str
|
mlrun/serving/routers.py
CHANGED
|
@@ -32,7 +32,6 @@ from mlrun.errors import err_to_str
|
|
|
32
32
|
from mlrun.utils import logger, now_date
|
|
33
33
|
|
|
34
34
|
from ..common.helpers import parse_versioned_object_uri
|
|
35
|
-
from ..config import config
|
|
36
35
|
from .server import GraphServer
|
|
37
36
|
from .utils import RouterToDict, _extract_input_data, _update_result_body
|
|
38
37
|
from .v2_serving import _ModelLogPusher
|
|
@@ -616,7 +615,7 @@ class VotingEnsemble(ParallelRun):
|
|
|
616
615
|
logger.warn("GraphServer not initialized for VotingEnsemble instance")
|
|
617
616
|
return
|
|
618
617
|
|
|
619
|
-
if not self.context.is_mock or self.context.
|
|
618
|
+
if not self.context.is_mock or self.context.monitoring_mock:
|
|
620
619
|
self.model_endpoint_uid = _init_endpoint_record(server, self)
|
|
621
620
|
|
|
622
621
|
self._update_weights(self.weights)
|
|
@@ -1057,9 +1056,7 @@ def _init_endpoint_record(
|
|
|
1057
1056
|
function_uri=graph_server.function_uri,
|
|
1058
1057
|
model=versioned_model_name,
|
|
1059
1058
|
model_class=voting_ensemble.__class__.__name__,
|
|
1060
|
-
stream_path=
|
|
1061
|
-
project=project, kind="stream"
|
|
1062
|
-
),
|
|
1059
|
+
stream_path=voting_ensemble.context.stream.stream_uri,
|
|
1063
1060
|
active=True,
|
|
1064
1061
|
monitoring_mode=mlrun.common.schemas.model_monitoring.ModelMonitoringMode.enabled,
|
|
1065
1062
|
),
|
mlrun/serving/server.py
CHANGED
|
@@ -22,10 +22,14 @@ import traceback
|
|
|
22
22
|
import uuid
|
|
23
23
|
from typing import Optional, Union
|
|
24
24
|
|
|
25
|
+
from nuclio import Context as NuclioContext
|
|
26
|
+
from nuclio.request import Logger as NuclioLogger
|
|
27
|
+
|
|
25
28
|
import mlrun
|
|
26
29
|
import mlrun.common.constants
|
|
27
30
|
import mlrun.common.helpers
|
|
28
31
|
import mlrun.model_monitoring
|
|
32
|
+
import mlrun.utils
|
|
29
33
|
from mlrun.config import config
|
|
30
34
|
from mlrun.errors import err_to_str
|
|
31
35
|
from mlrun.secrets import SecretsStore
|
|
@@ -38,10 +42,7 @@ from ..errors import MLRunInvalidArgumentError
|
|
|
38
42
|
from ..model import ModelObj
|
|
39
43
|
from ..utils import get_caller_globals
|
|
40
44
|
from .states import RootFlowStep, RouterStep, get_function, graph_root_setter
|
|
41
|
-
from .utils import
|
|
42
|
-
event_id_key,
|
|
43
|
-
event_path_key,
|
|
44
|
-
)
|
|
45
|
+
from .utils import event_id_key, event_path_key
|
|
45
46
|
|
|
46
47
|
|
|
47
48
|
class _StreamContext:
|
|
@@ -71,15 +72,15 @@ class _StreamContext:
|
|
|
71
72
|
function_uri, config.default_project
|
|
72
73
|
)
|
|
73
74
|
|
|
74
|
-
stream_uri = mlrun.model_monitoring.get_stream_path(project=project)
|
|
75
|
+
self.stream_uri = mlrun.model_monitoring.get_stream_path(project=project)
|
|
75
76
|
|
|
76
77
|
if log_stream:
|
|
77
78
|
# Update the stream path to the log stream value
|
|
78
|
-
stream_uri = log_stream.format(project=project)
|
|
79
|
+
self.stream_uri = log_stream.format(project=project)
|
|
79
80
|
|
|
80
81
|
stream_args = parameters.get("stream_args", {})
|
|
81
82
|
|
|
82
|
-
self.output_stream = get_stream_pusher(stream_uri, **stream_args)
|
|
83
|
+
self.output_stream = get_stream_pusher(self.stream_uri, **stream_args)
|
|
83
84
|
|
|
84
85
|
|
|
85
86
|
class GraphServer(ModelObj):
|
|
@@ -153,6 +154,7 @@ class GraphServer(ModelObj):
|
|
|
153
154
|
resource_cache: ResourceCache = None,
|
|
154
155
|
logger=None,
|
|
155
156
|
is_mock=False,
|
|
157
|
+
monitoring_mock=False,
|
|
156
158
|
):
|
|
157
159
|
"""for internal use, initialize all steps (recursively)"""
|
|
158
160
|
|
|
@@ -165,6 +167,7 @@ class GraphServer(ModelObj):
|
|
|
165
167
|
|
|
166
168
|
context = GraphContext(server=self, nuclio_context=context, logger=logger)
|
|
167
169
|
context.is_mock = is_mock
|
|
170
|
+
context.monitoring_mock = monitoring_mock
|
|
168
171
|
context.root = self.graph
|
|
169
172
|
|
|
170
173
|
context.stream = _StreamContext(
|
|
@@ -321,9 +324,9 @@ def v2_serving_init(context, namespace=None):
|
|
|
321
324
|
server.http_trigger = getattr(context.trigger, "kind", "http") == "http"
|
|
322
325
|
context.logger.info_with(
|
|
323
326
|
"Setting current function",
|
|
324
|
-
|
|
327
|
+
current_function=os.getenv("SERVING_CURRENT_FUNCTION", ""),
|
|
325
328
|
)
|
|
326
|
-
server.set_current_function(os.
|
|
329
|
+
server.set_current_function(os.getenv("SERVING_CURRENT_FUNCTION", ""))
|
|
327
330
|
context.logger.info_with(
|
|
328
331
|
"Initializing states", namespace=namespace or get_caller_globals()
|
|
329
332
|
)
|
|
@@ -344,9 +347,14 @@ def v2_serving_init(context, namespace=None):
|
|
|
344
347
|
if server.verbose:
|
|
345
348
|
context.logger.info(server.to_yaml())
|
|
346
349
|
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
+
_set_callbacks(server, context)
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def _set_callbacks(server, context):
|
|
354
|
+
if not server.graph.supports_termination() or not hasattr(context, "platform"):
|
|
355
|
+
return
|
|
356
|
+
|
|
357
|
+
if hasattr(context.platform, "set_termination_callback"):
|
|
350
358
|
context.logger.info(
|
|
351
359
|
"Setting termination callback to terminate graph on worker shutdown"
|
|
352
360
|
)
|
|
@@ -358,7 +366,7 @@ def v2_serving_init(context, namespace=None):
|
|
|
358
366
|
|
|
359
367
|
context.platform.set_termination_callback(termination_callback)
|
|
360
368
|
|
|
361
|
-
if hasattr(context
|
|
369
|
+
if hasattr(context.platform, "set_drain_callback"):
|
|
362
370
|
context.logger.info(
|
|
363
371
|
"Setting drain callback to terminate and restart the graph on a drain event (such as rebalancing)"
|
|
364
372
|
)
|
|
@@ -385,12 +393,16 @@ def v2_serving_handler(context, event, get_body=False):
|
|
|
385
393
|
|
|
386
394
|
# original path is saved in stream_path so it can be used by explicit ack, but path is reset to / as a
|
|
387
395
|
# workaround for NUC-178
|
|
388
|
-
|
|
396
|
+
# nuclio 1.12.12 added the topic attribute, and we must use it as part of the fix for NUC-233
|
|
397
|
+
# TODO: Remove fallback on event.path once support for nuclio<1.12.12 is dropped
|
|
398
|
+
event.stream_path = getattr(event, "topic", event.path)
|
|
389
399
|
if hasattr(event, "trigger") and event.trigger.kind in (
|
|
390
400
|
"kafka",
|
|
391
401
|
"kafka-cluster",
|
|
392
402
|
"v3ioStream",
|
|
393
403
|
"v3io-stream",
|
|
404
|
+
"rabbit-mq",
|
|
405
|
+
"rabbitMq",
|
|
394
406
|
):
|
|
395
407
|
event.path = "/"
|
|
396
408
|
|
|
@@ -417,7 +429,7 @@ def create_graph_server(
|
|
|
417
429
|
parameters = parameters or {}
|
|
418
430
|
server = GraphServer(graph, parameters, load_mode, verbose=verbose, **kwargs)
|
|
419
431
|
server.set_current_function(
|
|
420
|
-
current_function or os.
|
|
432
|
+
current_function or os.getenv("SERVING_CURRENT_FUNCTION", "")
|
|
421
433
|
)
|
|
422
434
|
return server
|
|
423
435
|
|
|
@@ -481,7 +493,13 @@ class Response:
|
|
|
481
493
|
class GraphContext:
|
|
482
494
|
"""Graph context object"""
|
|
483
495
|
|
|
484
|
-
def __init__(
|
|
496
|
+
def __init__(
|
|
497
|
+
self,
|
|
498
|
+
level="info", # Unused argument
|
|
499
|
+
logger=None,
|
|
500
|
+
server=None,
|
|
501
|
+
nuclio_context: Optional[NuclioContext] = None,
|
|
502
|
+
) -> None:
|
|
485
503
|
self.state = None
|
|
486
504
|
self.logger = logger
|
|
487
505
|
self.worker_id = 0
|
|
@@ -491,7 +509,7 @@ class GraphContext:
|
|
|
491
509
|
self.root = None
|
|
492
510
|
|
|
493
511
|
if nuclio_context:
|
|
494
|
-
self.logger = nuclio_context.logger
|
|
512
|
+
self.logger: NuclioLogger = nuclio_context.logger
|
|
495
513
|
self.Response = nuclio_context.Response
|
|
496
514
|
if hasattr(nuclio_context, "trigger") and hasattr(
|
|
497
515
|
nuclio_context.trigger, "kind"
|
|
@@ -501,7 +519,7 @@ class GraphContext:
|
|
|
501
519
|
if hasattr(nuclio_context, "platform"):
|
|
502
520
|
self.platform = nuclio_context.platform
|
|
503
521
|
elif not logger:
|
|
504
|
-
self.logger = mlrun.utils.
|
|
522
|
+
self.logger: mlrun.utils.Logger = mlrun.utils.logger
|
|
505
523
|
|
|
506
524
|
self._server = server
|
|
507
525
|
self.current_function = None
|
|
@@ -514,7 +532,7 @@ class GraphContext:
|
|
|
514
532
|
return self._server
|
|
515
533
|
|
|
516
534
|
@property
|
|
517
|
-
def project(self):
|
|
535
|
+
def project(self) -> str:
|
|
518
536
|
"""current project name (for the current function)"""
|
|
519
537
|
project, _, _, _ = mlrun.common.helpers.parse_versioned_object_uri(
|
|
520
538
|
self._server.function_uri
|