mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic; see the registry's advisory page for more details.

Files changed (135):
  1. mlrun/__main__.py +4 -2
  2. mlrun/alerts/alert.py +75 -8
  3. mlrun/artifacts/base.py +1 -0
  4. mlrun/artifacts/manager.py +9 -2
  5. mlrun/common/constants.py +4 -1
  6. mlrun/common/db/sql_session.py +3 -2
  7. mlrun/common/formatters/__init__.py +1 -0
  8. mlrun/common/formatters/artifact.py +1 -0
  9. mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
  10. mlrun/common/formatters/run.py +3 -0
  11. mlrun/common/helpers.py +0 -1
  12. mlrun/common/schemas/__init__.py +3 -1
  13. mlrun/common/schemas/alert.py +15 -12
  14. mlrun/common/schemas/api_gateway.py +6 -6
  15. mlrun/common/schemas/auth.py +5 -0
  16. mlrun/common/schemas/client_spec.py +0 -1
  17. mlrun/common/schemas/common.py +7 -4
  18. mlrun/common/schemas/frontend_spec.py +7 -0
  19. mlrun/common/schemas/function.py +7 -0
  20. mlrun/common/schemas/model_monitoring/__init__.py +4 -3
  21. mlrun/common/schemas/model_monitoring/constants.py +41 -26
  22. mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
  23. mlrun/common/schemas/notification.py +69 -12
  24. mlrun/common/schemas/project.py +45 -12
  25. mlrun/common/schemas/workflow.py +10 -2
  26. mlrun/common/types.py +1 -0
  27. mlrun/config.py +91 -35
  28. mlrun/data_types/data_types.py +6 -1
  29. mlrun/data_types/spark.py +2 -2
  30. mlrun/data_types/to_pandas.py +57 -25
  31. mlrun/datastore/__init__.py +1 -0
  32. mlrun/datastore/alibaba_oss.py +3 -2
  33. mlrun/datastore/azure_blob.py +125 -37
  34. mlrun/datastore/base.py +42 -21
  35. mlrun/datastore/datastore.py +4 -2
  36. mlrun/datastore/datastore_profile.py +1 -1
  37. mlrun/datastore/dbfs_store.py +3 -7
  38. mlrun/datastore/filestore.py +1 -3
  39. mlrun/datastore/google_cloud_storage.py +85 -29
  40. mlrun/datastore/inmem.py +4 -1
  41. mlrun/datastore/redis.py +1 -0
  42. mlrun/datastore/s3.py +25 -12
  43. mlrun/datastore/sources.py +76 -4
  44. mlrun/datastore/spark_utils.py +30 -0
  45. mlrun/datastore/storeytargets.py +151 -0
  46. mlrun/datastore/targets.py +102 -131
  47. mlrun/datastore/v3io.py +1 -0
  48. mlrun/db/base.py +15 -6
  49. mlrun/db/httpdb.py +57 -28
  50. mlrun/db/nopdb.py +29 -5
  51. mlrun/errors.py +20 -3
  52. mlrun/execution.py +46 -5
  53. mlrun/feature_store/api.py +25 -1
  54. mlrun/feature_store/common.py +6 -11
  55. mlrun/feature_store/feature_vector.py +3 -1
  56. mlrun/feature_store/retrieval/job.py +4 -1
  57. mlrun/feature_store/retrieval/spark_merger.py +10 -39
  58. mlrun/feature_store/steps.py +8 -0
  59. mlrun/frameworks/_common/plan.py +3 -3
  60. mlrun/frameworks/_ml_common/plan.py +1 -1
  61. mlrun/frameworks/parallel_coordinates.py +2 -3
  62. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  63. mlrun/k8s_utils.py +48 -2
  64. mlrun/launcher/client.py +6 -6
  65. mlrun/launcher/local.py +2 -2
  66. mlrun/model.py +215 -34
  67. mlrun/model_monitoring/api.py +38 -24
  68. mlrun/model_monitoring/applications/__init__.py +1 -2
  69. mlrun/model_monitoring/applications/_application_steps.py +60 -29
  70. mlrun/model_monitoring/applications/base.py +2 -174
  71. mlrun/model_monitoring/applications/context.py +197 -70
  72. mlrun/model_monitoring/applications/evidently_base.py +11 -85
  73. mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
  74. mlrun/model_monitoring/applications/results.py +4 -4
  75. mlrun/model_monitoring/controller.py +110 -282
  76. mlrun/model_monitoring/db/stores/__init__.py +8 -3
  77. mlrun/model_monitoring/db/stores/base/store.py +3 -0
  78. mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
  79. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
  80. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
  81. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
  82. mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
  83. mlrun/model_monitoring/db/tsdb/base.py +147 -15
  84. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
  85. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
  86. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
  87. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
  88. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
  89. mlrun/model_monitoring/helpers.py +70 -50
  90. mlrun/model_monitoring/stream_processing.py +96 -195
  91. mlrun/model_monitoring/writer.py +13 -5
  92. mlrun/package/packagers/default_packager.py +2 -2
  93. mlrun/projects/operations.py +16 -8
  94. mlrun/projects/pipelines.py +126 -115
  95. mlrun/projects/project.py +286 -129
  96. mlrun/render.py +3 -3
  97. mlrun/run.py +38 -19
  98. mlrun/runtimes/__init__.py +19 -8
  99. mlrun/runtimes/base.py +4 -1
  100. mlrun/runtimes/daskjob.py +1 -1
  101. mlrun/runtimes/funcdoc.py +1 -1
  102. mlrun/runtimes/kubejob.py +6 -6
  103. mlrun/runtimes/local.py +12 -5
  104. mlrun/runtimes/nuclio/api_gateway.py +68 -8
  105. mlrun/runtimes/nuclio/application/application.py +307 -70
  106. mlrun/runtimes/nuclio/function.py +63 -14
  107. mlrun/runtimes/nuclio/serving.py +10 -10
  108. mlrun/runtimes/pod.py +25 -19
  109. mlrun/runtimes/remotesparkjob.py +2 -5
  110. mlrun/runtimes/sparkjob/spark3job.py +16 -17
  111. mlrun/runtimes/utils.py +34 -0
  112. mlrun/serving/routers.py +2 -5
  113. mlrun/serving/server.py +37 -19
  114. mlrun/serving/states.py +30 -3
  115. mlrun/serving/v2_serving.py +44 -35
  116. mlrun/track/trackers/mlflow_tracker.py +5 -0
  117. mlrun/utils/async_http.py +1 -1
  118. mlrun/utils/db.py +18 -0
  119. mlrun/utils/helpers.py +150 -36
  120. mlrun/utils/http.py +1 -1
  121. mlrun/utils/notifications/notification/__init__.py +0 -1
  122. mlrun/utils/notifications/notification/webhook.py +8 -1
  123. mlrun/utils/notifications/notification_pusher.py +1 -1
  124. mlrun/utils/v3io_clients.py +2 -2
  125. mlrun/utils/version/version.json +2 -2
  126. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
  127. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
  128. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
  129. mlrun/feature_store/retrieval/conversion.py +0 -271
  130. mlrun/model_monitoring/controller_handler.py +0 -37
  131. mlrun/model_monitoring/evidently_application.py +0 -20
  132. mlrun/model_monitoring/prometheus.py +0 -216
  133. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
  134. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
  135. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
@@ -23,6 +23,7 @@ import inflection
23
23
  import nuclio
24
24
  import nuclio.utils
25
25
  import requests
26
+ import semver
26
27
  from aiohttp.client import ClientSession
27
28
  from kubernetes import client
28
29
  from mlrun_pipelines.common.mounts import VolumeMount
@@ -296,10 +297,37 @@ class RemoteRuntime(KubeResource):
296
297
  """
297
298
  if hasattr(spec, "to_dict"):
298
299
  spec = spec.to_dict()
300
+
301
+ self._validate_triggers(spec)
302
+
299
303
  spec["name"] = name
300
304
  self.spec.config[f"spec.triggers.{name}"] = spec
301
305
  return self
302
306
 
307
+ def _validate_triggers(self, spec):
308
+ # ML-7763 / NUC-233
309
+ min_nuclio_version = "1.13.12"
310
+ if mlconf.nuclio_version and semver.VersionInfo.parse(
311
+ mlconf.nuclio_version
312
+ ) < semver.VersionInfo.parse(min_nuclio_version):
313
+ explicit_ack_enabled = False
314
+ num_triggers = 0
315
+ trigger_name = spec.get("name", "UNKNOWN")
316
+ for key, config in [(f"spec.triggers.{trigger_name}", spec)] + list(
317
+ self.spec.config.items()
318
+ ):
319
+ if key.startswith("spec.triggers."):
320
+ num_triggers += 1
321
+ explicit_ack_enabled = (
322
+ config.get("explicitAckMode", "disable") != "disable"
323
+ )
324
+
325
+ if num_triggers > 1 and explicit_ack_enabled:
326
+ raise mlrun.errors.MLRunInvalidArgumentError(
327
+ "Multiple triggers cannot be used in conjunction with explicit ack. "
328
+ f"Please upgrade to nuclio {min_nuclio_version} or newer."
329
+ )
330
+
303
331
  def with_source_archive(
304
332
  self,
305
333
  source,
@@ -418,14 +446,8 @@ class RemoteRuntime(KubeResource):
418
446
  raise ValueError(
419
447
  "gateway timeout must be greater than the worker timeout"
420
448
  )
421
- annotations["nginx.ingress.kubernetes.io/proxy-connect-timeout"] = (
422
- f"{gateway_timeout}"
423
- )
424
- annotations["nginx.ingress.kubernetes.io/proxy-read-timeout"] = (
425
- f"{gateway_timeout}"
426
- )
427
- annotations["nginx.ingress.kubernetes.io/proxy-send-timeout"] = (
428
- f"{gateway_timeout}"
449
+ mlrun.runtimes.utils.enrich_gateway_timeout_annotations(
450
+ annotations, gateway_timeout
429
451
  )
430
452
 
431
453
  trigger = nuclio.HttpTrigger(
@@ -446,6 +468,11 @@ class RemoteRuntime(KubeResource):
446
468
  return self
447
469
 
448
470
  def from_image(self, image):
471
+ """
472
+ Deploy the function with an existing nuclio processor image.
473
+
474
+ :param image: image name
475
+ """
449
476
  config = nuclio.config.new_config()
450
477
  update_in(
451
478
  config,
@@ -496,6 +523,11 @@ class RemoteRuntime(KubeResource):
496
523
  extra_attributes = extra_attributes or {}
497
524
  if ack_window_size:
498
525
  extra_attributes["ackWindowSize"] = ack_window_size
526
+
527
+ access_key = kwargs.pop("access_key", None)
528
+ if not access_key:
529
+ access_key = self._resolve_v3io_access_key()
530
+
499
531
  self.add_trigger(
500
532
  name,
501
533
  V3IOStreamTrigger(
@@ -507,11 +539,14 @@ class RemoteRuntime(KubeResource):
507
539
  webapi=endpoint or "http://v3io-webapi:8081",
508
540
  extra_attributes=extra_attributes,
509
541
  read_batch_size=256,
542
+ access_key=access_key,
510
543
  **kwargs,
511
544
  ),
512
545
  )
513
- self.spec.min_replicas = shards
514
- self.spec.max_replicas = shards
546
+ if self.spec.min_replicas != shards or self.spec.max_replicas != shards:
547
+ logger.warning(f"Setting function replicas to {shards}")
548
+ self.spec.min_replicas = shards
549
+ self.spec.max_replicas = shards
515
550
 
516
551
  def deploy(
517
552
  self,
@@ -566,6 +601,9 @@ class RemoteRuntime(KubeResource):
566
601
  # this also means that the function object will be updated with the function status
567
602
  self._wait_for_function_deployment(db, verbose=verbose)
568
603
 
604
+ return self._enrich_command_from_status()
605
+
606
+ def _enrich_command_from_status(self):
569
607
  # NOTE: on older mlrun versions & nuclio versions, function are exposed via NodePort
570
608
  # now, functions can be not exposed (using service type ClusterIP) and hence
571
609
  # for BC we first try to populate the external invocation url, and then
@@ -679,7 +717,7 @@ class RemoteRuntime(KubeResource):
679
717
  "State thresholds do not apply for nuclio as it has its own function pods healthiness monitoring"
680
718
  )
681
719
 
682
- @min_nuclio_versions("1.12.8")
720
+ @min_nuclio_versions("1.13.1")
683
721
  def disable_default_http_trigger(
684
722
  self,
685
723
  ):
@@ -688,7 +726,7 @@ class RemoteRuntime(KubeResource):
688
726
  """
689
727
  self.spec.disable_default_http_trigger = True
690
728
 
691
- @min_nuclio_versions("1.12.8")
729
+ @min_nuclio_versions("1.13.1")
692
730
  def enable_default_http_trigger(
693
731
  self,
694
732
  ):
@@ -697,6 +735,10 @@ class RemoteRuntime(KubeResource):
697
735
  """
698
736
  self.spec.disable_default_http_trigger = False
699
737
 
738
+ def skip_image_enrichment(self):
739
+ # make sure the API does not enrich the base image if the function is not a python function
740
+ return self.spec.nuclio_runtime and "python" not in self.spec.nuclio_runtime
741
+
700
742
  def _get_state(
701
743
  self,
702
744
  dashboard="",
@@ -739,7 +781,7 @@ class RemoteRuntime(KubeResource):
739
781
  return state, text, last_log_timestamp
740
782
 
741
783
  try:
742
- text, last_log_timestamp = self._get_db().get_builder_status(
784
+ text, last_log_timestamp = self._get_db().get_nuclio_deploy_status(
743
785
  self, last_log_timestamp=last_log_timestamp, verbose=verbose
744
786
  )
745
787
  except mlrun.db.RunDBError:
@@ -990,7 +1032,7 @@ class RemoteRuntime(KubeResource):
990
1032
  if command and not command.startswith("http"):
991
1033
  sidecar["command"] = mlrun.utils.helpers.as_list(command)
992
1034
 
993
- if args and sidecar["command"]:
1035
+ if args and sidecar.get("command"):
994
1036
  sidecar["args"] = mlrun.utils.helpers.as_list(args)
995
1037
 
996
1038
  # populate the sidecar resources from the function spec
@@ -1233,6 +1275,13 @@ class RemoteRuntime(KubeResource):
1233
1275
 
1234
1276
  return self._resolve_invocation_url("", force_external_address)
1235
1277
 
1278
+ @staticmethod
1279
+ def _resolve_v3io_access_key():
1280
+ # Nuclio supports generating access key for v3io stream trigger only from version 1.13.11
1281
+ if validate_nuclio_version_compatibility("1.13.11"):
1282
+ return mlrun.model.Credentials.generate_access_key
1283
+ return None
1284
+
1236
1285
 
1237
1286
  def parse_logs(logs):
1238
1287
  logs = json.loads(logs)
@@ -314,8 +314,8 @@ class ServingRuntime(RemoteRuntime):
314
314
  tracking_policy: Optional[Union["TrackingPolicy", dict]] = None,
315
315
  enable_tracking: bool = True,
316
316
  ) -> None:
317
- """apply on your serving function to monitor a deployed model, including real-time dashboards to detect drift
318
- and analyze performance.
317
+ """Apply on your serving function to monitor a deployed model, including real-time dashboards to detect drift
318
+ and analyze performance.
319
319
 
320
320
  :param stream_path: Path/url of the tracking stream e.g. v3io:///users/mike/mystream
321
321
  you can use the "dummy://" path for test/simulation.
@@ -325,12 +325,12 @@ class ServingRuntime(RemoteRuntime):
325
325
  :param enable_tracking: Enabled/Disable model-monitoring tracking.
326
326
  Default True (tracking enabled).
327
327
 
328
- example::
328
+ Example::
329
329
 
330
- # initialize a new serving function
331
- serving_fn = mlrun.import_function("hub://v2-model-server", new_name="serving")
332
- # apply model monitoring
333
- serving_fn.set_tracking()
330
+ # initialize a new serving function
331
+ serving_fn = mlrun.import_function("hub://v2-model-server", new_name="serving")
332
+ # apply model monitoring
333
+ serving_fn.set_tracking()
334
334
 
335
335
  """
336
336
  # Applying model monitoring configurations
@@ -480,7 +480,7 @@ class ServingRuntime(RemoteRuntime):
480
480
  trigger_args = stream.trigger_args or {}
481
481
 
482
482
  engine = self.spec.graph.engine or "async"
483
- if mlrun.mlconf.is_explicit_ack() and engine == "async":
483
+ if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
484
484
  trigger_args["explicit_ack_mode"] = trigger_args.get(
485
485
  "explicit_ack_mode", "explicitOnly"
486
486
  )
@@ -676,7 +676,6 @@ class ServingRuntime(RemoteRuntime):
676
676
  """create mock server object for local testing/emulation
677
677
 
678
678
  :param namespace: one or list of namespaces/modules to search the steps classes/functions in
679
- :param log_level: log level (error | info | debug)
680
679
  :param current_function: specify if you want to simulate a child function, * for all functions
681
680
  :param track_models: allow model tracking (disabled by default in the mock server)
682
681
  :param workdir: working directory to locate the source code (if not the current one)
@@ -704,7 +703,7 @@ class ServingRuntime(RemoteRuntime):
704
703
  verbose=self.verbose,
705
704
  current_function=current_function,
706
705
  graph_initializer=self.spec.graph_initializer,
707
- track_models=track_models and self.spec.track_models,
706
+ track_models=self.spec.track_models,
708
707
  function_uri=self._function_uri(),
709
708
  secret_sources=self.spec.secret_sources,
710
709
  default_content_type=self.spec.default_content_type,
@@ -715,6 +714,7 @@ class ServingRuntime(RemoteRuntime):
715
714
  namespace=namespace,
716
715
  logger=logger,
717
716
  is_mock=True,
717
+ monitoring_mock=track_models,
718
718
  )
719
719
 
720
720
  if workdir:
mlrun/runtimes/pod.py CHANGED
@@ -38,6 +38,7 @@ from ..k8s_utils import (
38
38
  generate_preemptible_nodes_affinity_terms,
39
39
  generate_preemptible_nodes_anti_affinity_terms,
40
40
  generate_preemptible_tolerations,
41
+ validate_node_selectors,
41
42
  )
42
43
  from ..utils import logger, update_in
43
44
  from .base import BaseRuntime, FunctionSpec, spec_fields
@@ -215,9 +216,7 @@ class KubeResourceSpec(FunctionSpec):
215
216
  image_pull_secret or mlrun.mlconf.function.spec.image_pull_secret.default
216
217
  )
217
218
  self.node_name = node_name
218
- self.node_selector = (
219
- node_selector or mlrun.mlconf.get_default_function_node_selector()
220
- )
219
+ self.node_selector = node_selector or {}
221
220
  self._affinity = affinity
222
221
  self.priority_class_name = (
223
222
  priority_class_name or mlrun.mlconf.default_function_priority_class_name
@@ -532,7 +531,7 @@ class KubeResourceSpec(FunctionSpec):
532
531
  return
533
532
 
534
533
  # merge node selectors - precedence to existing node selector
535
- self.node_selector = mlrun.utils.helpers.merge_with_precedence(
534
+ self.node_selector = mlrun.utils.helpers.merge_dicts_with_precedence(
536
535
  node_selector, self.node_selector
537
536
  )
538
537
 
@@ -1108,12 +1107,12 @@ class KubeResource(BaseRuntime, KfpAdapterMixin):
1108
1107
 
1109
1108
  :param state_thresholds: A dictionary of state to threshold. The supported states are:
1110
1109
 
1111
- * pending_scheduled - The pod/crd is scheduled on a node but not yet running
1112
- * pending_not_scheduled - The pod/crd is not yet scheduled on a node
1113
- * executing - The pod/crd started and is running
1114
- * image_pull_backoff - The pod/crd is in image pull backoff
1115
- See mlrun.mlconf.function.spec.state_thresholds for the default thresholds.
1110
+ * pending_scheduled - The pod/crd is scheduled on a node but not yet running
1111
+ * pending_not_scheduled - The pod/crd is not yet scheduled on a node
1112
+ * executing - The pod/crd started and is running
1113
+ * image_pull_backoff - The pod/crd is in image pull backoff
1116
1114
 
1115
+ See :code:`mlrun.mlconf.function.spec.state_thresholds` for the default thresholds.
1117
1116
  :param patch: Whether to merge the given thresholds with the existing thresholds (True, default)
1118
1117
  or override them (False)
1119
1118
  """
@@ -1176,9 +1175,10 @@ class KubeResource(BaseRuntime, KfpAdapterMixin):
1176
1175
  """
1177
1176
  if node_name:
1178
1177
  self.spec.node_name = node_name
1179
- if node_selector:
1178
+ if node_selector is not None:
1179
+ validate_node_selectors(node_selectors=node_selector, raise_on_error=False)
1180
1180
  self.spec.node_selector = node_selector
1181
- if affinity:
1181
+ if affinity is not None:
1182
1182
  self.spec.affinity = affinity
1183
1183
  if tolerations is not None:
1184
1184
  self.spec.tolerations = tolerations
@@ -1347,20 +1347,26 @@ class KubeResource(BaseRuntime, KfpAdapterMixin):
1347
1347
 
1348
1348
  def _build_image(
1349
1349
  self,
1350
- builder_env,
1351
- force_build,
1352
- mlrun_version_specifier,
1353
- show_on_failure,
1354
- skip_deployed,
1355
- watch,
1356
- is_kfp,
1357
- with_mlrun,
1350
+ builder_env: dict,
1351
+ force_build: bool,
1352
+ mlrun_version_specifier: typing.Optional[bool],
1353
+ show_on_failure: bool,
1354
+ skip_deployed: bool,
1355
+ watch: bool,
1356
+ is_kfp: bool,
1357
+ with_mlrun: typing.Optional[bool],
1358
1358
  ):
1359
1359
  # When we're in pipelines context we must watch otherwise the pipelines pod will exit before the operation
1360
1360
  # is actually done. (when a pipelines pod exits, the pipeline step marked as done)
1361
1361
  if is_kfp:
1362
1362
  watch = True
1363
1363
 
1364
+ if skip_deployed and self.requires_build() and not self.is_deployed():
1365
+ logger.warning(
1366
+ f"Even though {skip_deployed=}, the build might be triggered due to the function's configuration. "
1367
+ "See requires_build() and is_deployed() for reasoning."
1368
+ )
1369
+
1364
1370
  db = self._get_db()
1365
1371
  data = db.remote_builder(
1366
1372
  self,
@@ -102,16 +102,13 @@ class RemoteSparkRuntime(KubejobRuntime):
102
102
 
103
103
  @classmethod
104
104
  def deploy_default_image(cls):
105
- from mlrun import get_run_db
106
- from mlrun.run import new_function
107
-
108
- sj = new_function(
105
+ sj = mlrun.new_function(
109
106
  kind="remote-spark", name="remote-spark-default-image-deploy-temp"
110
107
  )
111
108
  sj.spec.build.image = cls.default_image
112
109
  sj.with_spark_service(spark_service="dummy-spark")
113
110
  sj.deploy()
114
- get_run_db().delete_function(name=sj.metadata.name)
111
+ mlrun.get_run_db().delete_function(name=sj.metadata.name)
115
112
 
116
113
  def is_deployed(self):
117
114
  if (
@@ -18,6 +18,7 @@ from mlrun_pipelines.mounts import mount_v3io, mount_v3iod
18
18
 
19
19
  import mlrun.common.schemas.function
20
20
  import mlrun.errors
21
+ import mlrun.k8s_utils
21
22
  import mlrun.runtimes.pod
22
23
  from mlrun.config import config
23
24
 
@@ -451,7 +452,7 @@ class Spark3JobSpec(KubeResourceSpec):
451
452
  class Spark3Runtime(KubejobRuntime):
452
453
  group = "sparkoperator.k8s.io"
453
454
  version = "v1beta2"
454
- apiVersion = group + "/" + version
455
+ apiVersion = group + "/" + version # noqa: N815
455
456
  kind = "spark"
456
457
  plural = "sparkapplications"
457
458
 
@@ -505,13 +506,11 @@ class Spark3Runtime(KubejobRuntime):
505
506
  raise NotImplementedError(
506
507
  "Setting node name is not supported for spark runtime"
507
508
  )
508
- # TODO add affinity support
509
- # https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/blob/master/pkg/apis/sparkoperator.k8s.io/v1beta2/types.go#L491
510
- if affinity:
511
- raise NotImplementedError(
512
- "Setting affinity is not supported for spark runtime"
513
- )
514
- super().with_node_selection(node_name, node_selector, affinity, tolerations)
509
+ mlrun.k8s_utils.validate_node_selectors(node_selector, raise_on_error=False)
510
+ self.with_driver_node_selection(node_name, node_selector, affinity, tolerations)
511
+ self.with_executor_node_selection(
512
+ node_name, node_selector, affinity, tolerations
513
+ )
515
514
 
516
515
  def with_driver_node_selection(
517
516
  self,
@@ -537,11 +536,12 @@ class Spark3Runtime(KubejobRuntime):
537
536
  raise NotImplementedError(
538
537
  "Setting node name is not supported for spark runtime"
539
538
  )
540
- if affinity:
539
+ if affinity is not None:
541
540
  self.spec.driver_affinity = affinity
542
- if node_selector:
541
+ if node_selector is not None:
542
+ mlrun.k8s_utils.validate_node_selectors(node_selector, raise_on_error=False)
543
543
  self.spec.driver_node_selector = node_selector
544
- if tolerations:
544
+ if tolerations is not None:
545
545
  self.spec.driver_tolerations = tolerations
546
546
 
547
547
  def with_executor_node_selection(
@@ -568,11 +568,12 @@ class Spark3Runtime(KubejobRuntime):
568
568
  raise NotImplementedError(
569
569
  "Setting node name is not supported for spark runtime"
570
570
  )
571
- if affinity:
571
+ if affinity is not None:
572
572
  self.spec.executor_affinity = affinity
573
- if node_selector:
573
+ if node_selector is not None:
574
+ mlrun.k8s_utils.validate_node_selectors(node_selector, raise_on_error=False)
574
575
  self.spec.executor_node_selector = node_selector
575
- if tolerations:
576
+ if tolerations is not None:
576
577
  self.spec.executor_tolerations = tolerations
577
578
 
578
579
  def with_preemption_mode(
@@ -811,9 +812,7 @@ class Spark3Runtime(KubejobRuntime):
811
812
 
812
813
  @classmethod
813
814
  def deploy_default_image(cls, with_gpu=False):
814
- from mlrun.run import new_function
815
-
816
- sj = new_function(kind=cls.kind, name="spark-default-image-deploy-temp")
815
+ sj = mlrun.new_function(kind=cls.kind, name="spark-default-image-deploy-temp")
817
816
  sj.spec.build.image = cls._get_default_deployed_mlrun_image_name(with_gpu)
818
817
 
819
818
  # setting required resources
mlrun/runtimes/utils.py CHANGED
@@ -445,3 +445,37 @@ def enrich_run_labels(
445
445
  if label.value not in labels and enrichment:
446
446
  labels[label.value] = enrichment
447
447
  return labels
448
+
449
+
450
+ def resolve_node_selectors(
451
+ project_node_selector: dict, instance_node_selector: dict
452
+ ) -> dict:
453
+ config_node_selector = mlrun.mlconf.get_default_function_node_selector()
454
+ if project_node_selector or config_node_selector:
455
+ mlrun.utils.logger.debug(
456
+ "Enriching node selector from project and mlrun config",
457
+ project_node_selector=project_node_selector,
458
+ config_node_selector=config_node_selector,
459
+ )
460
+ return mlrun.utils.helpers.merge_dicts_with_precedence(
461
+ config_node_selector,
462
+ project_node_selector,
463
+ instance_node_selector,
464
+ )
465
+ return instance_node_selector
466
+
467
+
468
+ def enrich_gateway_timeout_annotations(annotations: dict, gateway_timeout: int):
469
+ """
470
+ Set gateway proxy connect/read/send timeout annotations
471
+ :param annotations: The annotations to enrich
472
+ :param gateway_timeout: The timeout to set
473
+ """
474
+ if not gateway_timeout:
475
+ return
476
+ gateway_timeout_str = str(gateway_timeout)
477
+ annotations["nginx.ingress.kubernetes.io/proxy-connect-timeout"] = (
478
+ gateway_timeout_str
479
+ )
480
+ annotations["nginx.ingress.kubernetes.io/proxy-read-timeout"] = gateway_timeout_str
481
+ annotations["nginx.ingress.kubernetes.io/proxy-send-timeout"] = gateway_timeout_str
mlrun/serving/routers.py CHANGED
@@ -32,7 +32,6 @@ from mlrun.errors import err_to_str
32
32
  from mlrun.utils import logger, now_date
33
33
 
34
34
  from ..common.helpers import parse_versioned_object_uri
35
- from ..config import config
36
35
  from .server import GraphServer
37
36
  from .utils import RouterToDict, _extract_input_data, _update_result_body
38
37
  from .v2_serving import _ModelLogPusher
@@ -616,7 +615,7 @@ class VotingEnsemble(ParallelRun):
616
615
  logger.warn("GraphServer not initialized for VotingEnsemble instance")
617
616
  return
618
617
 
619
- if not self.context.is_mock or self.context.server.track_models:
618
+ if not self.context.is_mock or self.context.monitoring_mock:
620
619
  self.model_endpoint_uid = _init_endpoint_record(server, self)
621
620
 
622
621
  self._update_weights(self.weights)
@@ -1057,9 +1056,7 @@ def _init_endpoint_record(
1057
1056
  function_uri=graph_server.function_uri,
1058
1057
  model=versioned_model_name,
1059
1058
  model_class=voting_ensemble.__class__.__name__,
1060
- stream_path=config.model_endpoint_monitoring.store_prefixes.default.format(
1061
- project=project, kind="stream"
1062
- ),
1059
+ stream_path=voting_ensemble.context.stream.stream_uri,
1063
1060
  active=True,
1064
1061
  monitoring_mode=mlrun.common.schemas.model_monitoring.ModelMonitoringMode.enabled,
1065
1062
  ),
mlrun/serving/server.py CHANGED
@@ -22,10 +22,14 @@ import traceback
22
22
  import uuid
23
23
  from typing import Optional, Union
24
24
 
25
+ from nuclio import Context as NuclioContext
26
+ from nuclio.request import Logger as NuclioLogger
27
+
25
28
  import mlrun
26
29
  import mlrun.common.constants
27
30
  import mlrun.common.helpers
28
31
  import mlrun.model_monitoring
32
+ import mlrun.utils
29
33
  from mlrun.config import config
30
34
  from mlrun.errors import err_to_str
31
35
  from mlrun.secrets import SecretsStore
@@ -38,10 +42,7 @@ from ..errors import MLRunInvalidArgumentError
38
42
  from ..model import ModelObj
39
43
  from ..utils import get_caller_globals
40
44
  from .states import RootFlowStep, RouterStep, get_function, graph_root_setter
41
- from .utils import (
42
- event_id_key,
43
- event_path_key,
44
- )
45
+ from .utils import event_id_key, event_path_key
45
46
 
46
47
 
47
48
  class _StreamContext:
@@ -71,15 +72,15 @@ class _StreamContext:
71
72
  function_uri, config.default_project
72
73
  )
73
74
 
74
- stream_uri = mlrun.model_monitoring.get_stream_path(project=project)
75
+ self.stream_uri = mlrun.model_monitoring.get_stream_path(project=project)
75
76
 
76
77
  if log_stream:
77
78
  # Update the stream path to the log stream value
78
- stream_uri = log_stream.format(project=project)
79
+ self.stream_uri = log_stream.format(project=project)
79
80
 
80
81
  stream_args = parameters.get("stream_args", {})
81
82
 
82
- self.output_stream = get_stream_pusher(stream_uri, **stream_args)
83
+ self.output_stream = get_stream_pusher(self.stream_uri, **stream_args)
83
84
 
84
85
 
85
86
  class GraphServer(ModelObj):
@@ -153,6 +154,7 @@ class GraphServer(ModelObj):
153
154
  resource_cache: ResourceCache = None,
154
155
  logger=None,
155
156
  is_mock=False,
157
+ monitoring_mock=False,
156
158
  ):
157
159
  """for internal use, initialize all steps (recursively)"""
158
160
 
@@ -165,6 +167,7 @@ class GraphServer(ModelObj):
165
167
 
166
168
  context = GraphContext(server=self, nuclio_context=context, logger=logger)
167
169
  context.is_mock = is_mock
170
+ context.monitoring_mock = monitoring_mock
168
171
  context.root = self.graph
169
172
 
170
173
  context.stream = _StreamContext(
@@ -321,9 +324,9 @@ def v2_serving_init(context, namespace=None):
321
324
  server.http_trigger = getattr(context.trigger, "kind", "http") == "http"
322
325
  context.logger.info_with(
323
326
  "Setting current function",
324
- current_functiton=os.environ.get("SERVING_CURRENT_FUNCTION", ""),
327
+ current_function=os.getenv("SERVING_CURRENT_FUNCTION", ""),
325
328
  )
326
- server.set_current_function(os.environ.get("SERVING_CURRENT_FUNCTION", ""))
329
+ server.set_current_function(os.getenv("SERVING_CURRENT_FUNCTION", ""))
327
330
  context.logger.info_with(
328
331
  "Initializing states", namespace=namespace or get_caller_globals()
329
332
  )
@@ -344,9 +347,14 @@ def v2_serving_init(context, namespace=None):
344
347
  if server.verbose:
345
348
  context.logger.info(server.to_yaml())
346
349
 
347
- if hasattr(context, "platform") and hasattr(
348
- context.platform, "set_termination_callback"
349
- ):
350
+ _set_callbacks(server, context)
351
+
352
+
353
+ def _set_callbacks(server, context):
354
+ if not server.graph.supports_termination() or not hasattr(context, "platform"):
355
+ return
356
+
357
+ if hasattr(context.platform, "set_termination_callback"):
350
358
  context.logger.info(
351
359
  "Setting termination callback to terminate graph on worker shutdown"
352
360
  )
@@ -358,7 +366,7 @@ def v2_serving_init(context, namespace=None):
358
366
 
359
367
  context.platform.set_termination_callback(termination_callback)
360
368
 
361
- if hasattr(context, "platform") and hasattr(context.platform, "set_drain_callback"):
369
+ if hasattr(context.platform, "set_drain_callback"):
362
370
  context.logger.info(
363
371
  "Setting drain callback to terminate and restart the graph on a drain event (such as rebalancing)"
364
372
  )
@@ -385,12 +393,16 @@ def v2_serving_handler(context, event, get_body=False):
385
393
 
386
394
  # original path is saved in stream_path so it can be used by explicit ack, but path is reset to / as a
387
395
  # workaround for NUC-178
388
- event.stream_path = event.path
396
+ # nuclio 1.12.12 added the topic attribute, and we must use it as part of the fix for NUC-233
397
+ # TODO: Remove fallback on event.path once support for nuclio<1.12.12 is dropped
398
+ event.stream_path = getattr(event, "topic", event.path)
389
399
  if hasattr(event, "trigger") and event.trigger.kind in (
390
400
  "kafka",
391
401
  "kafka-cluster",
392
402
  "v3ioStream",
393
403
  "v3io-stream",
404
+ "rabbit-mq",
405
+ "rabbitMq",
394
406
  ):
395
407
  event.path = "/"
396
408
 
@@ -417,7 +429,7 @@ def create_graph_server(
417
429
  parameters = parameters or {}
418
430
  server = GraphServer(graph, parameters, load_mode, verbose=verbose, **kwargs)
419
431
  server.set_current_function(
420
- current_function or os.environ.get("SERVING_CURRENT_FUNCTION", "")
432
+ current_function or os.getenv("SERVING_CURRENT_FUNCTION", "")
421
433
  )
422
434
  return server
423
435
 
@@ -481,7 +493,13 @@ class Response:
481
493
  class GraphContext:
482
494
  """Graph context object"""
483
495
 
484
- def __init__(self, level="info", logger=None, server=None, nuclio_context=None):
496
+ def __init__(
497
+ self,
498
+ level="info", # Unused argument
499
+ logger=None,
500
+ server=None,
501
+ nuclio_context: Optional[NuclioContext] = None,
502
+ ) -> None:
485
503
  self.state = None
486
504
  self.logger = logger
487
505
  self.worker_id = 0
@@ -491,7 +509,7 @@ class GraphContext:
491
509
  self.root = None
492
510
 
493
511
  if nuclio_context:
494
- self.logger = nuclio_context.logger
512
+ self.logger: NuclioLogger = nuclio_context.logger
495
513
  self.Response = nuclio_context.Response
496
514
  if hasattr(nuclio_context, "trigger") and hasattr(
497
515
  nuclio_context.trigger, "kind"
@@ -501,7 +519,7 @@ class GraphContext:
501
519
  if hasattr(nuclio_context, "platform"):
502
520
  self.platform = nuclio_context.platform
503
521
  elif not logger:
504
- self.logger = mlrun.utils.helpers.logger
522
+ self.logger: mlrun.utils.Logger = mlrun.utils.logger
505
523
 
506
524
  self._server = server
507
525
  self.current_function = None
@@ -514,7 +532,7 @@ class GraphContext:
514
532
  return self._server
515
533
 
516
534
  @property
517
- def project(self):
535
+ def project(self) -> str:
518
536
  """current project name (for the current function)"""
519
537
  project, _, _, _ = mlrun.common.helpers.parse_versioned_object_uri(
520
538
  self._server.function_uri