mlrun 1.10.0rc10__py3-none-any.whl → 1.10.0rc11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.
Files changed (51)
  1. mlrun/artifacts/manager.py +1 -1
  2. mlrun/common/constants.py +11 -0
  3. mlrun/common/schemas/model_monitoring/__init__.py +2 -0
  4. mlrun/common/schemas/model_monitoring/functions.py +2 -0
  5. mlrun/common/schemas/model_monitoring/model_endpoints.py +19 -1
  6. mlrun/common/schemas/serving.py +1 -0
  7. mlrun/common/schemas/workflow.py +3 -2
  8. mlrun/datastore/azure_blob.py +1 -1
  9. mlrun/datastore/base.py +4 -2
  10. mlrun/datastore/datastore.py +46 -14
  11. mlrun/datastore/google_cloud_storage.py +1 -1
  12. mlrun/datastore/s3.py +16 -5
  13. mlrun/datastore/sources.py +2 -2
  14. mlrun/datastore/targets.py +2 -2
  15. mlrun/db/__init__.py +0 -1
  16. mlrun/db/base.py +12 -0
  17. mlrun/db/httpdb.py +35 -0
  18. mlrun/db/nopdb.py +10 -0
  19. mlrun/execution.py +12 -0
  20. mlrun/frameworks/tf_keras/mlrun_interface.py +7 -18
  21. mlrun/launcher/base.py +1 -0
  22. mlrun/launcher/client.py +1 -0
  23. mlrun/launcher/local.py +4 -0
  24. mlrun/model.py +15 -4
  25. mlrun/model_monitoring/applications/base.py +74 -56
  26. mlrun/model_monitoring/db/tsdb/base.py +52 -19
  27. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +179 -11
  28. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +26 -11
  29. mlrun/model_monitoring/helpers.py +48 -0
  30. mlrun/projects/pipelines.py +12 -3
  31. mlrun/projects/project.py +30 -0
  32. mlrun/runtimes/daskjob.py +2 -0
  33. mlrun/runtimes/kubejob.py +4 -0
  34. mlrun/runtimes/mpijob/abstract.py +2 -0
  35. mlrun/runtimes/mpijob/v1.py +2 -0
  36. mlrun/runtimes/nuclio/function.py +2 -0
  37. mlrun/runtimes/nuclio/serving.py +59 -0
  38. mlrun/runtimes/pod.py +3 -0
  39. mlrun/runtimes/remotesparkjob.py +2 -0
  40. mlrun/runtimes/sparkjob/spark3job.py +2 -0
  41. mlrun/serving/server.py +97 -3
  42. mlrun/serving/states.py +146 -38
  43. mlrun/utils/version/version.json +2 -2
  44. {mlrun-1.10.0rc10.dist-info → mlrun-1.10.0rc11.dist-info}/METADATA +13 -6
  45. {mlrun-1.10.0rc10.dist-info → mlrun-1.10.0rc11.dist-info}/RECORD +49 -51
  46. mlrun/db/sql_types.py +0 -160
  47. mlrun/utils/db.py +0 -71
  48. {mlrun-1.10.0rc10.dist-info → mlrun-1.10.0rc11.dist-info}/WHEEL +0 -0
  49. {mlrun-1.10.0rc10.dist-info → mlrun-1.10.0rc11.dist-info}/entry_points.txt +0 -0
  50. {mlrun-1.10.0rc10.dist-info → mlrun-1.10.0rc11.dist-info}/licenses/LICENSE +0 -0
  51. {mlrun-1.10.0rc10.dist-info → mlrun-1.10.0rc11.dist-info}/top_level.txt +0 -0
mlrun/projects/pipelines.py CHANGED
@@ -21,6 +21,7 @@ import typing
  import uuid

  import mlrun
+ import mlrun.common.constants as mlrun_constants
  import mlrun.common.runtimes.constants
  import mlrun.common.schemas
  import mlrun.common.schemas.function
@@ -1086,10 +1087,16 @@ def rerun_workflow(

      # Retry the pipeline - TODO: add submit-direct flag when created
      db = mlrun.get_run_db()
-     new_pipeline_id = db.retry_pipeline(run_uid, project_name)
+     new_pipeline_id = db.retry_pipeline(
+         run_uid, project_name, submit_mode=mlrun_constants.WorkflowSubmitMode.direct
+     )

      # Store result for observability
-     context.set_label("workflow-id", new_pipeline_id)
+     context.set_label(
+         mlrun_constants.MLRunInternalLabels.workflow_id, new_pipeline_id
+     )
+     context.update_run()
+
      context.log_result("workflow_id", new_pipeline_id)

      # wait for pipeline completion so monitor will push terminal notifications
@@ -1226,7 +1233,9 @@ def load_and_run_workflow(
      context.logger.info(
          "Associating workflow-runner with workflow ID", run_id=run.run_id
      )
-     context.set_label("workflow-id", run.run_id)
+     context.set_label(mlrun_constants.MLRunInternalLabels.workflow_id, run.run_id)
+     context.update_run()
+
      context.log_result(key="workflow_id", value=run.run_id)
      context.log_result(key="engine", value=run._engine.engine, commit=True)

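Note: the workflow runner now records the workflow id under the internal label constant (previously the hard-coded "workflow-id" string) and persists it immediately via context.update_run(). A minimal sketch of filtering runs by that label (project name and pipeline id are hypothetical):

import mlrun
import mlrun.common.constants as mlrun_constants

project = mlrun.get_or_create_project("my-project")
runs = project.list_runs(
    labels=f"{mlrun_constants.MLRunInternalLabels.workflow_id}=<pipeline-id>"
)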
mlrun/projects/project.py CHANGED
@@ -4968,6 +4968,36 @@ class MlrunProject(ModelObj):
          include_infra=include_infra,
      )

+     def get_monitoring_function_summary(
+         self,
+         name: str,
+         start: Optional[datetime.datetime] = None,
+         end: Optional[datetime.datetime] = None,
+         include_latest_metrics: bool = False,
+     ) -> mlrun.common.schemas.model_monitoring.FunctionSummary:
+         """Get a monitoring function summary for the specified project and function name.
+         :param name: Name of the monitoring function to retrieve the summary for.
+         :param start: Start time for filtering the results (optional).
+         :param end: End time for filtering the results (optional).
+         :param include_latest_metrics: Whether to include the latest metrics in the response (default is False).
+
+         :return: A FunctionSummary object containing information about the monitoring function.
+         """
+         if start is not None and end is not None:
+             if start.tzinfo is None or end.tzinfo is None:
+                 raise mlrun.errors.MLRunInvalidArgumentTypeError(
+                     "Custom start and end times must contain the timezone."
+                 )
+
+         db = mlrun.db.get_run_db(secrets=self._secrets)
+         return db.get_monitoring_function_summary(
+             project=self.metadata.name,
+             function_name=name,
+             start=start,
+             end=end,
+             include_latest_metrics=include_latest_metrics,
+         )
+
      def list_runs(
          self,
          name: Optional[str] = None,
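A minimal usage sketch for the new method (project and function names are hypothetical); note that custom start/end times must be timezone-aware, otherwise MLRunInvalidArgumentTypeError is raised:

import datetime

import mlrun

project = mlrun.get_or_create_project("my-project")
summary = project.get_monitoring_function_summary(
    name="my-monitoring-app",
    start=datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=1),
    end=datetime.datetime.now(datetime.timezone.utc),
    include_latest_metrics=True,
)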
mlrun/runtimes/daskjob.py CHANGED
@@ -92,6 +92,7 @@ class DaskSpec(KubeResourceSpec):
          preemption_mode=None,
          security_context=None,
          state_thresholds=None,
+         serving_spec=None,
      ):
          super().__init__(
              command=command,
@@ -121,6 +122,7 @@ class DaskSpec(KubeResourceSpec):
              preemption_mode=preemption_mode,
              security_context=security_context,
              state_thresholds=state_thresholds,
+             serving_spec=serving_spec,
          )
          self.args = args

mlrun/runtimes/kubejob.py CHANGED
@@ -207,3 +207,7 @@ class KubejobRuntime(KubeResource):
          raise NotImplementedError(
              f"Running a {self.kind} function from the client is not supported. Use .run() to submit the job to the API."
          )
+
+     @property
+     def serving_spec(self):
+         return self.spec.serving_spec
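The new serving_spec field is threaded through every KubeResourceSpec-derived runtime spec below, so a serialized serving graph can ride along on a batch job. A rough sketch (image and spec values are illustrative, and constructing the runtime directly like this is for demonstration only):

from mlrun.runtimes.kubejob import KubejobRuntime
from mlrun.runtimes.pod import KubeResourceSpec

spec = KubeResourceSpec(image="mlrun/mlrun", serving_spec='{"version": "v2"}')
job = KubejobRuntime(spec=spec)
assert job.serving_spec == '{"version": "v2"}'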
mlrun/runtimes/mpijob/abstract.py CHANGED
@@ -54,6 +54,7 @@ class MPIResourceSpec(KubeResourceSpec):
          preemption_mode=None,
          security_context=None,
          state_thresholds=None,
+         serving_spec=None,
      ):
          super().__init__(
              command=command,
@@ -83,6 +84,7 @@ class MPIResourceSpec(KubeResourceSpec):
              preemption_mode=preemption_mode,
              security_context=security_context,
              state_thresholds=state_thresholds,
+             serving_spec=serving_spec,
          )
          self.mpi_args = mpi_args or [
              "-x",
mlrun/runtimes/mpijob/v1.py CHANGED
@@ -49,6 +49,7 @@ class MPIV1ResourceSpec(MPIResourceSpec):
          preemption_mode=None,
          security_context=None,
          state_thresholds=None,
+         serving_spec=None,
      ):
          super().__init__(
              command=command,
@@ -79,6 +80,7 @@ class MPIV1ResourceSpec(MPIResourceSpec):
              preemption_mode=preemption_mode,
              security_context=security_context,
              state_thresholds=state_thresholds,
+             serving_spec=serving_spec,
          )
          self.clean_pod_policy = clean_pod_policy or MPIJobV1CleanPodPolicies.default()

mlrun/runtimes/nuclio/function.py CHANGED
@@ -154,6 +154,7 @@ class NuclioSpec(KubeResourceSpec):
          add_templated_ingress_host_mode=None,
          state_thresholds=None,
          disable_default_http_trigger=None,
+         serving_spec=None,
      ):
          super().__init__(
              command=command,
@@ -183,6 +184,7 @@ class NuclioSpec(KubeResourceSpec):
              preemption_mode=preemption_mode,
              security_context=security_context,
              state_thresholds=state_thresholds,
+             serving_spec=serving_spec,
          )

          self.base_spec = base_spec or {}
mlrun/runtimes/nuclio/serving.py CHANGED
@@ -42,6 +42,8 @@ from mlrun.serving.states import (
  )
  from mlrun.utils import get_caller_globals, logger, set_paths

+ from .. import KubejobRuntime
+ from ..pod import KubeResourceSpec
  from .function import NuclioSpec, RemoteRuntime, min_nuclio_versions

  serving_subkind = "serving_v2"
@@ -149,6 +151,7 @@ class ServingSpec(NuclioSpec):
          state_thresholds=None,
          disable_default_http_trigger=None,
          model_endpoint_creation_task_name=None,
+         serving_spec=None,
      ):
          super().__init__(
              command=command,
@@ -189,6 +192,7 @@ class ServingSpec(NuclioSpec):
              service_type=service_type,
              add_templated_ingress_host_mode=add_templated_ingress_host_mode,
              disable_default_http_trigger=disable_default_http_trigger,
+             serving_spec=serving_spec,
          )

          self.models = models or {}
@@ -296,6 +300,7 @@ class ServingRuntime(RemoteRuntime):
              self.spec.graph = step
          elif topology == StepKinds.flow:
              self.spec.graph = RootFlowStep(engine=engine or "async")
+             self.spec.graph.track_models = self.spec.track_models
          else:
              raise mlrun.errors.MLRunInvalidArgumentError(
                  f"unsupported topology {topology}, use 'router' or 'flow'"
@@ -331,6 +336,8 @@
          """
          # Applying model monitoring configurations
          self.spec.track_models = enable_tracking
+         if self.spec.graph and isinstance(self.spec.graph, RootFlowStep):
+             self.spec.graph.track_models = enable_tracking
          if self._spec and self._spec.function_refs:
              logger.debug(
                  "Set tracking for children references", enable_tracking=enable_tracking
@@ -343,6 +350,16 @@
                      name
                  ]._function.spec.track_models = enable_tracking

+                 if self._spec.function_refs[
+                     name
+                 ]._function.spec.graph and isinstance(
+                     self._spec.function_refs[name]._function.spec.graph,
+                     RootFlowStep,
+                 ):
+                     self._spec.function_refs[
+                         name
+                     ]._function.spec.graph.track_models = enable_tracking
+
          if not 0 < sampling_percentage <= 100:
              raise mlrun.errors.MLRunInvalidArgumentError(
                  "`sampling_percentage` must be greater than 0 and less or equal to 100."
@@ -703,6 +720,7 @@
              "track_models": self.spec.track_models,
              "default_content_type": self.spec.default_content_type,
              "model_endpoint_creation_task_name": self.spec.model_endpoint_creation_task_name,
+             "filename": getattr(self.spec, "filename", None),
          }

          if self.spec.secret_sources:
@@ -711,6 +729,10 @@

          return json.dumps(serving_spec)

+     @property
+     def serving_spec(self):
+         return self._get_serving_spec()
+
      def to_mock_server(
          self,
          namespace=None,
@@ -815,3 +837,40 @@
                  "Turn off the mock (mock=False) and make sure Nuclio is installed for real deployment to Nuclio"
              )
          self._mock_server = self.to_mock_server()
+
+     def to_job(self) -> KubejobRuntime:
+         """Convert this ServingRuntime to a KubejobRuntime, so that the graph can be run as a standalone job."""
+         if self.spec.function_refs:
+             raise mlrun.errors.MLRunInvalidArgumentError(
+                 f"Cannot convert function '{self.metadata.name}' to a job because it has child functions"
+             )
+
+         spec = KubeResourceSpec(
+             image=self.spec.image,
+             mode=self.spec.mode,
+             volumes=self.spec.volumes,
+             volume_mounts=self.spec.volume_mounts,
+             env=self.spec.env,
+             resources=self.spec.resources,
+             default_handler="mlrun.serving.server.execute_graph",
+             pythonpath=self.spec.pythonpath,
+             entry_points=self.spec.entry_points,
+             description=self.spec.description,
+             workdir=self.spec.workdir,
+             image_pull_secret=self.spec.image_pull_secret,
+             node_name=self.spec.node_name,
+             node_selector=self.spec.node_selector,
+             affinity=self.spec.affinity,
+             disable_auto_mount=self.spec.disable_auto_mount,
+             priority_class_name=self.spec.priority_class_name,
+             tolerations=self.spec.tolerations,
+             preemption_mode=self.spec.preemption_mode,
+             security_context=self.spec.security_context,
+             state_thresholds=self.spec.state_thresholds,
+             serving_spec=self._get_serving_spec(),
+         )
+         job = KubejobRuntime(
+             spec=spec,
+             metadata=self.metadata,
+         )
+         return job
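A usage sketch for the new conversion, assuming fn is a ServingRuntime as in the sketch above (input path and parameters are hypothetical). The resulting job's default handler is mlrun.serving.server.execute_graph, which accepts the data input plus the batching/batch_size params:

job = fn.to_job()  # raises if fn has child functions
run = job.run(
    inputs={"data": "s3://my-bucket/inputs.parquet"},
    params={"batching": True, "batch_size": 100},
)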
mlrun/runtimes/pod.py CHANGED
@@ -103,6 +103,7 @@ class KubeResourceSpec(FunctionSpec):
          "preemption_mode",
          "security_context",
          "state_thresholds",
+         "serving_spec",
      ]
      _default_fields_to_strip = FunctionSpec._default_fields_to_strip + [
          "volumes",
@@ -178,6 +179,7 @@ class KubeResourceSpec(FunctionSpec):
          preemption_mode=None,
          security_context=None,
          state_thresholds=None,
+         serving_spec=None,
      ):
          super().__init__(
              command=command,
@@ -223,6 +225,7 @@ class KubeResourceSpec(FunctionSpec):
              state_thresholds
              or mlrun.mlconf.function.spec.state_thresholds.default.to_dict()
          )
+         self.serving_spec = serving_spec
          # Termination grace period is internal for runtimes that have a pod termination hook hence it is not in the
          # _dict_fields and doesn't have a setter.
          self._termination_grace_period_seconds = None
mlrun/runtimes/remotesparkjob.py CHANGED
@@ -58,6 +58,7 @@ class RemoteSparkSpec(KubeResourceSpec):
          preemption_mode=None,
          security_context=None,
          state_thresholds=None,
+         serving_spec=None,
      ):
          super().__init__(
              command=command,
@@ -87,6 +88,7 @@
              preemption_mode=preemption_mode,
              security_context=security_context,
              state_thresholds=state_thresholds,
+             serving_spec=serving_spec,
          )
          self.provider = provider

mlrun/runtimes/sparkjob/spark3job.py CHANGED
@@ -168,6 +168,7 @@ class Spark3JobSpec(KubeResourceSpec):
          executor_cores=None,
          security_context=None,
          state_thresholds=None,
+         serving_spec=None,
      ):
          super().__init__(
              command=command,
@@ -197,6 +198,7 @@
              preemption_mode=preemption_mode,
              security_context=security_context,
              state_thresholds=state_thresholds,
+             serving_spec=serving_spec,
          )

          self.driver_resources = driver_resources or {}
mlrun/serving/server.py CHANGED
@@ -21,8 +21,9 @@ import os
  import socket
  import traceback
  import uuid
- from typing import Optional, Union
+ from typing import Any, Optional, Union

+ import storey
  from nuclio import Context as NuclioContext
  from nuclio.request import Logger as NuclioLogger

@@ -38,9 +39,10 @@ from mlrun.secrets import SecretsStore

  from ..common.helpers import parse_versioned_object_uri
  from ..common.schemas.model_monitoring.constants import FileTargetKind
- from ..datastore import get_stream_pusher
+ from ..datastore import DataItem, get_stream_pusher
  from ..datastore.store_resources import ResourceCache
  from ..errors import MLRunInvalidArgumentError
+ from ..execution import MLClientCtx
  from ..model import ModelObj
  from ..utils import get_caller_globals
  from .states import (
@@ -322,7 +324,11 @@ class GraphServer(ModelObj):

      def _process_response(self, context, response, get_body):
          body = response.body
-         if isinstance(body, context.Response) or get_body:
+         if (
+             isinstance(context, MLClientCtx)
+             or isinstance(body, context.Response)
+             or get_body
+         ):
              return body

          if body and not isinstance(body, (str, bytes)):
@@ -535,6 +541,94 @@ def v2_serving_init(context, namespace=None):
      _set_callbacks(server, context)


+ async def async_execute_graph(
+     context: MLClientCtx,
+     data: DataItem,
+     batching: bool,
+     batch_size: Optional[int],
+ ) -> list[Any]:
+     spec = mlrun.utils.get_serving_spec()
+
+     source_filename = spec.get("filename", None)
+     namespace = {}
+     if source_filename:
+         with open(source_filename) as f:
+             exec(f.read(), namespace)
+
+     server = GraphServer.from_dict(spec)
+
+     if config.log_level.lower() == "debug":
+         server.verbose = True
+     context.logger.info_with("Initializing states", namespace=namespace)
+     kwargs = {}
+     if hasattr(context, "is_mock"):
+         kwargs["is_mock"] = context.is_mock
+     server.init_states(
+         context=None,  # this context is expected to be a nuclio context, which we don't have in this flow
+         namespace=namespace,
+         **kwargs,
+     )
+     context.logger.info("Initializing graph steps")
+     server.init_object(namespace)
+
+     context.logger.info_with("Graph was initialized", verbose=server.verbose)
+
+     if server.verbose:
+         context.logger.info(server.to_yaml())
+
+     df = data.as_df()
+
+     responses = []
+
+     async def run(body):
+         event = storey.Event(id=index, body=body)
+         response = await server.run(event, context)
+         responses.append(response)
+
+     if batching and not batch_size:
+         batch_size = len(df)
+
+     batch = []
+     for index, row in df.iterrows():
+         data = row.to_dict()
+         if batching:
+             batch.append(data)
+             if len(batch) == batch_size:
+                 await run(batch)
+                 batch = []
+         else:
+             await run(data)
+
+     if batch:
+         await run(batch)
+
+     termination_result = server.wait_for_completion()
+     if asyncio.iscoroutine(termination_result):
+         await termination_result
+
+     return responses
+
+
+ def execute_graph(
+     context: MLClientCtx,
+     data: DataItem,
+     batching: bool = False,
+     batch_size: Optional[int] = None,
+ ) -> (list[Any], Any):
+     """
+     Execute graph as a job, from start to finish.
+
+     :param context: The job's execution client context.
+     :param data: The input data to the job, to be pushed into the graph row by row, or in batches.
+     :param batching: Whether to push one or more batches into the graph rather than row by row.
+     :param batch_size: The number of rows to push per batch. If not set, and batching=True, the entire dataset will
+                        be pushed into the graph in one batch.
+
+     :return: A list of responses.
+     """
+     return asyncio.run(async_execute_graph(context, data, batching, batch_size))
+
+
  def _set_callbacks(server, context):
      if not server.graph.supports_termination() or not hasattr(context, "platform"):
          return
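The batching arguments above determine how many events the graph receives per dataset. A standalone sketch of that partitioning logic, mirroring the loop in async_execute_graph (illustrative only):

from typing import Any, Optional

def partition(rows: list[Any], batching: bool = False, batch_size: Optional[int] = None) -> list[Any]:
    # batching with no batch_size means one batch containing the whole dataset
    if batching and not batch_size:
        batch_size = len(rows)
    events, batch = [], []
    for row in rows:
        if batching:
            batch.append(row)
            if len(batch) == batch_size:
                events.append(batch)
                batch = []
        else:
            events.append(row)
    if batch:
        events.append(batch)  # flush the trailing partial batch
    return events

assert partition([1, 2, 3, 4, 5], batching=True, batch_size=2) == [[1, 2], [3, 4], [5]]
assert partition([1, 2, 3], batching=True) == [[1, 2, 3]]
assert partition([1, 2]) == [1, 2]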