mlrun 1.7.0rc37__py3-none-any.whl → 1.7.0rc39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (52) hide show
  1. mlrun/alerts/alert.py +34 -30
  2. mlrun/common/schemas/alert.py +3 -0
  3. mlrun/common/schemas/model_monitoring/constants.py +4 -0
  4. mlrun/common/schemas/notification.py +4 -3
  5. mlrun/datastore/alibaba_oss.py +2 -2
  6. mlrun/datastore/azure_blob.py +124 -31
  7. mlrun/datastore/base.py +1 -1
  8. mlrun/datastore/dbfs_store.py +2 -2
  9. mlrun/datastore/google_cloud_storage.py +83 -20
  10. mlrun/datastore/s3.py +2 -2
  11. mlrun/datastore/sources.py +54 -0
  12. mlrun/datastore/targets.py +9 -53
  13. mlrun/db/httpdb.py +6 -1
  14. mlrun/errors.py +8 -0
  15. mlrun/execution.py +7 -0
  16. mlrun/feature_store/api.py +5 -0
  17. mlrun/feature_store/common.py +6 -11
  18. mlrun/feature_store/retrieval/job.py +1 -0
  19. mlrun/model.py +29 -3
  20. mlrun/model_monitoring/api.py +9 -0
  21. mlrun/model_monitoring/applications/_application_steps.py +36 -0
  22. mlrun/model_monitoring/applications/histogram_data_drift.py +15 -13
  23. mlrun/model_monitoring/controller.py +15 -11
  24. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +14 -11
  25. mlrun/model_monitoring/db/tsdb/base.py +121 -1
  26. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +85 -47
  27. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +100 -12
  28. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +23 -1
  29. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +214 -36
  30. mlrun/model_monitoring/helpers.py +16 -17
  31. mlrun/model_monitoring/stream_processing.py +68 -27
  32. mlrun/projects/operations.py +1 -1
  33. mlrun/projects/pipelines.py +19 -30
  34. mlrun/projects/project.py +76 -52
  35. mlrun/run.py +8 -6
  36. mlrun/runtimes/__init__.py +19 -8
  37. mlrun/runtimes/nuclio/api_gateway.py +9 -0
  38. mlrun/runtimes/nuclio/application/application.py +64 -9
  39. mlrun/runtimes/nuclio/function.py +1 -1
  40. mlrun/runtimes/pod.py +2 -2
  41. mlrun/runtimes/remotesparkjob.py +2 -5
  42. mlrun/runtimes/sparkjob/spark3job.py +7 -9
  43. mlrun/serving/v2_serving.py +1 -0
  44. mlrun/track/trackers/mlflow_tracker.py +5 -0
  45. mlrun/utils/helpers.py +21 -0
  46. mlrun/utils/version/version.json +2 -2
  47. {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/METADATA +14 -11
  48. {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/RECORD +52 -52
  49. {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/WHEEL +1 -1
  50. {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/LICENSE +0 -0
  51. {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/entry_points.txt +0 -0
  52. {mlrun-1.7.0rc37.dist-info → mlrun-1.7.0rc39.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from datetime import datetime
15
+ from datetime import datetime, timezone
16
16
  from io import StringIO
17
17
  from typing import Literal, Optional, Union
18
18
 
@@ -33,7 +33,7 @@ _TSDB_RATE = "1/s"
33
33
  _CONTAINER = "users"
34
34
 
35
35
 
36
- def _is_no_schema_error(exc: v3io_frames.ReadError) -> bool:
36
+ def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
37
37
  """
38
38
  In case of a nonexistent TSDB table - a `v3io_frames.ReadError` error is raised.
39
39
  Check if the error message contains the relevant string to verify the cause.
@@ -89,6 +89,19 @@ class V3IOTSDBConnector(TSDBConnector):
89
89
  )
90
90
  self.tables[mm_schemas.V3IOTSDBTables.EVENTS] = events_path
91
91
 
92
+ errors_table_full_path = mlrun.mlconf.get_model_monitoring_file_target_path(
93
+ project=self.project,
94
+ kind=mm_schemas.FileTargetKind.ERRORS,
95
+ )
96
+ (
97
+ _,
98
+ _,
99
+ errors_path,
100
+ ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
101
+ errors_table_full_path
102
+ )
103
+ self.tables[mm_schemas.V3IOTSDBTables.ERRORS] = errors_path
104
+
92
105
  monitoring_application_full_path = (
93
106
  mlrun.mlconf.get_model_monitoring_file_target_path(
94
107
  project=self.project,
@@ -160,7 +173,6 @@ class V3IOTSDBConnector(TSDBConnector):
160
173
  - endpoint_features (Prediction and feature names and values)
161
174
  - custom_metrics (user-defined metrics)
162
175
  """
163
-
164
176
  # Write latency per prediction, labeled by endpoint ID only
165
177
  graph.add_step(
166
178
  "storey.TSDBTarget",
@@ -171,7 +183,10 @@ class V3IOTSDBConnector(TSDBConnector):
171
183
  time_col=mm_schemas.EventFieldType.TIMESTAMP,
172
184
  container=self.container,
173
185
  v3io_frames=self.v3io_framesd,
174
- columns=[mm_schemas.EventFieldType.LATENCY],
186
+ columns=[
187
+ mm_schemas.EventFieldType.LATENCY,
188
+ mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
189
+ ],
175
190
  index_cols=[
176
191
  mm_schemas.EventFieldType.ENDPOINT_ID,
177
192
  ],
@@ -255,6 +270,40 @@ class V3IOTSDBConnector(TSDBConnector):
255
270
  apply_storey_filter()
256
271
  apply_tsdb_target(name="tsdb3", after="FilterNotNone")
257
272
 
273
+ def handle_model_error(
274
+ self,
275
+ graph,
276
+ tsdb_batching_max_events: int = 10,
277
+ tsdb_batching_timeout_secs: int = 60,
278
+ **kwargs,
279
+ ) -> None:
280
+ graph.add_step(
281
+ "mlrun.model_monitoring.db.tsdb.v3io.stream_graph_steps.ErrorExtractor",
282
+ name="error_extractor",
283
+ after="ForwardError",
284
+ )
285
+
286
+ graph.add_step(
287
+ "storey.TSDBTarget",
288
+ name="tsdb_error",
289
+ after="error_extractor",
290
+ path=f"{self.container}/{self.tables[mm_schemas.FileTargetKind.ERRORS]}",
291
+ rate="1/s",
292
+ time_col=mm_schemas.EventFieldType.TIMESTAMP,
293
+ container=self.container,
294
+ v3io_frames=self.v3io_framesd,
295
+ columns=[
296
+ mm_schemas.EventFieldType.MODEL_ERROR,
297
+ mm_schemas.EventFieldType.ERROR_COUNT,
298
+ ],
299
+ index_cols=[
300
+ mm_schemas.EventFieldType.ENDPOINT_ID,
301
+ ],
302
+ max_events=tsdb_batching_max_events,
303
+ flush_after_seconds=tsdb_batching_timeout_secs,
304
+ key=mm_schemas.EventFieldType.ENDPOINT_ID,
305
+ )
306
+
258
307
  def write_application_event(
259
308
  self,
260
309
  event: dict,
@@ -277,7 +326,9 @@ class V3IOTSDBConnector(TSDBConnector):
277
326
  elif kind == mm_schemas.WriterEventKind.RESULT:
278
327
  table = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
279
328
  index_cols = index_cols_base + [mm_schemas.ResultData.RESULT_NAME]
280
- del event[mm_schemas.ResultData.RESULT_EXTRA_DATA]
329
+ event.pop(mm_schemas.ResultData.CURRENT_STATS, None)
330
+ # TODO: remove this when extra data is supported (ML-7460)
331
+ event.pop(mm_schemas.ResultData.RESULT_EXTRA_DATA, None)
281
332
  else:
282
333
  raise ValueError(f"Invalid {kind = }")
283
334
 
@@ -437,7 +488,7 @@ class V3IOTSDBConnector(TSDBConnector):
437
488
  step=sliding_window_step,
438
489
  **kwargs,
439
490
  )
440
- except v3io_frames.ReadError as err:
491
+ except v3io_frames.Error as err:
441
492
  if _is_no_schema_error(err):
442
493
  return pd.DataFrame()
443
494
  else:
@@ -504,10 +555,16 @@ class V3IOTSDBConnector(TSDBConnector):
504
555
  if type == "metrics":
505
556
  table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
506
557
  name = mm_schemas.MetricData.METRIC_NAME
558
+ columns = [mm_schemas.MetricData.METRIC_VALUE]
507
559
  df_handler = self.df_to_metrics_values
508
560
  elif type == "results":
509
561
  table_path = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
510
562
  name = mm_schemas.ResultData.RESULT_NAME
563
+ columns = [
564
+ mm_schemas.ResultData.RESULT_VALUE,
565
+ mm_schemas.ResultData.RESULT_STATUS,
566
+ mm_schemas.ResultData.RESULT_KIND,
567
+ ]
511
568
  df_handler = self.df_to_results_values
512
569
  else:
513
570
  raise ValueError(f"Invalid {type = }")
@@ -517,6 +574,7 @@ class V3IOTSDBConnector(TSDBConnector):
517
574
  metric_and_app_names=[(metric.app, metric.name) for metric in metrics],
518
575
  table_path=table_path,
519
576
  name=name,
577
+ columns=columns,
520
578
  )
521
579
 
522
580
  logger.debug("Querying V3IO TSDB", query=query)
@@ -627,33 +685,153 @@ class V3IOTSDBConnector(TSDBConnector):
627
685
  ), # pyright: ignore[reportArgumentType]
628
686
  )
629
687
 
630
- # Note: this function serves as a reference for checking the TSDB for the existence of a metric.
631
- #
632
- # def read_prediction_metric_for_endpoint_if_exists(
633
- # self, endpoint_id: str
634
- # ) -> Optional[mm_schemas.ModelEndpointMonitoringMetric]:
635
- # """
636
- # Read the count of the latency column in the predictions table for the given endpoint_id.
637
- # We just want to check if there is any data for this endpoint_id.
638
- # """
639
- # query = self._get_sql_query(
640
- # endpoint_id=endpoint_id,
641
- # table_path=self.tables[mm_schemas.FileTargetKind.PREDICTIONS],
642
- # columns=[f"count({mm_schemas.EventFieldType.LATENCY})"],
643
- # )
644
- # try:
645
- # logger.debug("Checking TSDB", project=self.project, query=query)
646
- # df: pd.DataFrame = self._frames_client.read(
647
- # backend=_TSDB_BE, query=query, start="0", end="now"
648
- # )
649
- # except v3io_frames.ReadError as err:
650
- # if _is_no_schema_error(err):
651
- # logger.debug(
652
- # "No predictions yet", project=self.project, endpoint_id=endpoint_id
653
- # )
654
- # return
655
- # else:
656
- # raise
657
- #
658
- # if not df.empty:
659
- # return get_invocations_metric(self.project)
688
+ def get_last_request(
689
+ self,
690
+ endpoint_ids: Union[str, list[str]],
691
+ start: Union[datetime, str] = "0",
692
+ end: Union[datetime, str] = "now",
693
+ ) -> pd.DataFrame:
694
+ endpoint_ids = (
695
+ endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
696
+ )
697
+ df = self._get_records(
698
+ table=mm_schemas.FileTargetKind.PREDICTIONS,
699
+ start=start,
700
+ end=end,
701
+ filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
702
+ agg_funcs=["last"],
703
+ )
704
+ if not df.empty:
705
+ df.rename(
706
+ columns={
707
+ f"last({mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP})": mm_schemas.EventFieldType.LAST_REQUEST,
708
+ f"last({mm_schemas.EventFieldType.LATENCY})": f"last_{mm_schemas.EventFieldType.LATENCY}",
709
+ },
710
+ inplace=True,
711
+ )
712
+ df[mm_schemas.EventFieldType.LAST_REQUEST] = df[
713
+ mm_schemas.EventFieldType.LAST_REQUEST
714
+ ].map(
715
+ lambda last_request: datetime.fromtimestamp(
716
+ last_request, tz=timezone.utc
717
+ )
718
+ )
719
+
720
+ return df.reset_index(drop=True)
721
+
722
+ def get_drift_status(
723
+ self,
724
+ endpoint_ids: Union[str, list[str]],
725
+ start: Union[datetime, str] = "now-24h",
726
+ end: Union[datetime, str] = "now",
727
+ ) -> pd.DataFrame:
728
+ endpoint_ids = (
729
+ endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
730
+ )
731
+ df = self._get_records(
732
+ table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
733
+ start=start,
734
+ end=end,
735
+ columns=[mm_schemas.ResultData.RESULT_STATUS],
736
+ filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
737
+ agg_funcs=["max"],
738
+ group_by="endpoint_id",
739
+ )
740
+ if not df.empty:
741
+ df.columns = [
742
+ col[len("max(") : -1] if "max(" in col else col for col in df.columns
743
+ ]
744
+ return df.reset_index(drop=True)
745
+
746
+ def get_metrics_metadata(
747
+ self,
748
+ endpoint_id: str,
749
+ start: Union[datetime, str] = "0",
750
+ end: Union[datetime, str] = "now",
751
+ ) -> pd.DataFrame:
752
+ df = self._get_records(
753
+ table=mm_schemas.V3IOTSDBTables.METRICS,
754
+ start=start,
755
+ end=end,
756
+ columns=[mm_schemas.MetricData.METRIC_VALUE],
757
+ filter_query=f"endpoint_id=='{endpoint_id}'",
758
+ agg_funcs=["last"],
759
+ )
760
+ if not df.empty:
761
+ df.drop(
762
+ columns=[f"last({mm_schemas.MetricData.METRIC_VALUE})"], inplace=True
763
+ )
764
+ return df.reset_index(drop=True)
765
+
766
+ def get_results_metadata(
767
+ self,
768
+ endpoint_id: str,
769
+ start: Union[datetime, str] = "0",
770
+ end: Union[datetime, str] = "now",
771
+ ) -> pd.DataFrame:
772
+ df = self._get_records(
773
+ table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
774
+ start=start,
775
+ end=end,
776
+ columns=[
777
+ mm_schemas.ResultData.RESULT_KIND,
778
+ ],
779
+ filter_query=f"endpoint_id=='{endpoint_id}'",
780
+ agg_funcs=["last"],
781
+ )
782
+ if not df.empty:
783
+ df.rename(
784
+ columns={
785
+ f"last({mm_schemas.ResultData.RESULT_KIND})": mm_schemas.ResultData.RESULT_KIND
786
+ },
787
+ inplace=True,
788
+ )
789
+ return df.reset_index(drop=True)
790
+
791
+ def get_error_count(
792
+ self,
793
+ endpoint_ids: Union[str, list[str]],
794
+ start: Union[datetime, str] = "0",
795
+ end: Union[datetime, str] = "now",
796
+ ) -> pd.DataFrame:
797
+ endpoint_ids = (
798
+ endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
799
+ )
800
+ df = self._get_records(
801
+ table=mm_schemas.FileTargetKind.ERRORS,
802
+ start=start,
803
+ end=end,
804
+ columns=[mm_schemas.EventFieldType.ERROR_COUNT],
805
+ filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
806
+ agg_funcs=["count"],
807
+ )
808
+ if not df.empty:
809
+ df.rename(
810
+ columns={
811
+ f"count({mm_schemas.EventFieldType.ERROR_COUNT})": mm_schemas.EventFieldType.ERROR_COUNT
812
+ },
813
+ inplace=True,
814
+ )
815
+ df.dropna(inplace=True)
816
+ return df.reset_index(drop=True)
817
+
818
+ def get_avg_latency(
819
+ self,
820
+ endpoint_ids: Union[str, list[str]],
821
+ start: Union[datetime, str] = "0",
822
+ end: Union[datetime, str] = "now",
823
+ ) -> pd.DataFrame:
824
+ endpoint_ids = (
825
+ endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
826
+ )
827
+ df = self._get_records(
828
+ table=mm_schemas.FileTargetKind.PREDICTIONS,
829
+ start=start,
830
+ end=end,
831
+ columns=[mm_schemas.EventFieldType.LATENCY],
832
+ filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
833
+ agg_funcs=["avg"],
834
+ )
835
+ if not df.empty:
836
+ df.dropna(inplace=True)
837
+ return df.reset_index(drop=True)
@@ -20,10 +20,8 @@ import pandas as pd
20
20
 
21
21
  import mlrun
22
22
  import mlrun.common.model_monitoring.helpers
23
- import mlrun.common.schemas
24
- from mlrun.common.schemas.model_monitoring import (
25
- EventFieldType,
26
- )
23
+ import mlrun.common.schemas.model_monitoring.constants as mm_constants
24
+ import mlrun.data_types.infer
27
25
  from mlrun.common.schemas.model_monitoring.model_endpoints import (
28
26
  ModelEndpointMonitoringMetric,
29
27
  ModelEndpointMonitoringMetricType,
@@ -35,7 +33,6 @@ from mlrun.utils import logger
35
33
  if typing.TYPE_CHECKING:
36
34
  from mlrun.db.base import RunDBInterface
37
35
  from mlrun.projects import MlrunProject
38
- import mlrun.common.schemas.model_monitoring.constants as mm_constants
39
36
 
40
37
 
41
38
  class _BatchDict(typing.TypedDict):
@@ -45,26 +42,29 @@ class _BatchDict(typing.TypedDict):
45
42
 
46
43
 
47
44
  def get_stream_path(
48
- project: str, function_name: str = mm_constants.MonitoringFunctionNames.STREAM
45
+ project: str,
46
+ function_name: str = mm_constants.MonitoringFunctionNames.STREAM,
47
+ stream_uri: typing.Optional[str] = None,
49
48
  ) -> str:
50
49
  """
51
50
  Get stream path from the project secret. If wasn't set, take it from the system configurations
52
51
 
53
52
  :param project: Project name.
54
- :param function_name: Application name. Default is model_monitoring_stream.
53
+ :param function_name: Application name. Default is model_monitoring_stream.
54
+ :param stream_uri: Stream URI. If provided, it will be used instead of the one from the project secret.
55
55
 
56
56
  :return: Monitoring stream path to the relevant application.
57
57
  """
58
58
 
59
- stream_uri = mlrun.get_secret_or_env(
60
- mlrun.common.schemas.model_monitoring.ProjectSecretKeys.STREAM_PATH
59
+ stream_uri = stream_uri or mlrun.get_secret_or_env(
60
+ mm_constants.ProjectSecretKeys.STREAM_PATH
61
61
  )
62
62
 
63
63
  if not stream_uri or stream_uri == "v3io":
64
64
  # TODO : remove the first part of this condition in 1.9.0
65
65
  stream_uri = mlrun.mlconf.get_model_monitoring_file_target_path(
66
66
  project=project,
67
- kind=mlrun.common.schemas.model_monitoring.FileTargetKind.STREAM,
67
+ kind=mm_constants.FileTargetKind.STREAM,
68
68
  target="online",
69
69
  function_name=function_name,
70
70
  )
@@ -78,7 +78,7 @@ def get_stream_path(
78
78
 
79
79
  def get_monitoring_parquet_path(
80
80
  project: "MlrunProject",
81
- kind: str = mlrun.common.schemas.model_monitoring.FileTargetKind.PARQUET,
81
+ kind: str = mm_constants.FileTargetKind.PARQUET,
82
82
  ) -> str:
83
83
  """Get model monitoring parquet target for the current project and kind. The parquet target path is based on the
84
84
  project artifact path. If project artifact path is not defined, the parquet target path will be based on MLRun
@@ -111,7 +111,7 @@ def get_connection_string(secret_provider: typing.Callable[[str], str] = None) -
111
111
  """
112
112
 
113
113
  return mlrun.get_secret_or_env(
114
- key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.ENDPOINT_STORE_CONNECTION,
114
+ key=mm_constants.ProjectSecretKeys.ENDPOINT_STORE_CONNECTION,
115
115
  secret_provider=secret_provider,
116
116
  )
117
117
 
@@ -126,7 +126,7 @@ def get_tsdb_connection_string(
126
126
  """
127
127
 
128
128
  return mlrun.get_secret_or_env(
129
- key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.TSDB_CONNECTION,
129
+ key=mm_constants.ProjectSecretKeys.TSDB_CONNECTION,
130
130
  secret_provider=secret_provider,
131
131
  )
132
132
 
@@ -200,7 +200,7 @@ def update_model_endpoint_last_request(
200
200
  db.patch_model_endpoint(
201
201
  project=project,
202
202
  endpoint_id=model_endpoint.metadata.uid,
203
- attributes={EventFieldType.LAST_REQUEST: current_request},
203
+ attributes={mm_constants.EventFieldType.LAST_REQUEST: current_request},
204
204
  )
205
205
  else:
206
206
  try:
@@ -229,7 +229,7 @@ def update_model_endpoint_last_request(
229
229
  db.patch_model_endpoint(
230
230
  project=project,
231
231
  endpoint_id=model_endpoint.metadata.uid,
232
- attributes={EventFieldType.LAST_REQUEST: bumped_last_request},
232
+ attributes={mm_constants.EventFieldType.LAST_REQUEST: bumped_last_request},
233
233
  )
234
234
 
235
235
 
@@ -249,8 +249,7 @@ def calculate_inputs_statistics(
249
249
 
250
250
  # Use `DFDataInfer` to calculate the statistics over the inputs:
251
251
  inputs_statistics = mlrun.data_types.infer.DFDataInfer.get_stats(
252
- df=inputs,
253
- options=mlrun.data_types.infer.InferOptions.Histogram,
252
+ df=inputs, options=mlrun.data_types.infer.InferOptions.Histogram
254
253
  )
255
254
 
256
255
  # Recalculate the histograms over the bins that are set in the sample-set of the end point:
@@ -169,11 +169,40 @@ class EventStreamProcessor:
169
169
  mlrun.serving.states.RootFlowStep,
170
170
  fn.set_topology(mlrun.serving.states.StepKinds.flow),
171
171
  )
172
+ graph.add_step(
173
+ "ExtractEndpointID",
174
+ "extract_endpoint",
175
+ full_event=True,
176
+ )
177
+
178
+ # split the graph between event with error vs valid event
179
+ graph.add_step(
180
+ "storey.Filter",
181
+ "FilterError",
182
+ after="extract_endpoint",
183
+ _fn="(event.get('error') is None)",
184
+ )
185
+
186
+ graph.add_step(
187
+ "storey.Filter",
188
+ "ForwardError",
189
+ after="extract_endpoint",
190
+ _fn="(event.get('error') is not None)",
191
+ )
192
+
193
+ tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
194
+ project=self.project, secret_provider=secret_provider
195
+ )
196
+
197
+ tsdb_connector.handle_model_error(
198
+ graph,
199
+ )
172
200
 
173
201
  # Process endpoint event: splitting into sub-events and validate event data
174
202
  def apply_process_endpoint_event():
175
203
  graph.add_step(
176
204
  "ProcessEndpointEvent",
205
+ after="extract_endpoint", # TODO: change this to FilterError in ML-7456
177
206
  full_event=True,
178
207
  project=self.project,
179
208
  )
@@ -295,9 +324,6 @@ class EventStreamProcessor:
295
324
 
296
325
  apply_storey_sample_window()
297
326
 
298
- tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
299
- project=self.project, secret_provider=secret_provider
300
- )
301
327
  tsdb_connector.apply_monitoring_stream_steps(graph=graph)
302
328
 
303
329
  # Parquet branch
@@ -386,6 +412,38 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
386
412
  return e
387
413
 
388
414
 
415
+ class ExtractEndpointID(mlrun.feature_store.steps.MapClass):
416
+ def __init__(self, **kwargs) -> None:
417
+ """
418
+ Generate the model endpoint ID based on the event parameters and attach it to the event.
419
+ """
420
+ super().__init__(**kwargs)
421
+
422
+ def do(self, full_event) -> typing.Union[storey.Event, None]:
423
+ # Getting model version and function uri from event
424
+ # and use them for retrieving the endpoint_id
425
+ function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
426
+ if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
427
+ return None
428
+
429
+ model = full_event.body.get(EventFieldType.MODEL)
430
+ if not is_not_none(model, [EventFieldType.MODEL]):
431
+ return None
432
+
433
+ version = full_event.body.get(EventFieldType.VERSION)
434
+ versioned_model = f"{model}:{version}" if version else f"{model}:latest"
435
+
436
+ endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
437
+ function_uri=function_uri,
438
+ versioned_model=versioned_model,
439
+ )
440
+
441
+ endpoint_id = str(endpoint_id)
442
+ full_event.body[EventFieldType.ENDPOINT_ID] = endpoint_id
443
+ full_event.body[EventFieldType.VERSIONED_MODEL] = versioned_model
444
+ return full_event
445
+
446
+
389
447
  class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
390
448
  def __init__(self, **kwargs):
391
449
  """
@@ -459,28 +517,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
459
517
  def do(self, full_event):
460
518
  event = full_event.body
461
519
 
462
- # Getting model version and function uri from event
463
- # and use them for retrieving the endpoint_id
464
- function_uri = event.get(EventFieldType.FUNCTION_URI)
465
- if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
466
- return None
467
-
468
- model = event.get(EventFieldType.MODEL)
469
- if not is_not_none(model, [EventFieldType.MODEL]):
470
- return None
471
-
472
- version = event.get(EventFieldType.VERSION)
473
- versioned_model = f"{model}:{version}" if version else f"{model}:latest"
474
-
475
- endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
476
- function_uri=function_uri,
477
- versioned_model=versioned_model,
478
- )
479
-
480
- endpoint_id = str(endpoint_id)
481
-
482
- event[EventFieldType.VERSIONED_MODEL] = versioned_model
483
- event[EventFieldType.ENDPOINT_ID] = endpoint_id
520
+ versioned_model = event[EventFieldType.VERSIONED_MODEL]
521
+ endpoint_id = event[EventFieldType.ENDPOINT_ID]
522
+ function_uri = event[EventFieldType.FUNCTION_URI]
484
523
 
485
524
  # In case this process fails, resume state from existing record
486
525
  self.resume_state(endpoint_id)
@@ -488,9 +527,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
488
527
  # If error key has been found in the current event,
489
528
  # increase the error counter by 1 and raise the error description
490
529
  error = event.get("error")
491
- if error:
530
+ if error: # TODO: delete this in ML-7456
492
531
  self.error_count[endpoint_id] += 1
493
- # TODO: write to tsdb / kv once in a while
494
532
  raise mlrun.errors.MLRunInvalidArgumentError(str(error))
495
533
 
496
534
  # Validate event fields
@@ -598,6 +636,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
598
636
  EventFieldType.PREDICTION: prediction,
599
637
  EventFieldType.FIRST_REQUEST: self.first_request[endpoint_id],
600
638
  EventFieldType.LAST_REQUEST: self.last_request[endpoint_id],
639
+ EventFieldType.LAST_REQUEST_TIMESTAMP: mlrun.utils.enrich_datetime_with_tz_info(
640
+ self.last_request[endpoint_id]
641
+ ).timestamp(),
601
642
  EventFieldType.ERROR_COUNT: self.error_count[endpoint_id],
602
643
  EventFieldType.LABELS: event.get(EventFieldType.LABELS, {}),
603
644
  EventFieldType.METRICS: event.get(EventFieldType.METRICS, {}),
@@ -189,7 +189,7 @@ def run_function(
189
189
  if engine == "kfp":
190
190
  if schedule:
191
191
  raise mlrun.errors.MLRunInvalidArgumentError(
192
- "Scheduling job is not supported when running a workflow with kfp engine."
192
+ "Scheduling jobs is not supported when running a workflow with the kfp engine."
193
193
  )
194
194
  return function.as_step(
195
195
  name=name, runspec=task, workdir=workdir, outputs=outputs, labels=labels