mlrun 1.8.0rc18__py3-none-any.whl → 1.8.0rc20__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (31)
  1. mlrun/__main__.py +5 -0
  2. mlrun/common/runtimes/constants.py +17 -0
  3. mlrun/common/schemas/artifact.py +6 -0
  4. mlrun/common/schemas/model_monitoring/__init__.py +1 -0
  5. mlrun/common/schemas/model_monitoring/constants.py +16 -0
  6. mlrun/common/schemas/model_monitoring/model_endpoints.py +4 -2
  7. mlrun/config.py +2 -2
  8. mlrun/db/base.py +18 -0
  9. mlrun/db/httpdb.py +118 -1
  10. mlrun/db/nopdb.py +9 -0
  11. mlrun/frameworks/_common/model_handler.py +0 -2
  12. mlrun/model_monitoring/db/tsdb/base.py +116 -8
  13. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +2 -0
  14. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +37 -29
  15. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +46 -26
  16. mlrun/model_monitoring/helpers.py +2 -2
  17. mlrun/model_monitoring/stream_processing.py +21 -0
  18. mlrun/projects/pipelines.py +16 -3
  19. mlrun/projects/project.py +45 -8
  20. mlrun/runtimes/nuclio/serving.py +20 -11
  21. mlrun/serving/v2_serving.py +51 -36
  22. mlrun/utils/helpers.py +163 -1
  23. mlrun/utils/notifications/notification/webhook.py +3 -0
  24. mlrun/utils/notifications/notification_pusher.py +59 -165
  25. mlrun/utils/version/version.json +2 -2
  26. {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/METADATA +1 -1
  27. {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/RECORD +31 -31
  28. {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/LICENSE +0 -0
  29. {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/WHEEL +0 -0
  30. {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/entry_points.txt +0 -0
  31. {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py CHANGED
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import typing
-from datetime import datetime, timedelta, timezone
+from datetime import datetime, timedelta
 
 import pandas as pd
 import taosws
@@ -164,6 +164,17 @@ class TDEngineConnector(TSDBConnector):
     def _convert_to_datetime(val: typing.Union[str, datetime]) -> datetime:
         return datetime.fromisoformat(val) if isinstance(val, str) else val
 
+    @staticmethod
+    def _get_endpoint_filter(endpoint_id: typing.Union[str, list[str]]) -> str:
+        if isinstance(endpoint_id, str):
+            return f"endpoint_id='{endpoint_id}'"
+        elif isinstance(endpoint_id, list):
+            return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Invalid 'endpoint_id' filter: must be a string or a list."
+            )
+
     def apply_monitoring_stream_steps(self, graph, **kwarg):
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
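
Editor's note: the new _get_endpoint_filter helper leans on Python's list repr to build the IN clause: str(["a", "b"])[1:-1] evaluates to "'a', 'b'". A minimal standalone sketch of the same logic (hypothetical build_endpoint_filter, not part of mlrun):

    from typing import Union

    def build_endpoint_filter(endpoint_id: Union[str, list[str]]) -> str:
        # A single ID becomes an equality filter; a list becomes an IN clause.
        if isinstance(endpoint_id, str):
            return f"endpoint_id='{endpoint_id}'"
        if isinstance(endpoint_id, list):
            # str(list)[1:-1] strips the surrounding brackets, leaving "'a', 'b'"
            return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
        raise TypeError("endpoint_id must be a string or a list of strings")

    assert build_endpoint_filter("ep1") == "endpoint_id='ep1'"
    assert build_endpoint_filter(["ep1", "ep2"]) == "endpoint_id IN('ep1', 'ep2') "

Note the trailing space in the list branch; the error-count query further down appends an AND clause directly onto this string.
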
@@ -195,6 +206,8 @@ class TDEngineConnector(TSDBConnector):
             columns=[
                 mm_schemas.EventFieldType.LATENCY,
                 mm_schemas.EventKeyMetrics.CUSTOM_METRICS,
+                mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
+                mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
             ],
             tag_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -472,7 +485,7 @@ class TDEngineConnector(TSDBConnector):
             table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
             start=start,
             end=end,
-            columns=[mm_schemas.EventFieldType.LATENCY],
+            columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
             filter_query=f"endpoint_id='{endpoint_id}'",
             agg_funcs=agg_funcs,
             interval=aggregation_window,
@@ -492,10 +505,10 @@ class TDEngineConnector(TSDBConnector):
         df["_wend"] = pd.to_datetime(df["_wend"])
         df.set_index("_wend", inplace=True)
 
-        latency_column = (
-            f"{agg_funcs[0]}({mm_schemas.EventFieldType.LATENCY})"
+        estimated_prediction_count = (
+            f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
             if agg_funcs
-            else mm_schemas.EventFieldType.LATENCY
+            else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
         )
 
         return mm_schemas.ModelEndpointMonitoringMetricValues(
@@ -503,7 +516,7 @@ class TDEngineConnector(TSDBConnector):
             values=list(
                 zip(
                     df.index,
-                    df[latency_column],
+                    df[estimated_prediction_count],
                 )
            ),  # pyright: ignore[reportArgumentType]
        )
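
Editor's note: when agg_funcs are supplied, the frame returned by _get_records exposes the aggregated value under a "<agg>(<column>)"-style name, which is what the ternary above reconstructs. A small illustration in plain Python (the column string is the assumed value of the constant, not taken from this diff):

    agg_funcs = ["last"]
    column = "estimated_prediction_count"  # assumed value of ESTIMATED_PREDICTION_COUNT
    result_column = f"{agg_funcs[0]}({column})" if agg_funcs else column
    print(result_column)  # last(estimated_prediction_count)
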
@@ -514,9 +527,7 @@ class TDEngineConnector(TSDBConnector):
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
@@ -527,7 +538,7 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.EventFieldType.TIME,
                 mm_schemas.EventFieldType.LATENCY,
             ],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             timestamp_column=mm_schemas.EventFieldType.TIME,
             agg_funcs=["last"],
             group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -542,12 +553,11 @@ class TDEngineConnector(TSDBConnector):
             },
             inplace=True,
         )
-        df[mm_schemas.EventFieldType.LAST_REQUEST] = df[
-            mm_schemas.EventFieldType.LAST_REQUEST
-        ].map(
-            lambda last_request: datetime.strptime(
-                last_request, "%Y-%m-%d %H:%M:%S.%f %z"
-            ).astimezone(tz=timezone.utc)
+        df[mm_schemas.EventFieldType.LAST_REQUEST] = pd.to_datetime(
+            df[mm_schemas.EventFieldType.LAST_REQUEST],
+            errors="coerce",
+            format="ISO8601",
+            utc=True,
        )
        return df
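
Editor's note: the replaced per-row strptime lambda assumed one fixed timestamp layout; the vectorized pd.to_datetime call accepts any ISO 8601 layout, coerces unparsable values to NaT instead of raising, and normalizes mixed offsets to UTC. A quick sketch of the behavior (format="ISO8601" requires pandas >= 2.0):

    import pandas as pd  # format="ISO8601" requires pandas >= 2.0

    s = pd.Series([
        "2024-05-01 12:00:00.123456+00:00",
        "2024-05-01T15:30:00+03:00",
        "not-a-timestamp",
    ])
    parsed = pd.to_datetime(s, errors="coerce", format="ISO8601", utc=True)
    print(parsed.tolist())
    # [Timestamp('2024-05-01 12:00:00.123456+0000', tz='UTC'),
    #  Timestamp('2024-05-01 12:30:00+0000', tz='UTC'),
    #  NaT]
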
 
@@ -557,9 +567,7 @@ class TDEngineConnector(TSDBConnector):
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
         df = self._get_records(
@@ -570,7 +578,7 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.ResultData.RESULT_STATUS,
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
             agg_funcs=["max"],
             group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -588,7 +596,7 @@ class TDEngineConnector(TSDBConnector):
 
     def get_metrics_metadata(
         self,
-        endpoint_id: str,
+        endpoint_id: typing.Union[str, list[str]],
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
@@ -602,11 +610,12 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.MetricData.METRIC_NAME,
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
-            filter_query=f"endpoint_id='{endpoint_id}'",
+            filter_query=self._get_endpoint_filter(endpoint_id=endpoint_id),
             timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
             group_by=[
                 mm_schemas.WriterEvent.APPLICATION_NAME,
                 mm_schemas.MetricData.METRIC_NAME,
+                mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
             agg_funcs=["last"],
         )
@@ -624,7 +633,7 @@ class TDEngineConnector(TSDBConnector):
 
     def get_results_metadata(
         self,
-        endpoint_id: str,
+        endpoint_id: typing.Union[str, list[str]],
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
@@ -639,11 +648,12 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.ResultData.RESULT_KIND,
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
-            filter_query=f"endpoint_id='{endpoint_id}'",
+            filter_query=self._get_endpoint_filter(endpoint_id=endpoint_id),
             timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
             group_by=[
                 mm_schemas.WriterEvent.APPLICATION_NAME,
                 mm_schemas.ResultData.RESULT_NAME,
+                mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
             agg_funcs=["last"],
         )
@@ -666,9 +676,8 @@ class TDEngineConnector(TSDBConnector):
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'"
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=self.tables[mm_schemas.TDEngineSuperTables.ERRORS].super_table,
@@ -679,8 +688,7 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
             agg_funcs=["count"],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]}) "
-            f"AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'",
+            filter_query=filter_query,
             group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
             preform_agg_columns=[mm_schemas.EventFieldType.MODEL_ERROR],
        )
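
Editor's note: for the list branch, the composed error filter above renders as follows, assuming ERROR_TYPE and INFER_ERROR resolve to "error_type" and "infer_error" (illustrative values, not taken from this diff). The trailing space returned by the IN branch is what keeps the appended AND clause separated:

    endpoint_ids = ["ep1", "ep2"]
    filter_query = f"endpoint_id IN({str(endpoint_ids)[1:-1]}) "  # note the trailing space
    filter_query += "AND error_type = 'infer_error'"
    print(filter_query)
    # endpoint_id IN('ep1', 'ep2') AND error_type = 'infer_error'
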
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py CHANGED
@@ -33,6 +33,8 @@ _TSDB_BE = "tsdb"
 _TSDB_RATE = "1/s"
 _CONTAINER = "users"
 
+V3IO_MEPS_LIMIT = 200
+
 
 def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
     """
@@ -232,6 +234,8 @@ class V3IOTSDBConnector(TSDBConnector):
             columns=[
                 mm_schemas.EventFieldType.LATENCY,
                 mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
+                mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
+                mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
             ],
             index_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -577,6 +581,25 @@ class V3IOTSDBConnector(TSDBConnector):
             token=v3io_access_key,
         )
 
+    @staticmethod
+    def _get_endpoint_filter(endpoint_id: Union[str, list[str]]) -> Optional[str]:
+        if isinstance(endpoint_id, str):
+            return f"endpoint_id=='{endpoint_id}'"
+        elif isinstance(endpoint_id, list):
+            if len(endpoint_id) > V3IO_MEPS_LIMIT:
+                logger.info(
+                    "The number of endpoint ids exceeds the v3io-engine filter-expression limit, "
+                    "retrieving all the model endpoints from the db.",
+                    limit=V3IO_MEPS_LIMIT,
+                    amount=len(endpoint_id),
+                )
+                return None
+            return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"Invalid 'endpoint_id' filter: must be a string or a list, endpoint_id: {endpoint_id}"
+            )
+
     def read_metrics_data(
         self,
         *,
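
Editor's note: the V3IO variant differs from the TDEngine one in two ways: it uses the frames "==" comparison operator, and it caps the IN clause at V3IO_MEPS_LIMIT (200) IDs, returning None past the cap so the caller reads all endpoints unfiltered. A standalone sketch of that logic (hypothetical endpoint_filter, not part of mlrun):

    from typing import Optional, Union

    V3IO_MEPS_LIMIT = 200  # mirrors the constant added above

    def endpoint_filter(endpoint_id: Union[str, list[str]]) -> Optional[str]:
        if isinstance(endpoint_id, str):
            return f"endpoint_id=='{endpoint_id}'"
        if isinstance(endpoint_id, list):
            if len(endpoint_id) > V3IO_MEPS_LIMIT:
                # Past the engine's filter-expression limit: return no filter,
                # and the caller falls back to reading all endpoints.
                return None
            return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
        raise TypeError("endpoint_id must be a string or a list of strings")

    assert endpoint_filter([f"ep{i}" for i in range(201)]) is None
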
@@ -720,7 +743,7 @@ class V3IOTSDBConnector(TSDBConnector):
             table=mm_schemas.FileTargetKind.PREDICTIONS,
             start=start,
             end=end,
-            columns=[mm_schemas.EventFieldType.LATENCY],
+            columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
             filter_query=f"endpoint_id=='{endpoint_id}'",
             agg_funcs=agg_funcs,
             sliding_window_step=aggregation_window,
@@ -734,10 +757,10 @@ class V3IOTSDBConnector(TSDBConnector):
                 type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
             )
 
-        latency_column = (
-            f"{agg_funcs[0]}({mm_schemas.EventFieldType.LATENCY})"
+        estimated_prediction_count = (
+            f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
             if agg_funcs
-            else mm_schemas.EventFieldType.LATENCY
+            else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
         )
 
         return mm_schemas.ModelEndpointMonitoringMetricValues(
@@ -745,7 +768,7 @@ class V3IOTSDBConnector(TSDBConnector):
             values=list(
                 zip(
                     df.index,
-                    df[latency_column],
+                    df[estimated_prediction_count],
                 )
             ),  # pyright: ignore[reportArgumentType]
         )
@@ -756,15 +779,13 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=mm_schemas.FileTargetKind.PREDICTIONS,
             start=start,
             end=end,
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             agg_funcs=["last"],
         )
         if not df.empty:
@@ -791,9 +812,7 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
         df = self._get_records(
@@ -801,7 +820,7 @@ class V3IOTSDBConnector(TSDBConnector):
             start=start,
             end=end,
             columns=[mm_schemas.ResultData.RESULT_STATUS],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             agg_funcs=["max"],
             group_by="endpoint_id",
         )
@@ -813,17 +832,18 @@ class V3IOTSDBConnector(TSDBConnector):
 
     def get_metrics_metadata(
         self,
-        endpoint_id: str,
+        endpoint_id: Union[str, list[str]],
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
         start, end = self._get_start_end(start, end)
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_id)
         df = self._get_records(
             table=mm_schemas.V3IOTSDBTables.METRICS,
             start=start,
             end=end,
             columns=[mm_schemas.MetricData.METRIC_VALUE],
-            filter_query=f"endpoint_id=='{endpoint_id}'",
+            filter_query=filter_query,
             agg_funcs=["last"],
         )
         if not df.empty:
@@ -834,11 +854,12 @@ class V3IOTSDBConnector(TSDBConnector):
 
     def get_results_metadata(
         self,
-        endpoint_id: str,
+        endpoint_id: Union[str, list[str]],
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
         start, end = self._get_start_end(start, end)
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_id)
         df = self._get_records(
             table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
             start=start,
@@ -846,7 +867,7 @@ class V3IOTSDBConnector(TSDBConnector):
             columns=[
                 mm_schemas.ResultData.RESULT_KIND,
             ],
-            filter_query=f"endpoint_id=='{endpoint_id}'",
+            filter_query=filter_query,
             agg_funcs=["last"],
         )
         if not df.empty:
@@ -864,17 +885,18 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        if filter_query:
+            filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'"
+        else:
+            filter_query = f"{mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'"
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=mm_schemas.FileTargetKind.ERRORS,
             start=start,
             end=end,
             columns=[mm_schemas.EventFieldType.ERROR_COUNT],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]}) "
-            f"AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'",
+            filter_query=filter_query,
             agg_funcs=["count"],
         )
         if not df.empty:
@@ -893,9 +915,7 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
        start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
        start, end = self._get_start_end(start, end)
        df = self._get_records(
@@ -903,7 +923,7 @@ class V3IOTSDBConnector(TSDBConnector):
             start=start,
             end=end,
             columns=[mm_schemas.EventFieldType.LATENCY],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             agg_funcs=["avg"],
         )
         if not df.empty:
mlrun/model_monitoring/helpers.py CHANGED
@@ -32,7 +32,7 @@ import mlrun.utils.helpers
 from mlrun.common.schemas import ModelEndpoint
 from mlrun.common.schemas.model_monitoring.model_endpoints import (
     ModelEndpointMonitoringMetric,
-    _compose_full_name,
+    compose_full_name,
 )
 from mlrun.utils import logger
 
@@ -450,7 +450,7 @@ def get_default_result_instance_fqn(model_endpoint_id: str) -> str:
 
 
 def get_invocations_fqn(project: str) -> str:
-    return _compose_full_name(
+    return compose_full_name(
         project=project,
         app=mm_constants.SpecialApps.MLRUN_INFRA,
         name=mm_constants.PredictionsQueryConstants.INVOCATIONS,
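
Editor's note: the rename promotes the helper from private to public API; its call sites pass project, app, name, and a type. If the fully qualified name follows the "<project>.<app>.<type>.<name>" convention that result FQNs elsewhere suggest (an assumption, not confirmed by this diff), the invocations FQN would look roughly like:

    # Hedged sketch, not mlrun's actual implementation.
    def compose_full_name_sketch(*, project: str, app: str, name: str, type: str = "result") -> str:
        return ".".join([project, app, type, name])

    print(compose_full_name_sketch(
        project="my-project", app="mlrun-infra", name="invocations", type="metric"
    ))
    # my-project.mlrun-infra.metric.invocations
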
mlrun/model_monitoring/stream_processing.py CHANGED
@@ -430,6 +430,10 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         if not isinstance(feature, list):
             feature = [feature]
 
+        effective_sample_count, estimated_prediction_count = (
+            self._get_effective_and_estimated_counts(event=event)
+        )
+
         events.append(
             {
                 EventFieldType.FUNCTION_URI: function_uri,
@@ -453,6 +457,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
                 EventFieldType.ENTITIES: event.get("request", {}).get(
                     EventFieldType.ENTITIES, {}
                 ),
+                EventFieldType.EFFECTIVE_SAMPLE_COUNT: effective_sample_count,
+                EventFieldType.ESTIMATED_PREDICTION_COUNT: estimated_prediction_count,
             }
         )
 
@@ -507,6 +513,20 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
             self.error_count[endpoint_id] += 1
         return False
 
+    @staticmethod
+    def _get_effective_and_estimated_counts(event):
+        """
+        Calculate the `effective_sample_count` and the `estimated_prediction_count` based on the event's
+        sampling percentage. These values will be stored in the TSDB target.
+        Note that In non-batch serving, the `effective_sample_count` is always set to 1. In addition, when the sampling
+        percentage is 100%, the `estimated_prediction_count` is equal to the `effective_sample_count`.
+        """
+        effective_sample_count = event.get(EventFieldType.EFFECTIVE_SAMPLE_COUNT, 1)
+        estimated_prediction_count = effective_sample_count * (
+            100 / event.get(EventFieldType.SAMPLING_PERCENTAGE, 100)
+        )
+        return effective_sample_count, estimated_prediction_count
+
 
 def is_not_none(field: typing.Any, dict_path: list[str]):
     if field is not None:
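
Editor's note: the estimate scales the observed sample count by the inverse of the sampling ratio, so 5 sampled events at 10% sampling estimate 50 predictions. A standalone sketch of the helper above (the dict keys are the assumed string values of the EventFieldType constants):

    def estimated_counts(event: dict) -> tuple[int, float]:
        effective_sample_count = event.get("effective_sample_count", 1)
        sampling_percentage = event.get("sampling_percentage", 100)
        # Scale the observed sample count by the inverse sampling ratio.
        return effective_sample_count, effective_sample_count * (100 / sampling_percentage)

    assert estimated_counts({}) == (1, 1.0)  # non-batch, 100% sampling: no scaling
    assert estimated_counts(
        {"effective_sample_count": 5, "sampling_percentage": 10}
    ) == (5, 50.0)
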
@@ -672,6 +692,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
             )
         )
         self.first_request[endpoint_id] = True
+
         if attributes_to_update:
             logger.info(
                 "Updating endpoint record",
mlrun/projects/pipelines.py CHANGED
@@ -523,11 +523,12 @@ class _PipelineRunner(abc.ABC):
         text = _PipelineRunner._generate_workflow_finished_message(
             run.run_id, errors_counter, run._state
         )
-
         notifiers = notifiers or project.notifiers
         if notifiers:
             notifiers.push(text, "info", runs)
 
+        project.push_pipeline_notification_kfp_runner(run.run_id, run._state, text)
+
         if raise_error:
             raise raise_error
         return state or run._state, errors_counter, text
@@ -620,6 +621,8 @@ class _KFPRunner(_PipelineRunner):
             params.update(notification.secret_params)
             project.notifiers.add_notification(notification.kind, params)
 
+        project.spec.notifications = notifications
+
         run_id = _run_pipeline(
             workflow_handler,
             project=project.metadata.name,
@@ -647,13 +650,23 @@ class _KFPRunner(_PipelineRunner):
                 exc_info=err_to_str(exc),
             )
 
-        # TODO: we should check how can we get the run uid when we don't have the context (for example on
-        # mlrun.load_project() and later call directly to project.run)
+        # Pushing only relevant notification for the client (ipython and console)
+        project.notifiers.push_pipeline_start_message_from_client(
+            project.metadata.name, pipeline_id=run_id
+        )
+
         if context:
             project.notifiers.push_pipeline_start_message(
                 project.metadata.name,
                 context.uid,
             )
+        else:
+            project.push_pipeline_notification_kfp_runner(
+                run_id,
+                mlrun_pipelines.common.models.RunStatuses.running,
+                f"Workflow {run_id} started in project {project.metadata.name}",
+                notifications,
+            )
         pipeline_context.clear()
         return _PipelineRunStatus(run_id, cls, project=project, workflow=workflow_spec)
 
mlrun/projects/project.py CHANGED
@@ -83,6 +83,7 @@ from ..artifacts import (
     ModelArtifact,
 )
 from ..artifacts.manager import ArtifactManager, dict_to_artifact, extend_artifact_path
+from ..common.runtimes.constants import RunStates
 from ..datastore import store_manager
 from ..features import Feature
 from ..model import EntrypointParam, ImageBuilder, ModelObj
@@ -851,6 +852,7 @@ class ProjectSpec(ModelObj):
         build=None,
         custom_packagers: Optional[list[tuple[str, bool]]] = None,
         default_function_node_selector=None,
+        notifications=None,
     ):
         self.repo = None
 
@@ -891,6 +893,7 @@ class ProjectSpec(ModelObj):
         # whether it is mandatory for a run (raise exception on collection error) or not.
         self.custom_packagers = custom_packagers or []
         self._default_function_node_selector = default_function_node_selector or None
+        self.notifications = notifications or []
 
     @property
     def source(self) -> str:
@@ -1172,7 +1175,6 @@ class MlrunProject(ModelObj):
         self._artifact_manager = None
         self._notifiers = CustomNotificationPusher(
             [
-                NotificationTypes.slack,
                 NotificationTypes.console,
                 NotificationTypes.ipython,
             ]
@@ -2137,18 +2139,23 @@ class MlrunProject(ModelObj):
         db = mlrun.db.get_run_db(secrets=self._secrets)
         matching_results = []
         alerts = []
-        # TODO: Refactor to use a single request to improve performance at scale, ML-8473
-        for endpoint in endpoints.endpoints:
-            results_by_endpoint = db.get_model_endpoint_monitoring_metrics(
-                project=self.name, endpoint_id=endpoint.metadata.uid, type="results"
-            )
+        endpoint_ids = [endpoint.metadata.uid for endpoint in endpoints.endpoints]
+        # using separation to group by endpoint IDs:
+        # {"mep_id1": [...], "mep_id2": [...]}
+        results_by_endpoint = db.get_metrics_by_multiple_endpoints(
+            project=self.name,
+            endpoint_ids=endpoint_ids,
+            type="results",
+            events_format=mm_constants.GetEventsFormat.SEPARATION,
+        )
+        for endpoint_uid, results in results_by_endpoint.items():
             results_fqn_by_endpoint = [
                 get_result_instance_fqn(
-                    model_endpoint_id=endpoint.metadata.uid,
+                    model_endpoint_id=endpoint_uid,
                     app_name=result.app,
                     result_name=result.name,
                 )
-                for result in results_by_endpoint
+                for result in results
             ]
             matching_results += filter_results_by_regex(
                 existing_result_names=results_fqn_by_endpoint,
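
Editor's note: the per-endpoint loop of N requests collapses into one batched call; with events_format=SEPARATION the response is a mapping keyed by endpoint UID, roughly consumable like this (the result objects and values are illustrative):

    # Illustrative response shape for GetEventsFormat.SEPARATION:
    results_by_endpoint = {
        "mep_id1": [{"app": "histogram-data-drift", "name": "general_drift"}],
        "mep_id2": [{"app": "histogram-data-drift", "name": "general_drift"},
                    {"app": "my-app", "name": "bias"}],
    }
    for endpoint_uid, results in results_by_endpoint.items():
        for result in results:
            print(endpoint_uid, result["app"], result["name"])
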
@@ -2665,6 +2672,36 @@ class MlrunProject(ModelObj):
             timeout=timeout,
         )
 
+    def push_pipeline_notification_kfp_runner(
+        self,
+        pipeline_id: str,
+        current_run_state: mlrun_pipelines.common.models.RunStatuses,
+        message: str,
+        notifications: Optional[list] = None,
+    ):
+        """
+        Push notifications for a pipeline run(KFP).
+
+        :param pipeline_id: Unique ID of the pipeline run.
+        :param current_run_state: Current run state of the pipeline.
+        :param message: Message to send in the notification.
+        :param notifications: List of notifications to send.
+        """
+        current_run_state = RunStates.pipeline_run_status_to_run_state(
+            current_run_state
+        )
+        db = mlrun.get_run_db()
+        notifications = notifications or self.spec.notifications
+        notifications_to_send = []
+        for notification in notifications:
+            if current_run_state in notification.when:
+                notification_copy = notification.copy()
+                notification_copy.message = message
+                notifications_to_send.append(notification_copy)
+        db.push_pipeline_notifications(
+            pipeline_id, self.metadata.name, notifications_to_send
+        )
+
     def _instantiate_function(
         self,
         func: typing.Union[str, mlrun.runtimes.BaseRuntime] = None,
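
Editor's note: the method maps the KFP run status to an mlrun run state, then sends only the notifications whose "when" list subscribes to that state, each carrying the pipeline message. A standalone sketch of that selection step (Notification here is a stand-in for mlrun's notification object):

    from dataclasses import dataclass, replace

    @dataclass
    class Notification:  # stand-in for mlrun's notification object
        kind: str
        when: list[str]
        message: str = ""

    def select_notifications(
        notifications: list[Notification], state: str, message: str
    ) -> list[Notification]:
        # Only notifications subscribed to the current state are sent,
        # each with the pipeline message filled in.
        return [replace(n, message=message) for n in notifications if state in n.when]

    sent = select_notifications(
        [Notification("slack", when=["completed", "error"]),
         Notification("webhook", when=["error"])],
        state="completed",
        message="Workflow finished",
    )
    print([n.kind for n in sent])  # ['slack']
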
mlrun/runtimes/nuclio/serving.py CHANGED
@@ -309,7 +309,7 @@ class ServingRuntime(RemoteRuntime):
         self,
         stream_path: Optional[str] = None,
         batch: Optional[int] = None,
-        sample: Optional[int] = None,
+        sampling_percentage: float = 100,
         stream_args: Optional[dict] = None,
         tracking_policy: Optional[Union["TrackingPolicy", dict]] = None,
         enable_tracking: bool = True,
@@ -317,13 +317,13 @@ class ServingRuntime(RemoteRuntime):
         """Apply on your serving function to monitor a deployed model, including real-time dashboards to detect drift
         and analyze performance.
 
-        :param stream_path:     Path/url of the tracking stream e.g. v3io:///users/mike/mystream
-                                you can use the "dummy://" path for test/simulation.
-        :param batch:           Micro batch size (send micro batches of N records at a time).
-        :param sample:          Sample size (send only one of N records).
-        :param stream_args:     Stream initialization parameters, e.g. shards, retention_in_hours, ..
-        :param enable_tracking: Enabled/Disable model-monitoring tracking.
-                                Default True (tracking enabled).
+        :param stream_path:         Path/url of the tracking stream e.g. v3io:///users/mike/mystream
+                                    you can use the "dummy://" path for test/simulation.
+        :param batch:               Deprecated. Micro batch size (send micro batches of N records at a time).
+        :param sampling_percentage: Down sampling events that will be pushed to the monitoring stream based on
+                                    a specified percentage. e.g. 50 for 50%. By default, all events are pushed.
+        :param stream_args:         Stream initialization parameters, e.g. shards, retention_in_hours, ..
+        :param enable_tracking:     Enabled/Disable model-monitoring tracking. Default True (tracking enabled).
 
         Example::
 
@@ -336,12 +336,21 @@ class ServingRuntime(RemoteRuntime):
         # Applying model monitoring configurations
         self.spec.track_models = enable_tracking
 
+        if not 0 < sampling_percentage <= 100:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "`sampling_percentage` must be greater than 0 and less or equal to 100."
+            )
+        self.spec.parameters["sampling_percentage"] = sampling_percentage
+
         if stream_path:
             self.spec.parameters["log_stream"] = stream_path
         if batch:
-            self.spec.parameters["log_stream_batch"] = batch
-        if sample:
-            self.spec.parameters["log_stream_sample"] = sample
+            warnings.warn(
+                "The `batch` size parameter was deprecated in version 1.8.0 and is no longer used. "
+                "It will be removed in 1.10.",
+                # TODO: Remove this in 1.10
+                FutureWarning,
+            )
         if stream_args:
             self.spec.parameters["stream_args"] = stream_args
         if tracking_policy is not None:
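
Editor's note: a hedged usage sketch of the new parameter (function and model names are illustrative). With sampling_percentage=50, roughly half of the serving events reach the monitoring stream, and the stream-processing change above scales counts back up via estimated_prediction_count:

    import mlrun

    # Usage sketch; names and paths are illustrative.
    serving_fn = mlrun.import_function("hub://v2_model_server", new_name="serving")
    serving_fn.add_model("my-model", model_path="store://models/my-project/my-model")

    # Push ~50% of events to the monitoring stream. Values outside (0, 100]
    # raise MLRunInvalidArgumentError per the validation added above.
    serving_fn.set_tracking(sampling_percentage=50)
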