mlrun 1.8.0rc18__py3-none-any.whl → 1.8.0rc20__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (31)
  1. mlrun/__main__.py +5 -0
  2. mlrun/common/runtimes/constants.py +17 -0
  3. mlrun/common/schemas/artifact.py +6 -0
  4. mlrun/common/schemas/model_monitoring/__init__.py +1 -0
  5. mlrun/common/schemas/model_monitoring/constants.py +16 -0
  6. mlrun/common/schemas/model_monitoring/model_endpoints.py +4 -2
  7. mlrun/config.py +2 -2
  8. mlrun/db/base.py +18 -0
  9. mlrun/db/httpdb.py +118 -1
  10. mlrun/db/nopdb.py +9 -0
  11. mlrun/frameworks/_common/model_handler.py +0 -2
  12. mlrun/model_monitoring/db/tsdb/base.py +116 -8
  13. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +2 -0
  14. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +37 -29
  15. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +46 -26
  16. mlrun/model_monitoring/helpers.py +2 -2
  17. mlrun/model_monitoring/stream_processing.py +21 -0
  18. mlrun/projects/pipelines.py +16 -3
  19. mlrun/projects/project.py +45 -8
  20. mlrun/runtimes/nuclio/serving.py +20 -11
  21. mlrun/serving/v2_serving.py +51 -36
  22. mlrun/utils/helpers.py +163 -1
  23. mlrun/utils/notifications/notification/webhook.py +3 -0
  24. mlrun/utils/notifications/notification_pusher.py +59 -165
  25. mlrun/utils/version/version.json +2 -2
  26. {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/METADATA +1 -1
  27. {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/RECORD +31 -31
  28. {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/LICENSE +0 -0
  29. {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/WHEEL +0 -0
  30. {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/entry_points.txt +0 -0
  31. {mlrun-1.8.0rc18.dist-info → mlrun-1.8.0rc20.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py CHANGED
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import typing
-from datetime import datetime, timedelta, timezone
+from datetime import datetime, timedelta
 
 import pandas as pd
 import taosws
@@ -164,6 +164,17 @@ class TDEngineConnector(TSDBConnector):
     def _convert_to_datetime(val: typing.Union[str, datetime]) -> datetime:
         return datetime.fromisoformat(val) if isinstance(val, str) else val
 
+    @staticmethod
+    def _get_endpoint_filter(endpoint_id: typing.Union[str, list[str]]) -> str:
+        if isinstance(endpoint_id, str):
+            return f"endpoint_id='{endpoint_id}'"
+        elif isinstance(endpoint_id, list):
+            return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Invalid 'endpoint_id' filter: must be a string or a list."
+            )
+
     def apply_monitoring_stream_steps(self, graph, **kwarg):
         """
         Apply TSDB steps on the provided monitoring graph. Throughout these steps, the graph stores live data of
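
Editor's note: the new _get_endpoint_filter helper leans on Python's list repr to build the IN clause: str(["a", "b"])[1:-1] evaluates to "'a', 'b'". A minimal standalone sketch of the same logic (hypothetical build_endpoint_filter, not part of mlrun):

    from typing import Union

    def build_endpoint_filter(endpoint_id: Union[str, list[str]]) -> str:
        # A single ID becomes an equality filter; a list becomes an IN clause.
        if isinstance(endpoint_id, str):
            return f"endpoint_id='{endpoint_id}'"
        if isinstance(endpoint_id, list):
            # str(list)[1:-1] strips the surrounding brackets, leaving "'a', 'b'"
            return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
        raise TypeError("endpoint_id must be a string or a list of strings")

    assert build_endpoint_filter("ep1") == "endpoint_id='ep1'"
    assert build_endpoint_filter(["ep1", "ep2"]) == "endpoint_id IN('ep1', 'ep2') "

Note the trailing space in the list branch; the error-count query further down appends an AND clause directly onto this string.
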
@@ -195,6 +206,8 @@ class TDEngineConnector(TSDBConnector):
             columns=[
                 mm_schemas.EventFieldType.LATENCY,
                 mm_schemas.EventKeyMetrics.CUSTOM_METRICS,
+                mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
+                mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
             ],
             tag_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -472,7 +485,7 @@ class TDEngineConnector(TSDBConnector):
             table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
             start=start,
             end=end,
-            columns=[mm_schemas.EventFieldType.LATENCY],
+            columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
             filter_query=f"endpoint_id='{endpoint_id}'",
             agg_funcs=agg_funcs,
             interval=aggregation_window,
@@ -492,10 +505,10 @@ class TDEngineConnector(TSDBConnector):
         df["_wend"] = pd.to_datetime(df["_wend"])
         df.set_index("_wend", inplace=True)
 
-        latency_column = (
-            f"{agg_funcs[0]}({mm_schemas.EventFieldType.LATENCY})"
+        estimated_prediction_count = (
+            f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
             if agg_funcs
-            else mm_schemas.EventFieldType.LATENCY
+            else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
         )
 
         return mm_schemas.ModelEndpointMonitoringMetricValues(
@@ -503,7 +516,7 @@ class TDEngineConnector(TSDBConnector):
             values=list(
                 zip(
                     df.index,
-                    df[latency_column],
+                    df[estimated_prediction_count],
                 )
            ),  # pyright: ignore[reportArgumentType]
        )
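
Editor's note: when agg_funcs are supplied, the frame returned by _get_records exposes the aggregated value under a "<agg>(<column>)"-style name, which is what the ternary above reconstructs. A small illustration in plain Python (the column string is the assumed value of the constant, not taken from this diff):

    agg_funcs = ["last"]
    column = "estimated_prediction_count"  # assumed value of ESTIMATED_PREDICTION_COUNT
    result_column = f"{agg_funcs[0]}({column})" if agg_funcs else column
    print(result_column)  # last(estimated_prediction_count)
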
@@ -514,9 +527,7 @@ class TDEngineConnector(TSDBConnector):
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=self.tables[mm_schemas.TDEngineSuperTables.PREDICTIONS].super_table,
@@ -527,7 +538,7 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.EventFieldType.TIME,
                 mm_schemas.EventFieldType.LATENCY,
             ],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             timestamp_column=mm_schemas.EventFieldType.TIME,
             agg_funcs=["last"],
             group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -542,12 +553,11 @@ class TDEngineConnector(TSDBConnector):
             },
             inplace=True,
         )
-        df[mm_schemas.EventFieldType.LAST_REQUEST] = df[
-            mm_schemas.EventFieldType.LAST_REQUEST
-        ].map(
-            lambda last_request: datetime.strptime(
-                last_request, "%Y-%m-%d %H:%M:%S.%f %z"
-            ).astimezone(tz=timezone.utc)
+        df[mm_schemas.EventFieldType.LAST_REQUEST] = pd.to_datetime(
+            df[mm_schemas.EventFieldType.LAST_REQUEST],
+            errors="coerce",
+            format="ISO8601",
+            utc=True,
        )
        return df
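
Editor's note: the replaced per-row strptime lambda assumed one fixed timestamp layout; the vectorized pd.to_datetime call accepts any ISO 8601 layout, coerces unparsable values to NaT instead of raising, and normalizes mixed offsets to UTC. A quick sketch of the behavior (format="ISO8601" requires pandas >= 2.0):

    import pandas as pd  # format="ISO8601" requires pandas >= 2.0

    s = pd.Series([
        "2024-05-01 12:00:00.123456+00:00",
        "2024-05-01T15:30:00+03:00",
        "not-a-timestamp",
    ])
    parsed = pd.to_datetime(s, errors="coerce", format="ISO8601", utc=True)
    print(parsed.tolist())
    # [Timestamp('2024-05-01 12:00:00.123456+0000', tz='UTC'),
    #  Timestamp('2024-05-01 12:30:00+0000', tz='UTC'),
    #  NaT]
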
 
@@ -557,9 +567,7 @@ class TDEngineConnector(TSDBConnector):
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
         df = self._get_records(
@@ -570,7 +578,7 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.ResultData.RESULT_STATUS,
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
             agg_funcs=["max"],
             group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -588,7 +596,7 @@ class TDEngineConnector(TSDBConnector):
 
     def get_metrics_metadata(
         self,
-        endpoint_id: str,
+        endpoint_id: typing.Union[str, list[str]],
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
@@ -602,11 +610,12 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.MetricData.METRIC_NAME,
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
-            filter_query=f"endpoint_id='{endpoint_id}'",
+            filter_query=self._get_endpoint_filter(endpoint_id=endpoint_id),
             timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
             group_by=[
                 mm_schemas.WriterEvent.APPLICATION_NAME,
                 mm_schemas.MetricData.METRIC_NAME,
+                mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
             agg_funcs=["last"],
         )
@@ -624,7 +633,7 @@ class TDEngineConnector(TSDBConnector):
 
     def get_results_metadata(
         self,
-        endpoint_id: str,
+        endpoint_id: typing.Union[str, list[str]],
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
@@ -639,11 +648,12 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.ResultData.RESULT_KIND,
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
-            filter_query=f"endpoint_id='{endpoint_id}'",
+            filter_query=self._get_endpoint_filter(endpoint_id=endpoint_id),
             timestamp_column=mm_schemas.WriterEvent.END_INFER_TIME,
             group_by=[
                 mm_schemas.WriterEvent.APPLICATION_NAME,
                 mm_schemas.ResultData.RESULT_NAME,
+                mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
             agg_funcs=["last"],
         )
@@ -666,9 +676,8 @@ class TDEngineConnector(TSDBConnector):
         start: typing.Optional[datetime] = None,
         end: typing.Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'"
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=self.tables[mm_schemas.TDEngineSuperTables.ERRORS].super_table,
@@ -679,8 +688,7 @@ class TDEngineConnector(TSDBConnector):
                 mm_schemas.EventFieldType.ENDPOINT_ID,
             ],
             agg_funcs=["count"],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]}) "
-            f"AND {mm_schemas.EventFieldType.ERROR_TYPE} = '{mm_schemas.EventFieldType.INFER_ERROR}'",
+            filter_query=filter_query,
             group_by=mm_schemas.EventFieldType.ENDPOINT_ID,
             preform_agg_columns=[mm_schemas.EventFieldType.MODEL_ERROR],
        )
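
Editor's note: for the list branch, the composed error filter above renders as follows, assuming ERROR_TYPE and INFER_ERROR resolve to "error_type" and "infer_error" (illustrative values, not taken from this diff). The trailing space returned by the IN branch is what keeps the appended AND clause separated:

    endpoint_ids = ["ep1", "ep2"]
    filter_query = f"endpoint_id IN({str(endpoint_ids)[1:-1]}) "  # note the trailing space
    filter_query += "AND error_type = 'infer_error'"
    print(filter_query)
    # endpoint_id IN('ep1', 'ep2') AND error_type = 'infer_error'
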
mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py CHANGED
@@ -33,6 +33,8 @@ _TSDB_BE = "tsdb"
 _TSDB_RATE = "1/s"
 _CONTAINER = "users"
 
+V3IO_MEPS_LIMIT = 200
+
 
 def _is_no_schema_error(exc: v3io_frames.Error) -> bool:
     """
@@ -232,6 +234,8 @@ class V3IOTSDBConnector(TSDBConnector):
             columns=[
                 mm_schemas.EventFieldType.LATENCY,
                 mm_schemas.EventFieldType.LAST_REQUEST_TIMESTAMP,
+                mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT,
+                mm_schemas.EventFieldType.EFFECTIVE_SAMPLE_COUNT,
             ],
             index_cols=[
                 mm_schemas.EventFieldType.ENDPOINT_ID,
@@ -577,6 +581,25 @@ class V3IOTSDBConnector(TSDBConnector):
             token=v3io_access_key,
         )
 
+    @staticmethod
+    def _get_endpoint_filter(endpoint_id: Union[str, list[str]]) -> Optional[str]:
+        if isinstance(endpoint_id, str):
+            return f"endpoint_id=='{endpoint_id}'"
+        elif isinstance(endpoint_id, list):
+            if len(endpoint_id) > V3IO_MEPS_LIMIT:
+                logger.info(
+                    "The number of endpoint ids exceeds the v3io-engine filter-expression limit, "
+                    "retrieving all the model endpoints from the db.",
+                    limit=V3IO_MEPS_LIMIT,
+                    amount=len(endpoint_id),
+                )
+                return None
+            return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"Invalid 'endpoint_id' filter: must be a string or a list, endpoint_id: {endpoint_id}"
+            )
+
     def read_metrics_data(
         self,
         *,
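
Editor's note: the V3IO variant differs from the TDEngine one in two ways: it uses the frames "==" comparison operator, and it caps the IN clause at V3IO_MEPS_LIMIT (200) IDs, returning None past the cap so the caller reads all endpoints unfiltered. A standalone sketch of that logic (hypothetical endpoint_filter, not part of mlrun):

    from typing import Optional, Union

    V3IO_MEPS_LIMIT = 200  # mirrors the constant added above

    def endpoint_filter(endpoint_id: Union[str, list[str]]) -> Optional[str]:
        if isinstance(endpoint_id, str):
            return f"endpoint_id=='{endpoint_id}'"
        if isinstance(endpoint_id, list):
            if len(endpoint_id) > V3IO_MEPS_LIMIT:
                # Past the engine's filter-expression limit: return no filter,
                # and the caller falls back to reading all endpoints.
                return None
            return f"endpoint_id IN({str(endpoint_id)[1:-1]}) "
        raise TypeError("endpoint_id must be a string or a list of strings")

    assert endpoint_filter([f"ep{i}" for i in range(201)]) is None
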
@@ -720,7 +743,7 @@ class V3IOTSDBConnector(TSDBConnector):
             table=mm_schemas.FileTargetKind.PREDICTIONS,
             start=start,
             end=end,
-            columns=[mm_schemas.EventFieldType.LATENCY],
+            columns=[mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT],
             filter_query=f"endpoint_id=='{endpoint_id}'",
             agg_funcs=agg_funcs,
             sliding_window_step=aggregation_window,
@@ -734,10 +757,10 @@ class V3IOTSDBConnector(TSDBConnector):
                 type=mm_schemas.ModelEndpointMonitoringMetricType.METRIC,
             )
 
-        latency_column = (
-            f"{agg_funcs[0]}({mm_schemas.EventFieldType.LATENCY})"
+        estimated_prediction_count = (
+            f"{agg_funcs[0]}({mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT})"
             if agg_funcs
-            else mm_schemas.EventFieldType.LATENCY
+            else mm_schemas.EventFieldType.ESTIMATED_PREDICTION_COUNT
         )
 
         return mm_schemas.ModelEndpointMonitoringMetricValues(
@@ -745,7 +768,7 @@ class V3IOTSDBConnector(TSDBConnector):
             values=list(
                 zip(
                     df.index,
-                    df[latency_column],
+                    df[estimated_prediction_count],
                 )
             ),  # pyright: ignore[reportArgumentType]
         )
@@ -756,15 +779,13 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=mm_schemas.FileTargetKind.PREDICTIONS,
             start=start,
             end=end,
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             agg_funcs=["last"],
         )
         if not df.empty:
@@ -791,9 +812,7 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
         start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
         start, end = self._get_start_end(start, end)
         df = self._get_records(
@@ -801,7 +820,7 @@ class V3IOTSDBConnector(TSDBConnector):
             start=start,
             end=end,
             columns=[mm_schemas.ResultData.RESULT_STATUS],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             agg_funcs=["max"],
             group_by="endpoint_id",
         )
@@ -813,17 +832,18 @@ class V3IOTSDBConnector(TSDBConnector):
 
     def get_metrics_metadata(
         self,
-        endpoint_id: str,
+        endpoint_id: Union[str, list[str]],
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
         start, end = self._get_start_end(start, end)
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_id)
         df = self._get_records(
             table=mm_schemas.V3IOTSDBTables.METRICS,
             start=start,
             end=end,
             columns=[mm_schemas.MetricData.METRIC_VALUE],
-            filter_query=f"endpoint_id=='{endpoint_id}'",
+            filter_query=filter_query,
             agg_funcs=["last"],
         )
         if not df.empty:
@@ -834,11 +854,12 @@ class V3IOTSDBConnector(TSDBConnector):
 
     def get_results_metadata(
         self,
-        endpoint_id: str,
+        endpoint_id: Union[str, list[str]],
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
         start, end = self._get_start_end(start, end)
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_id)
         df = self._get_records(
             table=mm_schemas.V3IOTSDBTables.APP_RESULTS,
             start=start,
@@ -846,7 +867,7 @@ class V3IOTSDBConnector(TSDBConnector):
             columns=[
                 mm_schemas.ResultData.RESULT_KIND,
             ],
-            filter_query=f"endpoint_id=='{endpoint_id}'",
+            filter_query=filter_query,
             agg_funcs=["last"],
         )
         if not df.empty:
@@ -864,17 +885,18 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
+        if filter_query:
+            filter_query += f"AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'"
+        else:
+            filter_query = f"{mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'"
         start, end = self._get_start_end(start, end)
         df = self._get_records(
             table=mm_schemas.FileTargetKind.ERRORS,
             start=start,
             end=end,
             columns=[mm_schemas.EventFieldType.ERROR_COUNT],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]}) "
-            f"AND {mm_schemas.EventFieldType.ERROR_TYPE} == '{mm_schemas.EventFieldType.INFER_ERROR}'",
+            filter_query=filter_query,
             agg_funcs=["count"],
         )
         if not df.empty:
@@ -893,9 +915,7 @@ class V3IOTSDBConnector(TSDBConnector):
         start: Optional[datetime] = None,
         end: Optional[datetime] = None,
     ) -> pd.DataFrame:
-        endpoint_ids = (
-            endpoint_ids if isinstance(endpoint_ids, list) else [endpoint_ids]
-        )
+        filter_query = self._get_endpoint_filter(endpoint_id=endpoint_ids)
        start = start or (mlrun.utils.datetime_now() - timedelta(hours=24))
        start, end = self._get_start_end(start, end)
        df = self._get_records(
@@ -903,7 +923,7 @@ class V3IOTSDBConnector(TSDBConnector):
             start=start,
             end=end,
             columns=[mm_schemas.EventFieldType.LATENCY],
-            filter_query=f"endpoint_id IN({str(endpoint_ids)[1:-1]})",
+            filter_query=filter_query,
             agg_funcs=["avg"],
         )
         if not df.empty:
mlrun/model_monitoring/helpers.py CHANGED
@@ -32,7 +32,7 @@ import mlrun.utils.helpers
 from mlrun.common.schemas import ModelEndpoint
 from mlrun.common.schemas.model_monitoring.model_endpoints import (
     ModelEndpointMonitoringMetric,
-    _compose_full_name,
+    compose_full_name,
 )
 from mlrun.utils import logger
 
@@ -450,7 +450,7 @@ def get_default_result_instance_fqn(model_endpoint_id: str) -> str:
 
 
 def get_invocations_fqn(project: str) -> str:
-    return _compose_full_name(
+    return compose_full_name(
         project=project,
         app=mm_constants.SpecialApps.MLRUN_INFRA,
         name=mm_constants.PredictionsQueryConstants.INVOCATIONS,
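
Editor's note: the rename promotes the helper from private to public API; its call sites pass project, app, name, and a type. If the fully qualified name follows the "<project>.<app>.<type>.<name>" convention that result FQNs elsewhere suggest (an assumption, not confirmed by this diff), the invocations FQN would look roughly like:

    # Hedged sketch, not mlrun's actual implementation.
    def compose_full_name_sketch(*, project: str, app: str, name: str, type: str = "result") -> str:
        return ".".join([project, app, type, name])

    print(compose_full_name_sketch(
        project="my-project", app="mlrun-infra", name="invocations", type="metric"
    ))
    # my-project.mlrun-infra.metric.invocations
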
mlrun/model_monitoring/stream_processing.py CHANGED
@@ -430,6 +430,10 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         if not isinstance(feature, list):
             feature = [feature]
 
+        effective_sample_count, estimated_prediction_count = (
+            self._get_effective_and_estimated_counts(event=event)
+        )
+
         events.append(
             {
                 EventFieldType.FUNCTION_URI: function_uri,
@@ -453,6 +457,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
                 EventFieldType.ENTITIES: event.get("request", {}).get(
                     EventFieldType.ENTITIES, {}
                 ),
+                EventFieldType.EFFECTIVE_SAMPLE_COUNT: effective_sample_count,
+                EventFieldType.ESTIMATED_PREDICTION_COUNT: estimated_prediction_count,
             }
         )
 
@@ -507,6 +513,20 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
             self.error_count[endpoint_id] += 1
         return False
 
+    @staticmethod
+    def _get_effective_and_estimated_counts(event):
+        """
+        Calculate the `effective_sample_count` and the `estimated_prediction_count` based on the event's
+        sampling percentage. These values will be stored in the TSDB target.
+        Note that In non-batch serving, the `effective_sample_count` is always set to 1. In addition, when the sampling
+        percentage is 100%, the `estimated_prediction_count` is equal to the `effective_sample_count`.
+        """
+        effective_sample_count = event.get(EventFieldType.EFFECTIVE_SAMPLE_COUNT, 1)
+        estimated_prediction_count = effective_sample_count * (
+            100 / event.get(EventFieldType.SAMPLING_PERCENTAGE, 100)
+        )
+        return effective_sample_count, estimated_prediction_count
+
 
 def is_not_none(field: typing.Any, dict_path: list[str]):
     if field is not None:
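
Editor's note: the estimate scales the observed sample count by the inverse of the sampling ratio, so 5 sampled events at 10% sampling estimate 50 predictions. A standalone sketch of the helper above (the dict keys are the assumed string values of the EventFieldType constants):

    def estimated_counts(event: dict) -> tuple[int, float]:
        effective_sample_count = event.get("effective_sample_count", 1)
        sampling_percentage = event.get("sampling_percentage", 100)
        # Scale the observed sample count by the inverse sampling ratio.
        return effective_sample_count, effective_sample_count * (100 / sampling_percentage)

    assert estimated_counts({}) == (1, 1.0)  # non-batch, 100% sampling: no scaling
    assert estimated_counts(
        {"effective_sample_count": 5, "sampling_percentage": 10}
    ) == (5, 50.0)
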
@@ -672,6 +692,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
             )
         )
         self.first_request[endpoint_id] = True
+
         if attributes_to_update:
             logger.info(
                 "Updating endpoint record",
mlrun/projects/pipelines.py CHANGED
@@ -523,11 +523,12 @@ class _PipelineRunner(abc.ABC):
         text = _PipelineRunner._generate_workflow_finished_message(
             run.run_id, errors_counter, run._state
         )
-
         notifiers = notifiers or project.notifiers
         if notifiers:
             notifiers.push(text, "info", runs)
 
+        project.push_pipeline_notification_kfp_runner(run.run_id, run._state, text)
+
         if raise_error:
             raise raise_error
         return state or run._state, errors_counter, text
@@ -620,6 +621,8 @@ class _KFPRunner(_PipelineRunner):
             params.update(notification.secret_params)
             project.notifiers.add_notification(notification.kind, params)
 
+        project.spec.notifications = notifications
+
         run_id = _run_pipeline(
             workflow_handler,
             project=project.metadata.name,
@@ -647,13 +650,23 @@ class _KFPRunner(_PipelineRunner):
                 exc_info=err_to_str(exc),
             )
 
-        # TODO: we should check how can we get the run uid when we don't have the context (for example on
-        # mlrun.load_project() and later call directly to project.run)
+        # Pushing only relevant notification for the client (ipython and console)
+        project.notifiers.push_pipeline_start_message_from_client(
+            project.metadata.name, pipeline_id=run_id
+        )
+
         if context:
             project.notifiers.push_pipeline_start_message(
                 project.metadata.name,
                 context.uid,
             )
+        else:
+            project.push_pipeline_notification_kfp_runner(
+                run_id,
+                mlrun_pipelines.common.models.RunStatuses.running,
+                f"Workflow {run_id} started in project {project.metadata.name}",
+                notifications,
+            )
         pipeline_context.clear()
         return _PipelineRunStatus(run_id, cls, project=project, workflow=workflow_spec)
 
mlrun/projects/project.py CHANGED
@@ -83,6 +83,7 @@ from ..artifacts import (
     ModelArtifact,
 )
 from ..artifacts.manager import ArtifactManager, dict_to_artifact, extend_artifact_path
+from ..common.runtimes.constants import RunStates
 from ..datastore import store_manager
 from ..features import Feature
 from ..model import EntrypointParam, ImageBuilder, ModelObj
@@ -851,6 +852,7 @@ class ProjectSpec(ModelObj):
         build=None,
         custom_packagers: Optional[list[tuple[str, bool]]] = None,
         default_function_node_selector=None,
+        notifications=None,
     ):
         self.repo = None
 
@@ -891,6 +893,7 @@ class ProjectSpec(ModelObj):
         # whether it is mandatory for a run (raise exception on collection error) or not.
         self.custom_packagers = custom_packagers or []
         self._default_function_node_selector = default_function_node_selector or None
+        self.notifications = notifications or []
 
     @property
     def source(self) -> str:
@@ -1172,7 +1175,6 @@ class MlrunProject(ModelObj):
         self._artifact_manager = None
         self._notifiers = CustomNotificationPusher(
             [
-                NotificationTypes.slack,
                 NotificationTypes.console,
                 NotificationTypes.ipython,
             ]
@@ -2137,18 +2139,23 @@ class MlrunProject(ModelObj):
         db = mlrun.db.get_run_db(secrets=self._secrets)
         matching_results = []
         alerts = []
-        # TODO: Refactor to use a single request to improve performance at scale, ML-8473
-        for endpoint in endpoints.endpoints:
-            results_by_endpoint = db.get_model_endpoint_monitoring_metrics(
-                project=self.name, endpoint_id=endpoint.metadata.uid, type="results"
-            )
+        endpoint_ids = [endpoint.metadata.uid for endpoint in endpoints.endpoints]
+        # using separation to group by endpoint IDs:
+        # {"mep_id1": [...], "mep_id2": [...]}
+        results_by_endpoint = db.get_metrics_by_multiple_endpoints(
+            project=self.name,
+            endpoint_ids=endpoint_ids,
+            type="results",
+            events_format=mm_constants.GetEventsFormat.SEPARATION,
+        )
+        for endpoint_uid, results in results_by_endpoint.items():
             results_fqn_by_endpoint = [
                 get_result_instance_fqn(
-                    model_endpoint_id=endpoint.metadata.uid,
+                    model_endpoint_id=endpoint_uid,
                     app_name=result.app,
                     result_name=result.name,
                 )
-                for result in results_by_endpoint
+                for result in results
             ]
             matching_results += filter_results_by_regex(
                 existing_result_names=results_fqn_by_endpoint,
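
Editor's note: the per-endpoint loop of N requests collapses into one batched call; with events_format=SEPARATION the response is a mapping keyed by endpoint UID, roughly consumable like this (the result objects and values are illustrative):

    # Illustrative response shape for GetEventsFormat.SEPARATION:
    results_by_endpoint = {
        "mep_id1": [{"app": "histogram-data-drift", "name": "general_drift"}],
        "mep_id2": [{"app": "histogram-data-drift", "name": "general_drift"},
                    {"app": "my-app", "name": "bias"}],
    }
    for endpoint_uid, results in results_by_endpoint.items():
        for result in results:
            print(endpoint_uid, result["app"], result["name"])
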
@@ -2665,6 +2672,36 @@ class MlrunProject(ModelObj):
             timeout=timeout,
         )
 
+    def push_pipeline_notification_kfp_runner(
+        self,
+        pipeline_id: str,
+        current_run_state: mlrun_pipelines.common.models.RunStatuses,
+        message: str,
+        notifications: Optional[list] = None,
+    ):
+        """
+        Push notifications for a pipeline run(KFP).
+
+        :param pipeline_id: Unique ID of the pipeline run.
+        :param current_run_state: Current run state of the pipeline.
+        :param message: Message to send in the notification.
+        :param notifications: List of notifications to send.
+        """
+        current_run_state = RunStates.pipeline_run_status_to_run_state(
+            current_run_state
+        )
+        db = mlrun.get_run_db()
+        notifications = notifications or self.spec.notifications
+        notifications_to_send = []
+        for notification in notifications:
+            if current_run_state in notification.when:
+                notification_copy = notification.copy()
+                notification_copy.message = message
+                notifications_to_send.append(notification_copy)
+        db.push_pipeline_notifications(
+            pipeline_id, self.metadata.name, notifications_to_send
+        )
+
     def _instantiate_function(
         self,
         func: typing.Union[str, mlrun.runtimes.BaseRuntime] = None,
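
Editor's note: the method maps the KFP run status to an mlrun run state, then sends only the notifications whose "when" list subscribes to that state, each carrying the pipeline message. A standalone sketch of that selection step (Notification here is a stand-in for mlrun's notification object):

    from dataclasses import dataclass, replace

    @dataclass
    class Notification:  # stand-in for mlrun's notification object
        kind: str
        when: list[str]
        message: str = ""

    def select_notifications(
        notifications: list[Notification], state: str, message: str
    ) -> list[Notification]:
        # Only notifications subscribed to the current state are sent,
        # each with the pipeline message filled in.
        return [replace(n, message=message) for n in notifications if state in n.when]

    sent = select_notifications(
        [Notification("slack", when=["completed", "error"]),
         Notification("webhook", when=["error"])],
        state="completed",
        message="Workflow finished",
    )
    print([n.kind for n in sent])  # ['slack']
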
mlrun/runtimes/nuclio/serving.py CHANGED
@@ -309,7 +309,7 @@ class ServingRuntime(RemoteRuntime):
         self,
         stream_path: Optional[str] = None,
         batch: Optional[int] = None,
-        sample: Optional[int] = None,
+        sampling_percentage: float = 100,
         stream_args: Optional[dict] = None,
         tracking_policy: Optional[Union["TrackingPolicy", dict]] = None,
         enable_tracking: bool = True,
@@ -317,13 +317,13 @@ class ServingRuntime(RemoteRuntime):
         """Apply on your serving function to monitor a deployed model, including real-time dashboards to detect drift
         and analyze performance.
 
-        :param stream_path:     Path/url of the tracking stream e.g. v3io:///users/mike/mystream
-                                you can use the "dummy://" path for test/simulation.
-        :param batch:           Micro batch size (send micro batches of N records at a time).
-        :param sample:          Sample size (send only one of N records).
-        :param stream_args:     Stream initialization parameters, e.g. shards, retention_in_hours, ..
-        :param enable_tracking: Enabled/Disable model-monitoring tracking.
-                                Default True (tracking enabled).
+        :param stream_path:         Path/url of the tracking stream e.g. v3io:///users/mike/mystream
+                                    you can use the "dummy://" path for test/simulation.
+        :param batch:               Deprecated. Micro batch size (send micro batches of N records at a time).
+        :param sampling_percentage: Down sampling events that will be pushed to the monitoring stream based on
+                                    a specified percentage. e.g. 50 for 50%. By default, all events are pushed.
+        :param stream_args:         Stream initialization parameters, e.g. shards, retention_in_hours, ..
+        :param enable_tracking:     Enabled/Disable model-monitoring tracking. Default True (tracking enabled).
 
         Example::
 
@@ -336,12 +336,21 @@ class ServingRuntime(RemoteRuntime):
         # Applying model monitoring configurations
         self.spec.track_models = enable_tracking
 
+        if not 0 < sampling_percentage <= 100:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "`sampling_percentage` must be greater than 0 and less or equal to 100."
+            )
+        self.spec.parameters["sampling_percentage"] = sampling_percentage
+
         if stream_path:
             self.spec.parameters["log_stream"] = stream_path
         if batch:
-            self.spec.parameters["log_stream_batch"] = batch
-        if sample:
-            self.spec.parameters["log_stream_sample"] = sample
+            warnings.warn(
+                "The `batch` size parameter was deprecated in version 1.8.0 and is no longer used. "
+                "It will be removed in 1.10.",
+                # TODO: Remove this in 1.10
+                FutureWarning,
+            )
         if stream_args:
             self.spec.parameters["stream_args"] = stream_args
         if tracking_policy is not None:
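
Editor's note: a hedged usage sketch of the new parameter (function and model names are illustrative). With sampling_percentage=50, roughly half of the serving events reach the monitoring stream, and the stream-processing change above scales counts back up via estimated_prediction_count:

    import mlrun

    # Usage sketch; names and paths are illustrative.
    serving_fn = mlrun.import_function("hub://v2_model_server", new_name="serving")
    serving_fn.add_model("my-model", model_path="store://models/my-project/my-model")

    # Push ~50% of events to the monitoring stream. Values outside (0, 100]
    # raise MLRunInvalidArgumentError per the validation added above.
    serving_fn.set_tracking(sampling_percentage=50)
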