PyPI - mlrun - Versions diffs - 1.5.0rc1__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl - Mend

mlrun 1.5.0rc1py3-none-any.whl → 1.5.0rc2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic. Click here for more details.

Files changed (119) hide show

mlrun/__init__.py +2 -35
mlrun/__main__.py +1 -40
mlrun/api/api/api.py +6 -0
mlrun/api/api/endpoints/feature_store.py +0 -4
mlrun/api/api/endpoints/files.py +14 -2
mlrun/api/api/endpoints/functions.py +6 -1
mlrun/api/api/endpoints/logs.py +17 -3
mlrun/api/api/endpoints/pipelines.py +1 -5
mlrun/api/api/endpoints/projects.py +88 -0
mlrun/api/api/endpoints/runs.py +48 -6
mlrun/api/api/endpoints/workflows.py +355 -0
mlrun/api/api/utils.py +1 -1
mlrun/api/crud/__init__.py +1 -0
mlrun/api/crud/client_spec.py +3 -0
mlrun/api/crud/model_monitoring/deployment.py +36 -7
mlrun/api/crud/model_monitoring/grafana.py +1 -1
mlrun/api/crud/model_monitoring/helpers.py +32 -2
mlrun/api/crud/model_monitoring/model_endpoints.py +27 -5
mlrun/api/crud/notifications.py +9 -4
mlrun/api/crud/pipelines.py +4 -9
mlrun/api/crud/runtime_resources.py +4 -3
mlrun/api/crud/secrets.py +21 -0
mlrun/api/crud/workflows.py +352 -0
mlrun/api/db/base.py +16 -1
mlrun/api/db/sqldb/db.py +97 -16
mlrun/api/launcher.py +26 -7
mlrun/api/main.py +3 -4
mlrun/{mlutils → api/rundb}/__init__.py +2 -6
mlrun/{db → api/rundb}/sqldb.py +35 -83
mlrun/api/runtime_handlers/__init__.py +56 -0
mlrun/api/runtime_handlers/base.py +1247 -0
mlrun/api/runtime_handlers/daskjob.py +209 -0
mlrun/api/runtime_handlers/kubejob.py +37 -0
mlrun/api/runtime_handlers/mpijob.py +147 -0
mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
mlrun/api/runtime_handlers/sparkjob.py +148 -0
mlrun/api/utils/builder.py +1 -4
mlrun/api/utils/clients/chief.py +14 -0
mlrun/api/utils/scheduler.py +98 -15
mlrun/api/utils/singletons/db.py +4 -0
mlrun/artifacts/manager.py +1 -2
mlrun/common/schemas/__init__.py +6 -0
mlrun/common/schemas/auth.py +4 -1
mlrun/common/schemas/client_spec.py +1 -1
mlrun/common/schemas/model_monitoring/__init__.py +1 -0
mlrun/common/schemas/model_monitoring/constants.py +11 -0
mlrun/common/schemas/project.py +1 -0
mlrun/common/schemas/runs.py +1 -8
mlrun/common/schemas/schedule.py +1 -8
mlrun/common/schemas/workflow.py +54 -0
mlrun/config.py +42 -40
mlrun/datastore/sources.py +1 -1
mlrun/db/__init__.py +4 -68
mlrun/db/base.py +12 -0
mlrun/db/factory.py +65 -0
mlrun/db/httpdb.py +175 -19
mlrun/db/nopdb.py +4 -2
mlrun/execution.py +4 -2
mlrun/feature_store/__init__.py +1 -0
mlrun/feature_store/api.py +1 -2
mlrun/feature_store/feature_set.py +0 -10
mlrun/feature_store/feature_vector.py +340 -2
mlrun/feature_store/ingestion.py +5 -10
mlrun/feature_store/retrieval/base.py +118 -104
mlrun/feature_store/retrieval/dask_merger.py +17 -10
mlrun/feature_store/retrieval/job.py +4 -1
mlrun/feature_store/retrieval/local_merger.py +18 -18
mlrun/feature_store/retrieval/spark_merger.py +21 -14
mlrun/feature_store/retrieval/storey_merger.py +21 -15
mlrun/kfpops.py +3 -9
mlrun/launcher/base.py +3 -3
mlrun/launcher/client.py +3 -2
mlrun/launcher/factory.py +16 -13
mlrun/lists.py +0 -11
mlrun/model.py +9 -15
mlrun/model_monitoring/helpers.py +15 -25
mlrun/model_monitoring/model_monitoring_batch.py +72 -4
mlrun/model_monitoring/prometheus.py +219 -0
mlrun/model_monitoring/stores/__init__.py +15 -9
mlrun/model_monitoring/stores/sql_model_endpoint_store.py +3 -1
mlrun/model_monitoring/stream_processing.py +181 -29
mlrun/package/packager.py +6 -8
mlrun/package/packagers/default_packager.py +121 -10
mlrun/platforms/__init__.py +0 -2
mlrun/platforms/iguazio.py +0 -56
mlrun/projects/pipelines.py +57 -158
mlrun/projects/project.py +6 -32
mlrun/render.py +1 -1
mlrun/run.py +2 -124
mlrun/runtimes/__init__.py +6 -42
mlrun/runtimes/base.py +26 -1241
mlrun/runtimes/daskjob.py +2 -198
mlrun/runtimes/function.py +16 -5
mlrun/runtimes/kubejob.py +5 -29
mlrun/runtimes/mpijob/__init__.py +2 -2
mlrun/runtimes/mpijob/abstract.py +10 -1
mlrun/runtimes/mpijob/v1.py +0 -76
mlrun/runtimes/mpijob/v1alpha1.py +1 -74
mlrun/runtimes/nuclio.py +3 -2
mlrun/runtimes/pod.py +0 -10
mlrun/runtimes/remotesparkjob.py +1 -15
mlrun/runtimes/serving.py +1 -1
mlrun/runtimes/sparkjob/__init__.py +0 -1
mlrun/runtimes/sparkjob/abstract.py +4 -131
mlrun/serving/states.py +1 -1
mlrun/utils/db.py +0 -2
mlrun/utils/helpers.py +19 -13
mlrun/utils/notifications/notification_pusher.py +5 -25
mlrun/utils/regex.py +7 -2
mlrun/utils/version/version.json +2 -2
{mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +24 -23
{mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +116 -107
{mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
mlrun/mlutils/data.py +0 -160
mlrun/mlutils/models.py +0 -78
mlrun/mlutils/plots.py +0 -902
{mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
{mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
{mlrun-1.5.0rc1.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0

mlrun/model_monitoring/prometheus.py ADDED Viewed

@@ -0,0 +1,219 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import typing
+import prometheus_client
+from mlrun.common.schemas.model_monitoring import EventFieldType, PrometheusMetric
+# Path of the text file into which the registry is dumped after every metric update (see `_write_registry`)
+_registry_path = "/tmp/prom-reg.txt"
+# Dedicated Prometheus collector registry that holds all the model monitoring metrics declared below
+_registry: prometheus_client.CollectorRegistry = prometheus_client.CollectorRegistry()
+# The following real-time metrics are being updated through the monitoring stream graph steps
+# Counter of predictions, labeled by project, endpoint, model and endpoint type
+_prediction_counter: prometheus_client.Counter = prometheus_client.Counter(
+    name=PrometheusMetric.PREDICTIONS_TOTAL,
+    documentation="Counter for total predictions",
+    registry=_registry,
+    labelnames=[
+        EventFieldType.PROJECT,
+        EventFieldType.ENDPOINT_ID,
+        EventFieldType.MODEL,
+        EventFieldType.ENDPOINT_TYPE,
+    ],
+)
+# Summary of the observed model latency, using the same label set as the prediction counter
+_model_latency: prometheus_client.Summary = prometheus_client.Summary(
+    name=PrometheusMetric.MODEL_LATENCY_SECONDS,
+    documentation="Summary for for model latency",
+    registry=_registry,
+    labelnames=[
+        EventFieldType.PROJECT,
+        EventFieldType.ENDPOINT_ID,
+        EventFieldType.MODEL,
+        EventFieldType.ENDPOINT_TYPE,
+    ],
+)
+# Gauge of sampled feature and prediction values, one labeled sample per feature name (the `metric` label).
+# This is the only metric that is cleared by `clean_metrics` after the registry file has been read
+_income_features: prometheus_client.Gauge = prometheus_client.Gauge(
+    name=PrometheusMetric.INCOME_FEATURES,
+    documentation="Samples of features and predictions",
+    registry=_registry,
+    labelnames=[
+        EventFieldType.PROJECT,
+        EventFieldType.ENDPOINT_ID,
+        EventFieldType.METRIC,
+    ],
+)
+# Counter of errors, labeled by project, endpoint and model
+_error_counter: prometheus_client.Counter = prometheus_client.Counter(
+    name=PrometheusMetric.ERRORS_TOTAL,
+    documentation="Counter for total errors",
+    registry=_registry,
+    labelnames=[
+        EventFieldType.PROJECT,
+        EventFieldType.ENDPOINT_ID,
+        EventFieldType.MODEL,
+    ],
+)
+# The following metrics are being updated through the model monitoring batch job
+# Gauge of the drift analysis results, one labeled sample per metric name (the `metric` label)
+_batch_metrics: prometheus_client.Gauge = prometheus_client.Gauge(
+    name=PrometheusMetric.DRIFT_METRICS,
+    documentation="Results from the batch drift analysis",
+    registry=_registry,
+    labelnames=[
+        EventFieldType.PROJECT,
+        EventFieldType.ENDPOINT_ID,
+        EventFieldType.METRIC,
+    ],
+)
+# Enum metric of the drift status per model endpoint; only the states listed below are accepted
+_drift_status: prometheus_client.Enum = prometheus_client.Enum(
+    name=PrometheusMetric.DRIFT_STATUS,
+    documentation="Drift status of the model endpoint",
+    registry=_registry,
+    states=["NO_DRIFT", "DRIFT_DETECTED", "POSSIBLE_DRIFT"],
+    labelnames=[EventFieldType.PROJECT, EventFieldType.ENDPOINT_ID],
+)
+def _write_registry(func):
+    """
+    A decorator that updates the registry file each time a metric has been updated.
+    :param func: Function that updates one or more metrics of the module-level registry.
+    :return: Wrapped function which calls `func`, dumps the registry into the registry file and returns the
+             result of `func`.
+    """
+    # Imported locally so the decorator does not depend on a new module-level import
+    import functools
+    # Keep the name and the docstring of the decorated function
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        result = func(*args, **kwargs)
+        # Reached only if `func` did not raise, so a failed update never rewrites the file.
+        # The registry is only read here, therefore no `global` statement is required
+        prometheus_client.write_to_textfile(path=_registry_path, registry=_registry)
+        return result
+    return wrapper
+@_write_registry
+def write_predictions_and_latency_metrics(
+    project: str, endpoint_id: str, latency: int, model_name: str, endpoint_type: int
+):
+    """
+    Record a single prediction event of the provided model endpoint within the Prometheus registry.
+    The prediction counter is ALWAYS increased by 1, while the latency summary observes the latency time of the
+    event. Grafana dashboard will query the average latency time by dividing the total latency value by the total
+    amount of predictions.
+    :param project:       Project name.
+    :param endpoint_id:   Model endpoint unique id.
+    :param latency:       Latency time (microsecond) in which the event has been processed through the model server.
+    :param model_name:    Model name which will be used by Grafana for displaying the results by model.
+    :param endpoint_type: Endpoint type that is represented by an int (possible values: 1,2,3) corresponding to the
+                          Enum class :py:class:`~mlrun.common.schemas.model_monitoring.EndpointType`.
+    """
+    # Both metrics are addressed with the very same label values
+    endpoint_labels = dict(
+        project=project,
+        endpoint_id=endpoint_id,
+        model=model_name,
+        endpoint_type=endpoint_type,
+    )
+    # One more prediction for this endpoint
+    _prediction_counter.labels(**endpoint_labels).inc(1)
+    # Add the latency of the current event to the summary
+    _model_latency.labels(**endpoint_labels).observe(latency)
+@_write_registry
+def write_income_features(
+    project: str, endpoint_id: str, features: typing.Dict[str, float]
+):
+    """Set the income features gauge with a sample of feature values.
+    :param project:     Project name.
+    :param endpoint_id: Model endpoint unique id.
+    :param features:    Dictionary in which the key is a feature name and the value is a float number.
+    """
+    # Each feature gets its own labeled sample, where the feature name is used as the `metric` label
+    for feature_name, feature_value in features.items():
+        feature_gauge = _income_features.labels(
+            project=project, endpoint_id=endpoint_id, metric=feature_name
+        )
+        feature_gauge.set(value=feature_value)
+@_write_registry
+def write_drift_metrics(project: str, endpoint_id: str, metric: str, value: float):
+    """Set the value of a drift metric that has been calculated through the monitoring batch job
+    :param project:     Project name.
+    :param endpoint_id: Model endpoint unique id.
+    :param metric:      Metric name (e.g. TVD, Hellinger).
+    :param value:       Metric value as a float.
+    """
+    # Pick the labeled sample of this endpoint and metric, then overwrite its value
+    drift_gauge = _batch_metrics.labels(
+        project=project, endpoint_id=endpoint_id, metric=metric
+    )
+    drift_gauge.set(value=value)
+@_write_registry
+def write_drift_status(project: str, endpoint_id: str, drift_status: str):
+    """
+    Set the current state of the drift status enum of a specific model endpoint.
+    :param project:      Project name.
+    :param endpoint_id:  Model endpoint unique id.
+    :param drift_status: Drift status value, can be one of the following: 'NO_DRIFT', 'DRIFT_DETECTED', or
+                         'POSSIBLE_DRIFT'.
+    """
+    # Pick the labeled enum of this endpoint, then switch it to the provided state
+    endpoint_status = _drift_status.labels(project=project, endpoint_id=endpoint_id)
+    endpoint_status.state(drift_status)
+@_write_registry
+def write_errors(project: str, endpoint_id: str, model_name: str):
+    """
+    Increase the error counter of a specific model endpoint by 1.
+    :param project:     Project name.
+    :param endpoint_id: Model endpoint unique id.
+    :param model_name:  Model name. Will be used by Grafana to show the amount of errors per model by time.
+    """
+    # Pick the labeled counter of this endpoint and model, then count the error
+    endpoint_errors = _error_counter.labels(
+        project=project, endpoint_id=endpoint_id, model=model_name
+    )
+    endpoint_errors.inc(1)
+def get_registry() -> str:
+    """
+    Returns the content of the registry file, which is written according to the exposition format of Prometheus.
+    Once the file has been read, part of the metrics is reset through `clean_metrics` (which also rewrites the
+    registry file).
+    :return: The registry file content as a string.
+    :raise OSError: If the registry file cannot be opened or read (e.g. it has not been written yet).
+    """
+    # The text is stored in UTF-8 format, so decode it explicitly instead of relying on the locale default.
+    # The context manager guarantees that the file is closed even if the read fails
+    with open(_registry_path, encoding="utf-8") as registry_file:
+        lines = registry_file.read()
+    # Reset part of the metrics to avoid a repeating scraping of the same value
+    clean_metrics()
+    return lines
+@_write_registry
+def clean_metrics():
+    """
+    Remove all the labeled samples of the income features gauge.
+    These values are relevant only for a certain timestamp, hence they are dropped from the global registry after
+    they have been scraped by Prometheus.
+    """
+    _income_features.clear()

mlrun/model_monitoring/stores/__init__.py CHANGED Viewed

@@ -17,6 +17,7 @@
 import enum
 import typing
+import mlrun.common.schemas.secret
 import mlrun.errors
 from .model_endpoint_store import ModelEndpointStore
@@ -33,6 +34,7 @@ class ModelEndpointStoreType(enum.Enum):
         project: str,
         access_key: str = None,
         endpoint_store_connection: str = None,
+        secret_provider: typing.Callable = None,
     ) -> ModelEndpointStore:
         """
         Return a ModelEndpointStore object based on the provided enum value.
@@ -46,6 +48,7 @@ class ModelEndpointStoreType(enum.Enum):
                                           e.g. A root user with password 1234, tries to connect a schema called
                                           mlrun within a local MySQL DB instance:
                                           'mysql+pymysql://root:1234@localhost:3306/mlrun'.
+        :param secret_provider:           An optional secret provider to get the connection string secret.
         :return: `ModelEndpointStore` object.
@@ -61,15 +64,13 @@ class ModelEndpointStoreType(enum.Enum):
         # Assuming SQL store target if store type is not KV.
         # Update these lines once there are more than two store target types.
-        from mlrun.model_monitoring.helpers import get_connection_string
-        sql_connection_string = endpoint_store_connection or get_connection_string(
-            project=project
-        )
         from .sql_model_endpoint_store import SQLModelEndpointStore
         return SQLModelEndpointStore(
-            project=project, sql_connection_string=sql_connection_string
+            project=project,
+            sql_connection_string=endpoint_store_connection,
+            secret_provider=secret_provider,
         )
     @classmethod
@@ -84,13 +85,16 @@ class ModelEndpointStoreType(enum.Enum):
 def get_model_endpoint_store(
-    project: str, access_key: str = None
+    project: str,
+    access_key: str = None,
+    secret_provider: typing.Callable = None,
 ) -> ModelEndpointStore:
     """
     Getting the DB target type based on mlrun.config.model_endpoint_monitoring.store_type.
-    :param project:    The name of the project.
-    :param access_key: Access key with permission to the DB table.
+    :param project:         The name of the project.
+    :param access_key:      Access key with permission to the DB table.
+    :param secret_provider: An optional secret provider to get the connection string secret.
     :return: `ModelEndpointStore` object. Using this object, the user can apply different operations on the
              model endpoint record such as write, update, get and delete.
@@ -102,4 +106,6 @@ def get_model_endpoint_store(
     )
     # Convert into model endpoint store target object
-    return model_endpoint_store_type.to_endpoint_store(project, access_key)
+    return model_endpoint_store_type.to_endpoint_store(
+        project=project, access_key=access_key, secret_provider=secret_provider
+    )

mlrun/model_monitoring/stores/sql_model_endpoint_store.py CHANGED Viewed

@@ -45,12 +45,14 @@ class SQLModelEndpointStore(ModelEndpointStore):
         self,
         project: str,
         sql_connection_string: str = None,
+        secret_provider: typing.Callable = None,
     ):
         """
         Initialize SQL store target object.
         :param project:               The name of the project.
         :param sql_connection_string: Valid connection string or a path to SQL database with model endpoints table.
+        :param secret_provider:       An optional secret provider to get the connection string secret.
         """
         super().__init__(project=project)
@@ -58,7 +60,7 @@ class SQLModelEndpointStore(ModelEndpointStore):
         self.sql_connection_string = (
             sql_connection_string
             or mlrun.model_monitoring.helpers.get_connection_string(
-                project=self.project
+                secret_provider=secret_provider
             )
         )

mlrun/model_monitoring/stream_processing.py CHANGED Viewed

@@ -21,8 +21,13 @@ import typing
 import pandas as pd
 import storey
+import mlrun
 import mlrun.common.model_monitoring.helpers
+import mlrun.config
+import mlrun.datastore.targets
 import mlrun.feature_store.steps
+import mlrun.model_monitoring.prometheus
+import mlrun.utils
 import mlrun.utils.v3io_clients
 from mlrun.common.schemas.model_monitoring.constants import (
     EventFieldType,
@@ -41,9 +46,9 @@ class EventStreamProcessor:
         self,
         project: str,
         parquet_batching_max_events: int,
+        parquet_batching_timeout_secs: int,
         parquet_target: str,
         sample_window: int = 10,
-        parquet_batching_timeout_secs: int = 30 * 60,  # Default 30 minutes
         aggregate_windows: typing.Optional[typing.List[str]] = None,
         aggregate_period: str = "30s",
         model_monitoring_access_key: str = None,
@@ -74,6 +79,8 @@ class EventStreamProcessor:
             self._initialize_v3io_configurations(
                 model_monitoring_access_key=model_monitoring_access_key
             )
+        elif self.parquet_path.startswith("s3://"):
+            self.storage_options = mlrun.mlconf.get_s3_storage_options()
     def _initialize_v3io_configurations(
         self,
@@ -132,7 +139,7 @@ class EventStreamProcessor:
         of different operations that are executed on the events from the model server. Each event has
         metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
         Throughout the serving graph, the results are written to 3 different databases:
-        1. KV/SQL (steps 7-9): Stores metadata and stats about the average latency and the amount of predictions over
+        1. KV/SQL (steps 9-11): Stores metadata and stats about the average latency and the amount of predictions over
            time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used
            by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
            from other processes, such as current_stats that is being calculated by the monitoring batch job
@@ -140,12 +147,14 @@ class EventStreamProcessor:
            v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
            is stored within the database that was defined in the provided connection string and can be found
            under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
-        2. TSDB (steps 12-18): Stores live data of different key metric dictionaries in tsdb target. Results can be
-           found under v3io:///users/pipelines/project-name/model-endpoints/events/. At the moment, this part supports
-           3 different key metric dictionaries: base_metrics (average latency and predictions over time),
+        2. V3IO TSDB/Prometheus (steps 13-21): Stores live data of different key metric dictionaries in tsdb target.
+           This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB (steps 13-19), results
+           can be found under  v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
+           3 different key  metric dictionaries: base_metrics (average latency and predictions over time),
            endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
-           This data is also being used by the monitoring dashboards in grafana.
-        3. Parquet (steps 19-20): This Parquet file includes the required data for the model monitoring batch job
+           If using Prometheus (steps 20-21), we update metrics in the Prometheus registry that is stored in the
+           monitoring stream local memory.
+        3. Parquet (steps 22-23): This Parquet file includes the required data for the model monitoring batch job
            that run every hour by default. If defined, the parquet target path can be found under
            mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
            mlrun.mlconf.model_endpoint_monitoring.user_space.
@@ -155,17 +164,41 @@ class EventStreamProcessor:
         graph = fn.set_topology("flow")
-        # Step 1 - Process endpoint event: splitting into sub-events and validate event data
+        # Step 1 - Event routing based on the provided path
+        def apply_event_routing():
+            graph.add_step(
+                "EventRouting",
+                full_event=True,
+                project=self.project,
+            ).respond()
+        apply_event_routing()
+        # Step 2 - Keep only the events with no '-' in their path, i.e. the events that are supposed to be
+        # processed through the next steps of the stream graph
+        def apply_storey_filter_stream_events():
+            # Let through only the events whose path does not contain '-'
+            graph.add_step(
+                "storey.Filter",
+                "filter_stream_event",
+                _fn="('-' not in event.path)",
+                full_event=True,
+            )
+        apply_storey_filter_stream_events()
+        # Step 3 - Process endpoint event: splitting into sub-events and validate event data
         def apply_process_endpoint_event():
             graph.add_step(
                 "ProcessEndpointEvent",
                 full_event=True,
                 project=self.project,
+                after="filter_stream_event",
             )
         apply_process_endpoint_event()
-        # Steps 2,3 - Applying Storey operations of filtering and flatten
+        # Steps 4,5 - Applying Storey operations of filtering and flatten
         def apply_storey_filter_and_flatmap():
             # Remove none values from each event
             graph.add_step(
@@ -182,7 +215,7 @@ class EventStreamProcessor:
         apply_storey_filter_and_flatmap()
-        # Step 4 - Validating feature names and map each feature to its value
+        # Step 6 - Validating feature names and map each feature to its value
         def apply_map_feature_names():
             graph.add_step(
                 "MapFeatureNames",
@@ -194,9 +227,9 @@ class EventStreamProcessor:
         apply_map_feature_names()
-        # Step 5 - Calculate number of predictions and average latency
+        # Step 7 - Calculate number of predictions and average latency
         def apply_storey_aggregations():
-            # Step 5.1 - Calculate number of predictions and average latency for each window (5 min and 1 hour)
+            # Step 7.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
             graph.add_step(
                 class_name="storey.AggregateByKey",
                 aggregates=[
@@ -214,8 +247,7 @@ class EventStreamProcessor:
                 table=".",
                 key_field=EventFieldType.ENDPOINT_ID,
             )
-            # Step 5.2 - Rename the latency counter field to prediction counter
+            # Step 7.2 - Calculate average latency time for each window (5 min and 1 hour by default)
             graph.add_step(
                 class_name="storey.Rename",
                 mapping={
@@ -228,7 +260,7 @@ class EventStreamProcessor:
         apply_storey_aggregations()
-        # Step 6 - Emits the event in window size of events based on sample_window size (10 by default)
+        # Step 8 - Emits the event in window size of events based on sample_window size (10 by default)
         def apply_storey_sample_window():
             graph.add_step(
                 "storey.steps.SampleWindow",
@@ -240,8 +272,8 @@ class EventStreamProcessor:
         apply_storey_sample_window()
-        # Steps 7-9 - KV/SQL branch
-        # Step 7 - Filter relevant keys from the event before writing the data into the database table
+        # Steps 9-11 - KV/SQL branch
+        # Step 9 - Filter relevant keys from the event before writing the data into the database table
         def apply_process_before_endpoint_update():
             graph.add_step(
                 "ProcessBeforeEndpointUpdate",
@@ -251,7 +283,7 @@ class EventStreamProcessor:
         apply_process_before_endpoint_update()
-        # Step 8 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
+        # Step 10 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
         # about average latency and the amount of predictions over time
         def apply_update_endpoint():
             graph.add_step(
@@ -264,7 +296,7 @@ class EventStreamProcessor:
         apply_update_endpoint()
-        # Step 9 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
+        # Step 11 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
         # which will be used by Grafana monitoring dashboards
         def apply_infer_schema():
             graph.add_step(
@@ -279,10 +311,12 @@ class EventStreamProcessor:
         if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
             apply_infer_schema()
-        # Steps 11-18 - TSDB branch (not supported in CE environment at the moment)
+        # Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
+        # Steps 20-21 - Prometheus branch
         if not mlrun.mlconf.is_ce_mode():
-            # Step 11 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
+            # TSDB branch
+            # Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
             # stats and details about the events
             def apply_process_before_tsdb():
                 graph.add_step(
@@ -291,7 +325,7 @@ class EventStreamProcessor:
             apply_process_before_tsdb()
-            # Steps 12-18: - Unpacked keys from each dictionary and write to TSDB target
+            # Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
             def apply_filter_and_unpacked_keys(name, keys):
                 graph.add_step(
                     "FilterAndUnpackKeys",
@@ -322,21 +356,21 @@ class EventStreamProcessor:
                     key=EventFieldType.ENDPOINT_ID,
                 )
-            # Steps 12-13 - unpacked base_metrics dictionary
+            # Steps 13-14 - unpacked base_metrics dictionary
             apply_filter_and_unpacked_keys(
                 name="FilterAndUnpackKeys1",
                 keys=EventKeyMetrics.BASE_METRICS,
             )
             apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
-            # Steps 14-15 - unpacked endpoint_features dictionary
+            # Steps 15-16 - unpacked endpoint_features dictionary
             apply_filter_and_unpacked_keys(
                 name="FilterAndUnpackKeys2",
                 keys=EventKeyMetrics.ENDPOINT_FEATURES,
             )
             apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
-            # Steps 16-18 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
+            # Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
             apply_filter_and_unpacked_keys(
                 name="FilterAndUnpackKeys3",
                 keys=EventKeyMetrics.CUSTOM_METRICS,
@@ -352,9 +386,30 @@ class EventStreamProcessor:
             apply_storey_filter()
             apply_tsdb_target(name="tsdb3", after="FilterNotNone")
+        else:
+            # Prometheus branch
+            # Step 20 - Increase the prediction counter by 1 and update the latency value
+            graph.add_step(
+                "IncCounter",
+                name="IncCounter",
+                after="MapFeatureNames",
+                project=self.project,
+            )
-        # Steps 19-20 - Parquet branch
-        # Step 19 - Filter and validate different keys before writing the data to Parquet target
+            # Step 21 - Record a sample of features and labels
+            def apply_record_features_to_prometheus():
+                graph.add_step(
+                    "RecordFeatures",
+                    name="RecordFeaturesToPrometheus",
+                    after="sample",
+                    project=self.project,
+                )
+            apply_record_features_to_prometheus()
+        # Steps 22-23 - Parquet branch
+        # Step 22 - Filter and validate different keys before writing the data to Parquet target
         def apply_process_before_parquet():
             graph.add_step(
                 "ProcessBeforeParquet",
@@ -365,7 +420,7 @@ class EventStreamProcessor:
         apply_process_before_parquet()
-        # Step 20 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
+        # Step 23 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
         def apply_parquet_target():
             graph.add_step(
                 "storey.ParquetTarget",
@@ -615,6 +670,11 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         error = event.get("error")
         if error:
             self.error_count[endpoint_id] += 1
+            mlrun.model_monitoring.prometheus.write_errors(
+                project=self.project,
+                endpoint_id=event["endpoint_id"],
+                model_name=event["model"],
+            )
             raise mlrun.errors.MLRunInvalidArgumentError(str(error))
         # Validate event fields
@@ -1068,6 +1128,98 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
         return event
+class EventRouting(mlrun.feature_store.steps.MapClass):
+    """
+    Route the event according to the path that is configured under event.path. Please note that this step returns
+    the result to the caller. At the moment the following paths are handled:
+    - /model-monitoring-metrics (GET): return the Prometheus registry results as a text. Will be used by Prometheus
+     client to scrape the results from the monitoring stream memory.
+    - /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics, such
+     as the metrics from the monitoring batch job. Note that the event body is a list of dictionaries of different
+     metrics.
+    - /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.
+    An event with any other path is returned as is.
+    """
+    def __init__(self, project: str, **kwargs):
+        super().__init__(**kwargs)
+        self.project: str = project
+    def do(self, event):
+        prometheus = mlrun.model_monitoring.prometheus
+        path = event.path
+        if path == "/model-monitoring-metrics":
+            # Respond with the content of the Prometheus registry file
+            event.body = prometheus.get_registry()
+            return event
+        if path == "/monitoring-batch-metrics":
+            # Each item of the body describes a single statistical metric
+            for event_metric in event.body:
+                prometheus.write_drift_metrics(
+                    project=self.project,
+                    endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
+                    metric=event_metric[EventFieldType.METRIC],
+                    value=event_metric[EventFieldType.VALUE],
+                )
+            return event
+        if path == "/monitoring-drift-status":
+            # The body holds the endpoint id along with its new drift status
+            prometheus.write_drift_status(
+                project=self.project,
+                endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
+                drift_status=event.body[EventFieldType.DRIFT_STATUS],
+            )
+        return event
+class IncCounter(mlrun.feature_store.steps.MapClass):
+    """Count the prediction of the event and record its latency value in the Prometheus registry"""
+    def __init__(self, project: str, **kwargs):
+        super().__init__(**kwargs)
+        self.project: str = project
+    def do(self, event):
+        # Collect the values from the event, then update both the prediction counter and the latency summary
+        prediction_details = dict(
+            project=self.project,
+            endpoint_id=event[EventFieldType.ENDPOINT_ID],
+            latency=event[EventFieldType.LATENCY],
+            model_name=event[EventFieldType.MODEL],
+            endpoint_type=event[EventFieldType.ENDPOINT_TYPE],
+        )
+        mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
+            **prediction_details
+        )
+        # The event itself is passed on untouched
+        return event
+class RecordFeatures(mlrun.feature_store.steps.MapClass):
+    """Write a sample of the named features and predictions of the event into the Prometheus registry"""
+    def __init__(self, project: str, **kwargs):
+        super().__init__(**kwargs)
+        self.project: str = project
+    def do(self, event):
+        named_predictions = event[EventFieldType.NAMED_PREDICTIONS]
+        named_features = event[EventFieldType.NAMED_FEATURES]
+        # Merge into a single dictionary; on a name collision the feature value overrides the prediction value
+        features = {**named_predictions, **named_features}
+        mlrun.model_monitoring.prometheus.write_income_features(
+            project=self.project,
+            endpoint_id=event[EventFieldType.ENDPOINT_ID],
+            features=features,
+        )
+        # The event itself is passed on untouched
+        return event
 def update_endpoint_record(
     project: str,
     endpoint_id: str,

mlrun 1.5.0rc1__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl

Potentially problematic release.

mlrun 1.5.0rc1py3-none-any.whl → 1.5.0rc2py3-none-any.whl