mlrun 1.7.2rc3__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlrun/__init__.py +26 -22
- mlrun/__main__.py +15 -16
- mlrun/alerts/alert.py +150 -15
- mlrun/api/schemas/__init__.py +1 -9
- mlrun/artifacts/__init__.py +2 -3
- mlrun/artifacts/base.py +62 -19
- mlrun/artifacts/dataset.py +17 -17
- mlrun/artifacts/document.py +454 -0
- mlrun/artifacts/manager.py +28 -18
- mlrun/artifacts/model.py +91 -59
- mlrun/artifacts/plots.py +2 -2
- mlrun/common/constants.py +8 -0
- mlrun/common/formatters/__init__.py +1 -0
- mlrun/common/formatters/artifact.py +1 -1
- mlrun/common/formatters/feature_set.py +2 -0
- mlrun/common/formatters/function.py +1 -0
- mlrun/{model_monitoring/db/stores/v3io_kv/__init__.py → common/formatters/model_endpoint.py} +17 -0
- mlrun/common/formatters/pipeline.py +1 -2
- mlrun/common/formatters/project.py +9 -0
- mlrun/common/model_monitoring/__init__.py +0 -5
- mlrun/common/model_monitoring/helpers.py +12 -62
- mlrun/common/runtimes/constants.py +25 -4
- mlrun/common/schemas/__init__.py +9 -5
- mlrun/common/schemas/alert.py +114 -19
- mlrun/common/schemas/api_gateway.py +3 -3
- mlrun/common/schemas/artifact.py +22 -9
- mlrun/common/schemas/auth.py +8 -4
- mlrun/common/schemas/background_task.py +7 -7
- mlrun/common/schemas/client_spec.py +4 -4
- mlrun/common/schemas/clusterization_spec.py +2 -2
- mlrun/common/schemas/common.py +53 -3
- mlrun/common/schemas/constants.py +15 -0
- mlrun/common/schemas/datastore_profile.py +1 -1
- mlrun/common/schemas/feature_store.py +9 -9
- mlrun/common/schemas/frontend_spec.py +4 -4
- mlrun/common/schemas/function.py +10 -10
- mlrun/common/schemas/hub.py +1 -1
- mlrun/common/schemas/k8s.py +3 -3
- mlrun/common/schemas/memory_reports.py +3 -3
- mlrun/common/schemas/model_monitoring/__init__.py +4 -8
- mlrun/common/schemas/model_monitoring/constants.py +127 -46
- mlrun/common/schemas/model_monitoring/grafana.py +18 -12
- mlrun/common/schemas/model_monitoring/model_endpoints.py +154 -160
- mlrun/common/schemas/notification.py +24 -3
- mlrun/common/schemas/object.py +1 -1
- mlrun/common/schemas/pagination.py +4 -4
- mlrun/common/schemas/partition.py +142 -0
- mlrun/common/schemas/pipeline.py +3 -3
- mlrun/common/schemas/project.py +26 -18
- mlrun/common/schemas/runs.py +3 -3
- mlrun/common/schemas/runtime_resource.py +5 -5
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/secret.py +1 -1
- mlrun/{model_monitoring/db/stores/sqldb/__init__.py → common/schemas/serving.py} +10 -1
- mlrun/common/schemas/tag.py +3 -3
- mlrun/common/schemas/workflow.py +6 -5
- mlrun/common/types.py +1 -0
- mlrun/config.py +157 -89
- mlrun/data_types/__init__.py +5 -3
- mlrun/data_types/infer.py +13 -3
- mlrun/data_types/spark.py +2 -1
- mlrun/datastore/__init__.py +59 -18
- mlrun/datastore/alibaba_oss.py +4 -1
- mlrun/datastore/azure_blob.py +4 -1
- mlrun/datastore/base.py +19 -24
- mlrun/datastore/datastore.py +10 -4
- mlrun/datastore/datastore_profile.py +178 -45
- mlrun/datastore/dbfs_store.py +4 -1
- mlrun/datastore/filestore.py +4 -1
- mlrun/datastore/google_cloud_storage.py +4 -1
- mlrun/datastore/hdfs.py +4 -1
- mlrun/datastore/inmem.py +4 -1
- mlrun/datastore/redis.py +4 -1
- mlrun/datastore/s3.py +14 -3
- mlrun/datastore/sources.py +89 -92
- mlrun/datastore/store_resources.py +7 -4
- mlrun/datastore/storeytargets.py +51 -16
- mlrun/datastore/targets.py +38 -31
- mlrun/datastore/utils.py +87 -4
- mlrun/datastore/v3io.py +4 -1
- mlrun/datastore/vectorstore.py +291 -0
- mlrun/datastore/wasbfs/fs.py +13 -12
- mlrun/db/base.py +286 -100
- mlrun/db/httpdb.py +1562 -490
- mlrun/db/nopdb.py +250 -83
- mlrun/errors.py +6 -2
- mlrun/execution.py +194 -50
- mlrun/feature_store/__init__.py +2 -10
- mlrun/feature_store/api.py +20 -458
- mlrun/feature_store/common.py +9 -9
- mlrun/feature_store/feature_set.py +20 -18
- mlrun/feature_store/feature_vector.py +105 -479
- mlrun/feature_store/feature_vector_utils.py +466 -0
- mlrun/feature_store/retrieval/base.py +15 -11
- mlrun/feature_store/retrieval/job.py +2 -1
- mlrun/feature_store/retrieval/storey_merger.py +1 -1
- mlrun/feature_store/steps.py +3 -3
- mlrun/features.py +30 -13
- mlrun/frameworks/__init__.py +1 -2
- mlrun/frameworks/_common/__init__.py +1 -2
- mlrun/frameworks/_common/artifacts_library.py +2 -2
- mlrun/frameworks/_common/mlrun_interface.py +10 -6
- mlrun/frameworks/_common/model_handler.py +31 -31
- mlrun/frameworks/_common/producer.py +3 -1
- mlrun/frameworks/_dl_common/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
- mlrun/frameworks/_ml_common/__init__.py +1 -2
- mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
- mlrun/frameworks/_ml_common/model_handler.py +21 -21
- mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/auto_mlrun/__init__.py +1 -2
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
- mlrun/frameworks/huggingface/__init__.py +1 -2
- mlrun/frameworks/huggingface/model_server.py +9 -9
- mlrun/frameworks/lgbm/__init__.py +47 -44
- mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
- mlrun/frameworks/lgbm/model_handler.py +15 -11
- mlrun/frameworks/lgbm/model_server.py +11 -7
- mlrun/frameworks/lgbm/utils.py +2 -2
- mlrun/frameworks/onnx/__init__.py +1 -2
- mlrun/frameworks/onnx/dataset.py +3 -3
- mlrun/frameworks/onnx/mlrun_interface.py +2 -2
- mlrun/frameworks/onnx/model_handler.py +7 -5
- mlrun/frameworks/onnx/model_server.py +8 -6
- mlrun/frameworks/parallel_coordinates.py +11 -11
- mlrun/frameworks/pytorch/__init__.py +22 -23
- mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
- mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
- mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
- mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
- mlrun/frameworks/pytorch/model_handler.py +21 -17
- mlrun/frameworks/pytorch/model_server.py +13 -9
- mlrun/frameworks/sklearn/__init__.py +19 -18
- mlrun/frameworks/sklearn/estimator.py +2 -2
- mlrun/frameworks/sklearn/metric.py +3 -3
- mlrun/frameworks/sklearn/metrics_library.py +8 -6
- mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
- mlrun/frameworks/sklearn/model_handler.py +4 -3
- mlrun/frameworks/tf_keras/__init__.py +11 -12
- mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
- mlrun/frameworks/tf_keras/model_handler.py +17 -13
- mlrun/frameworks/tf_keras/model_server.py +12 -8
- mlrun/frameworks/xgboost/__init__.py +19 -18
- mlrun/frameworks/xgboost/model_handler.py +13 -9
- mlrun/k8s_utils.py +2 -5
- mlrun/launcher/base.py +3 -4
- mlrun/launcher/client.py +2 -2
- mlrun/launcher/local.py +6 -2
- mlrun/launcher/remote.py +1 -1
- mlrun/lists.py +8 -4
- mlrun/model.py +132 -46
- mlrun/model_monitoring/__init__.py +3 -5
- mlrun/model_monitoring/api.py +113 -98
- mlrun/model_monitoring/applications/__init__.py +0 -5
- mlrun/model_monitoring/applications/_application_steps.py +81 -50
- mlrun/model_monitoring/applications/base.py +467 -14
- mlrun/model_monitoring/applications/context.py +212 -134
- mlrun/model_monitoring/{db/stores/base → applications/evidently}/__init__.py +6 -2
- mlrun/model_monitoring/applications/evidently/base.py +146 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +89 -56
- mlrun/model_monitoring/applications/results.py +67 -15
- mlrun/model_monitoring/controller.py +701 -315
- mlrun/model_monitoring/db/__init__.py +0 -2
- mlrun/model_monitoring/db/_schedules.py +242 -0
- mlrun/model_monitoring/db/_stats.py +189 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +33 -22
- mlrun/model_monitoring/db/tsdb/base.py +243 -49
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +76 -36
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +213 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +534 -88
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +436 -106
- mlrun/model_monitoring/helpers.py +356 -114
- mlrun/model_monitoring/stream_processing.py +190 -345
- mlrun/model_monitoring/tracking_policy.py +11 -4
- mlrun/model_monitoring/writer.py +49 -90
- mlrun/package/__init__.py +3 -6
- mlrun/package/context_handler.py +2 -2
- mlrun/package/packager.py +12 -9
- mlrun/package/packagers/__init__.py +0 -2
- mlrun/package/packagers/default_packager.py +14 -11
- mlrun/package/packagers/numpy_packagers.py +16 -7
- mlrun/package/packagers/pandas_packagers.py +18 -18
- mlrun/package/packagers/python_standard_library_packagers.py +25 -11
- mlrun/package/packagers_manager.py +35 -32
- mlrun/package/utils/__init__.py +0 -3
- mlrun/package/utils/_pickler.py +6 -6
- mlrun/platforms/__init__.py +47 -16
- mlrun/platforms/iguazio.py +4 -1
- mlrun/projects/operations.py +30 -30
- mlrun/projects/pipelines.py +116 -47
- mlrun/projects/project.py +1292 -329
- mlrun/render.py +5 -9
- mlrun/run.py +57 -14
- mlrun/runtimes/__init__.py +1 -3
- mlrun/runtimes/base.py +30 -22
- mlrun/runtimes/daskjob.py +9 -9
- mlrun/runtimes/databricks_job/databricks_runtime.py +6 -5
- mlrun/runtimes/function_reference.py +5 -2
- mlrun/runtimes/generators.py +3 -2
- mlrun/runtimes/kubejob.py +6 -7
- mlrun/runtimes/mounts.py +574 -0
- mlrun/runtimes/mpijob/__init__.py +0 -2
- mlrun/runtimes/mpijob/abstract.py +7 -6
- mlrun/runtimes/nuclio/api_gateway.py +7 -7
- mlrun/runtimes/nuclio/application/application.py +11 -13
- mlrun/runtimes/nuclio/application/reverse_proxy.go +66 -64
- mlrun/runtimes/nuclio/function.py +127 -70
- mlrun/runtimes/nuclio/serving.py +105 -37
- mlrun/runtimes/pod.py +159 -54
- mlrun/runtimes/remotesparkjob.py +3 -2
- mlrun/runtimes/sparkjob/__init__.py +0 -2
- mlrun/runtimes/sparkjob/spark3job.py +22 -12
- mlrun/runtimes/utils.py +7 -6
- mlrun/secrets.py +2 -2
- mlrun/serving/__init__.py +8 -0
- mlrun/serving/merger.py +7 -5
- mlrun/serving/remote.py +35 -22
- mlrun/serving/routers.py +186 -240
- mlrun/serving/server.py +41 -10
- mlrun/serving/states.py +432 -118
- mlrun/serving/utils.py +13 -2
- mlrun/serving/v1_serving.py +3 -2
- mlrun/serving/v2_serving.py +161 -203
- mlrun/track/__init__.py +1 -1
- mlrun/track/tracker.py +2 -2
- mlrun/track/trackers/mlflow_tracker.py +6 -5
- mlrun/utils/async_http.py +35 -22
- mlrun/utils/clones.py +7 -4
- mlrun/utils/helpers.py +511 -58
- mlrun/utils/logger.py +119 -13
- mlrun/utils/notifications/notification/__init__.py +22 -19
- mlrun/utils/notifications/notification/base.py +39 -15
- mlrun/utils/notifications/notification/console.py +6 -6
- mlrun/utils/notifications/notification/git.py +11 -11
- mlrun/utils/notifications/notification/ipython.py +10 -9
- mlrun/utils/notifications/notification/mail.py +176 -0
- mlrun/utils/notifications/notification/slack.py +16 -8
- mlrun/utils/notifications/notification/webhook.py +24 -8
- mlrun/utils/notifications/notification_pusher.py +191 -200
- mlrun/utils/regex.py +12 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/METADATA +81 -54
- mlrun-1.8.0.dist-info/RECORD +351 -0
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/WHEEL +1 -1
- mlrun/model_monitoring/applications/evidently_base.py +0 -137
- mlrun/model_monitoring/db/stores/__init__.py +0 -136
- mlrun/model_monitoring/db/stores/base/store.py +0 -213
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
- mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
- mlrun/model_monitoring/model_endpoint.py +0 -118
- mlrun-1.7.2rc3.dist-info/RECORD +0 -351
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info/licenses}/LICENSE +0 -0
- {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/stream_processing.py

@@ -12,32 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import collections
 import datetime
-import json
-import os
 import typing
 
-import storey
-
 import mlrun
 import mlrun.common.model_monitoring.helpers
-import mlrun.config
-import mlrun.datastore.targets
 import mlrun.feature_store as fstore
 import mlrun.feature_store.steps
-import mlrun.model_monitoring.db
 import mlrun.serving.states
 import mlrun.utils
 from mlrun.common.schemas.model_monitoring.constants import (
+    ControllerEvent,
+    ControllerEventKind,
+    EndpointType,
     EventFieldType,
-    EventKeyMetrics,
-    EventLiveStats,
     FileTargetKind,
-    ModelEndpointTarget,
     ProjectSecretKeys,
 )
-from mlrun.model_monitoring.db import
+from mlrun.model_monitoring.db import TSDBConnector
 from mlrun.utils import logger
 
 
@@ -51,7 +43,7 @@ class EventStreamProcessor:
         parquet_target: str,
         aggregate_windows: typing.Optional[list[str]] = None,
         aggregate_period: str = "5m",
-        model_monitoring_access_key: str = None,
+        model_monitoring_access_key: typing.Optional[str] = None,
     ):
         # General configurations, mainly used for the storey steps in the future serving graph
         self.project = project
@@ -69,14 +61,11 @@ class EventStreamProcessor:
             parquet_batching_max_events=self.parquet_batching_max_events,
         )
 
-        self.storage_options = None
         self.tsdb_configurations = {}
         if not mlrun.mlconf.is_ce_mode():
             self._initialize_v3io_configurations(
                 model_monitoring_access_key=model_monitoring_access_key
             )
-        elif self.parquet_path.startswith("s3://"):
-            self.storage_options = mlrun.mlconf.get_s3_storage_options()
 
     def _initialize_v3io_configurations(
         self,
@@ -85,33 +74,18 @@ class EventStreamProcessor:
         v3io_access_key: typing.Optional[str] = None,
         v3io_framesd: typing.Optional[str] = None,
         v3io_api: typing.Optional[str] = None,
-        model_monitoring_access_key: str = None,
+        model_monitoring_access_key: typing.Optional[str] = None,
     ):
         # Get the V3IO configurations
         self.v3io_framesd = v3io_framesd or mlrun.mlconf.v3io_framesd
         self.v3io_api = v3io_api or mlrun.mlconf.v3io_api
 
-        self.v3io_access_key = v3io_access_key or
+        self.v3io_access_key = v3io_access_key or mlrun.mlconf.get_v3io_access_key()
         self.model_monitoring_access_key = (
             model_monitoring_access_key
-            or
+            or mlrun.get_secret_or_env(ProjectSecretKeys.ACCESS_KEY)
            or self.v3io_access_key
         )
-        self.storage_options = dict(
-            v3io_access_key=self.model_monitoring_access_key, v3io_api=self.v3io_api
-        )
-
-        # KV path
-        kv_path = mlrun.mlconf.get_model_monitoring_file_target_path(
-            project=self.project, kind=FileTargetKind.ENDPOINTS
-        )
-        (
-            _,
-            self.kv_container,
-            self.kv_path,
-        ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
-            kv_path
-        )
 
         # TSDB path and configurations
         tsdb_path = mlrun.mlconf.get_model_monitoring_file_target_path(
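The access-key resolution above settles into a three-level fallback: an explicitly passed key, then a project secret or environment value, then the platform-level V3IO key. A minimal sketch of the same pattern outside mlrun (the environment-variable names here are illustrative, not the package's):

    import os

    def resolve_access_key(explicit_key=None):
        # 1. an explicitly passed key wins,
        # 2. otherwise a project secret / environment value,
        # 3. otherwise the platform-level V3IO key.
        return (
            explicit_key
            or os.environ.get("MODEL_MONITORING_ACCESS_KEY")  # illustrative name
            or os.environ.get("V3IO_ACCESS_KEY")
        )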
@@ -133,7 +107,7 @@ class EventStreamProcessor:
         self,
         fn: mlrun.runtimes.ServingRuntime,
         tsdb_connector: TSDBConnector,
-
+        controller_stream_uri: str,
     ) -> None:
         """
         Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
@@ -162,31 +136,25 @@ class EventStreamProcessor:
 
         :param fn: A serving function.
         :param tsdb_connector: Time series database connector.
-        :param
+        :param controller_stream_uri: The controller stream URI. Runs on server api pod so needed to be provided as
+                                      input
         """
 
         graph = typing.cast(
             mlrun.serving.states.RootFlowStep,
-            fn.set_topology(mlrun.serving.states.StepKinds.flow),
-        )
-        graph.add_step(
-            "ExtractEndpointID",
-            "extract_endpoint",
-            full_event=True,
+            fn.set_topology(mlrun.serving.states.StepKinds.flow, engine="async"),
         )
 
         # split the graph between event with error vs valid event
         graph.add_step(
             "storey.Filter",
             "FilterError",
-            after="extract_endpoint",
             _fn="(event.get('error') is None)",
         )
 
         graph.add_step(
             "storey.Filter",
             "ForwardError",
-            after="extract_endpoint",
             _fn="(event.get('error') is not None)",
         )
 
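For orientation, this is how the same topology-and-split pattern looks when built by hand with the public mlrun API; a sketch, not code from the package:

    import mlrun

    # Sketch: an async-engine flow topology whose first step passes only
    # error-free events, mirroring the FilterError step above.
    fn = mlrun.new_function("stream-demo", kind="serving")
    graph = fn.set_topology("flow", engine="async")
    graph.add_step("storey.Filter", "FilterError", _fn="(event.get('error') is None)")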
@@ -198,7 +166,7 @@ class EventStreamProcessor:
         def apply_process_endpoint_event():
             graph.add_step(
                 "ProcessEndpointEvent",
-                after="
+                after="FilterError",
                 full_event=True,
                 project=self.project,
             )
@@ -233,79 +201,25 @@ class EventStreamProcessor:
             )
 
         apply_map_feature_names()
+        # split the graph between event with error vs valid event
+        graph.add_step(
+            "storey.Filter",
+            "FilterNOP",
+            after="MapFeatureNames",
+            _fn="(event.get('kind', '') != 'nop_event')",
+        )
+        graph.add_step(
+            "storey.Filter",
+            "ForwardNOP",
+            after="MapFeatureNames",
+            _fn="(event.get('kind', '') == 'nop_event')",
+        )
 
-
-
-
-
-
-                aggregates=[
-                    {
-                        "name": EventFieldType.LATENCY,
-                        "column": EventFieldType.LATENCY,
-                        "operations": ["count", "avg"],
-                        "windows": self.aggregate_windows,
-                        "period": self.aggregate_period,
-                    }
-                ],
-                name=EventFieldType.LATENCY,
-                after="MapFeatureNames",
-                step_name="Aggregates",
-                table=".",
-                key_field=EventFieldType.ENDPOINT_ID,
-            )
-            # Calculate average latency time for each window (5 min and 1 hour by default)
-            graph.add_step(
-                class_name="storey.Rename",
-                mapping={
-                    "latency_count_5m": EventLiveStats.PREDICTIONS_COUNT_5M,
-                    "latency_count_1h": EventLiveStats.PREDICTIONS_COUNT_1H,
-                },
-                name="Rename",
-                after=EventFieldType.LATENCY,
-            )
-
-        apply_storey_aggregations()
-
-        # KV/SQL branch
-        # Filter relevant keys from the event before writing the data into the database table
-        def apply_process_before_endpoint_update():
-            graph.add_step(
-                "ProcessBeforeEndpointUpdate",
-                name="ProcessBeforeEndpointUpdate",
-                after="Rename",
-            )
-
-        apply_process_before_endpoint_update()
-
-        # Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
-        # about average latency and the amount of predictions over time
-        def apply_update_endpoint():
-            graph.add_step(
-                "UpdateEndpoint",
-                name="UpdateEndpoint",
-                after="ProcessBeforeEndpointUpdate",
-                project=self.project,
-            )
-
-        apply_update_endpoint()
-
-        # (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
-        # which will be used by Grafana monitoring dashboards
-        def apply_infer_schema():
-            graph.add_step(
-                "InferSchema",
-                name="InferSchema",
-                after="UpdateEndpoint",
-                v3io_framesd=self.v3io_framesd,
-                container=self.kv_container,
-                table=self.kv_path,
-            )
-
-        if endpoint_store.type == ModelEndpointTarget.V3IO_NOSQL:
-            apply_infer_schema()
-
-        tsdb_connector.apply_monitoring_stream_steps(graph=graph)
+        tsdb_connector.apply_monitoring_stream_steps(
+            graph=graph,
+            aggregate_windows=self.aggregate_windows,
+            aggregate_period=self.aggregate_period,
+        )
 
         # Parquet branch
         # Filter and validate different keys before writing the data to Parquet target
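The new FilterNOP/ForwardNOP pair routes the controller's periodic no-op events away from the data branches. A tiny sketch of how those `_fn` string predicates partition events (plain Python, with an assumed minimal event shape):

    # Assumed minimal event shapes; each predicate string is evaluated per event.
    nop_event = {"kind": "nop_event"}
    model_event = {"endpoint_id": "ep-1", "request": {"inputs": [[1.0, 2.0]]}}

    keep_in_data_branch = lambda event: event.get("kind", "") != "nop_event"
    assert not keep_in_data_branch(nop_event)  # ForwardNOP sends it to the controller stream
    assert keep_in_data_branch(model_event)    # FilterNOP lets it continue to Parquet/TSDB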
@@ -313,7 +227,7 @@ class EventStreamProcessor:
             graph.add_step(
                 "ProcessBeforeParquet",
                 name="ProcessBeforeParquet",
-                after="
+                after="FilterNOP",
                 _fn="(event)",
             )
 
@@ -322,12 +236,12 @@ class EventStreamProcessor:
         # Write the Parquet target file, partitioned by key (endpoint_id) and time.
         def apply_parquet_target():
             graph.add_step(
-                "
+                "mlrun.datastore.storeytargets.ParquetStoreyTarget",
+                alternative_v3io_access_key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.ACCESS_KEY,
                 name="ParquetTarget",
                 after="ProcessBeforeParquet",
                 graph_shape="cylinder",
                 path=self.parquet_path,
-                storage_options=self.storage_options,
                 max_events=self.parquet_batching_max_events,
                 flush_after_seconds=self.parquet_batching_timeout_secs,
                 attributes={"infer_columns_from_data": True},
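The two batching parameters above define a flush-on-either-bound contract: a Parquet part is written once max_events accumulate or flush_after_seconds elapse, whichever comes first. A self-contained sketch of that contract (a toy model, not the package's implementation):

    import time

    class BatchFlusher:
        """Toy model of the flush-on-either-bound batching configured above."""

        def __init__(self, max_events, flush_after_seconds):
            self.max_events = max_events
            self.flush_after_seconds = flush_after_seconds
            self.batch = []
            self.started = time.monotonic()

        def add(self, event):
            self.batch.append(event)
            full = len(self.batch) >= self.max_events
            stale = time.monotonic() - self.started >= self.flush_after_seconds
            if full or stale:
                flushed, self.batch = self.batch, []
                self.started = time.monotonic()
                return flushed  # in the real target this becomes one Parquet part
            return None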
@@ -340,90 +254,20 @@ class EventStreamProcessor:
 
         apply_parquet_target()
 
+        # controller branch
+        def apply_push_controller_stream(stream_uri: str):
+            graph.add_step(
+                ">>",
+                "controller_stream",
+                path=stream_uri,
+                sharding_func=ControllerEvent.ENDPOINT_ID,
+                after="ForwardNOP",
+                # Force using the pipeline key instead of the one in the profile in case of v3io profile.
+                # In case of Kafka, this parameter will be ignored.
+                alternative_v3io_access_key="V3IO_ACCESS_KEY",
+            )
 
-
-    def __init__(self, **kwargs):
-        """
-        Filter relevant keys from the event before writing the data to database table (in EndpointUpdate step).
-        Note that in the endpoint table we only keep metadata (function_uri, model_class, etc.) and stats about the
-        average latency and the number of predictions (per 5min and 1hour).
-
-        :returns: A filtered event as a dictionary which will be written to the endpoint table in the next step.
-        """
-        super().__init__(**kwargs)
-
-    def do(self, event):
-        # Compute prediction per second
-        event[EventLiveStats.PREDICTIONS_PER_SECOND] = (
-            float(event[EventLiveStats.PREDICTIONS_COUNT_5M]) / 300
-        )
-        # Filter relevant keys
-        e = {
-            k: event[k]
-            for k in [
-                EventFieldType.FUNCTION_URI,
-                EventFieldType.MODEL,
-                EventFieldType.MODEL_CLASS,
-                EventFieldType.ENDPOINT_ID,
-                EventFieldType.LABELS,
-                EventFieldType.FIRST_REQUEST,
-                EventFieldType.LAST_REQUEST,
-                EventFieldType.ERROR_COUNT,
-            ]
-        }
-
-        # Add generic metrics statistics
-        generic_metrics = {
-            k: event[k]
-            for k in [
-                EventLiveStats.LATENCY_AVG_5M,
-                EventLiveStats.LATENCY_AVG_1H,
-                EventLiveStats.PREDICTIONS_PER_SECOND,
-                EventLiveStats.PREDICTIONS_COUNT_5M,
-                EventLiveStats.PREDICTIONS_COUNT_1H,
-            ]
-        }
-
-        e[EventFieldType.METRICS] = json.dumps(
-            {EventKeyMetrics.GENERIC: generic_metrics}
-        )
-
-        # Write labels as json string as required by the DB format
-        e[EventFieldType.LABELS] = json.dumps(e[EventFieldType.LABELS])
-
-        return e
-
-
-class ExtractEndpointID(mlrun.feature_store.steps.MapClass):
-    def __init__(self, **kwargs) -> None:
-        """
-        Generate the model endpoint ID based on the event parameters and attach it to the event.
-        """
-        super().__init__(**kwargs)
-
-    def do(self, full_event) -> typing.Union[storey.Event, None]:
-        # Getting model version and function uri from event
-        # and use them for retrieving the endpoint_id
-        function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
-        if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
-            return None
-
-        model = full_event.body.get(EventFieldType.MODEL)
-        if not is_not_none(model, [EventFieldType.MODEL]):
-            return None
-
-        version = full_event.body.get(EventFieldType.VERSION)
-        versioned_model = f"{model}:{version}" if version else f"{model}:latest"
-
-        endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
-            function_uri=function_uri,
-            versioned_model=versioned_model,
-        )
-
-        endpoint_id = str(endpoint_id)
-        full_event.body[EventFieldType.ENDPOINT_ID] = endpoint_id
-        full_event.body[EventFieldType.VERSIONED_MODEL] = versioned_model
-        return full_event
+        apply_push_controller_stream(controller_stream_uri)
 
 
 class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
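In mlrun serving graphs, ">>" adds a stream (queue) step; here it pushes everything that survives ForwardNOP onto the controller stream, sharded by endpoint ID so each endpoint's events stay ordered on a single shard. A usage sketch, assuming a graph built as in the earlier sketch and a placeholder stream URI; sharding_func is assumed to resolve to the "endpoint_id" field, as ControllerEvent.ENDPOINT_ID does in the diff:

    graph.add_step(
        ">>",
        "controller_stream",
        path="v3io:///projects/my-project/model-endpoints/controller_stream",  # placeholder
        sharding_func="endpoint_id",  # one endpoint's events always land on one shard
        after="ForwardNOP",
    )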
@@ -490,28 +334,34 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         self.first_request: dict[str, str] = dict()
         self.last_request: dict[str, str] = dict()
 
-        # Number of errors (value) per endpoint (key)
-        self.error_count: dict[str, int] = collections.defaultdict(int)
-
         # Set of endpoints in the current events
         self.endpoints: set[str] = set()
 
     def do(self, full_event):
         event = full_event.body
+        if event.get(ControllerEvent.KIND, "") == ControllerEventKind.NOP_EVENT:
+            logger.debug(
+                "Skipped nop event inside of ProcessEndpointEvent", event=event
+            )
+            full_event.body = [event]
+            return full_event
+        # Getting model version and function uri from event
+        # and use them for retrieving the endpoint_id
+        function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
+        if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
+            return None
+
+        model = full_event.body.get(EventFieldType.MODEL)
+        if not is_not_none(model, [EventFieldType.MODEL]):
+            return None
 
-        versioned_model = event[EventFieldType.VERSIONED_MODEL]
         endpoint_id = event[EventFieldType.ENDPOINT_ID]
-        function_uri = event[EventFieldType.FUNCTION_URI]
 
         # In case this process fails, resume state from existing record
-        self.resume_state(
-
-
-
-        error = event.get("error")
-        if error:  # TODO: delete this in ML-7456
-            self.error_count[endpoint_id] += 1
-            raise mlrun.errors.MLRunInvalidArgumentError(str(error))
+        self.resume_state(
+            endpoint_id=endpoint_id,
+            endpoint_name=full_event.body.get(EventFieldType.MODEL),
+        )
 
         # Validate event fields
         model_class = event.get("model_class") or event.get("class")
@@ -524,10 +374,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
         predictions = event.get("resp", {}).get("outputs")
 
         if not self.is_valid(
-
-
-
-            ["when"],
+            validation_function=is_not_none,
+            field=timestamp,
+            dict_path=["when"],
         ):
             return None
 
@@ -535,45 +384,33 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
             # Set time for the first request of the current endpoint
             self.first_request[endpoint_id] = timestamp
 
-        # Validate that the request time of the current event is later than the previous request time
-        self._validate_last_request_timestamp(
-            endpoint_id=endpoint_id, timestamp=timestamp
-        )
-
-        # Set time for the last reqeust of the current endpoint
-        self.last_request[endpoint_id] = timestamp
-
         if not self.is_valid(
-
-
-
-            ["request", "id"],
+            validation_function=is_not_none,
+            field=request_id,
+            dict_path=["request", "id"],
         ):
             return None
         if not self.is_valid(
-
-
-
-            ["microsec"],
+            validation_function=is_not_none,
+            field=latency,
+            dict_path=["microsec"],
         ):
             return None
         if not self.is_valid(
-
-
-
-            ["request", "inputs"],
+            validation_function=is_not_none,
+            field=features,
+            dict_path=["request", "inputs"],
         ):
             return None
         if not self.is_valid(
-
-
-
-            ["resp", "outputs"],
+            validation_function=is_not_none,
+            field=predictions,
+            dict_path=["resp", "outputs"],
         ):
             return None
 
         # Convert timestamp to a datetime object
-
+        timestamp_obj = datetime.datetime.fromisoformat(timestamp)
 
         # Separate each model invocation into sub events that will be stored as dictionary
         # in list of events. This list will be used as the body for the storey event.
@@ -605,96 +442,93 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
             if not isinstance(feature, list):
                 feature = [feature]
 
+            effective_sample_count, estimated_prediction_count = (
+                self._get_effective_and_estimated_counts(event=event)
+            )
+
             events.append(
                 {
                     EventFieldType.FUNCTION_URI: function_uri,
-                    EventFieldType.
+                    EventFieldType.ENDPOINT_NAME: event.get(EventFieldType.MODEL),
                     EventFieldType.MODEL_CLASS: model_class,
-                    EventFieldType.TIMESTAMP:
+                    EventFieldType.TIMESTAMP: timestamp_obj,
                     EventFieldType.ENDPOINT_ID: endpoint_id,
                     EventFieldType.REQUEST_ID: request_id,
                     EventFieldType.LATENCY: latency,
                     EventFieldType.FEATURES: feature,
                     EventFieldType.PREDICTION: prediction,
                     EventFieldType.FIRST_REQUEST: self.first_request[endpoint_id],
-                    EventFieldType.LAST_REQUEST:
+                    EventFieldType.LAST_REQUEST: timestamp,
                     EventFieldType.LAST_REQUEST_TIMESTAMP: mlrun.utils.enrich_datetime_with_tz_info(
-
+                        timestamp
                     ).timestamp(),
-                    EventFieldType.ERROR_COUNT: self.error_count[endpoint_id],
                     EventFieldType.LABELS: event.get(EventFieldType.LABELS, {}),
                     EventFieldType.METRICS: event.get(EventFieldType.METRICS, {}),
                     EventFieldType.ENTITIES: event.get("request", {}).get(
                         EventFieldType.ENTITIES, {}
                     ),
+                    EventFieldType.EFFECTIVE_SAMPLE_COUNT: effective_sample_count,
+                    EventFieldType.ESTIMATED_PREDICTION_COUNT: estimated_prediction_count,
                 }
             )
 
         # Create a storey event object with list of events, based on endpoint_id which will be used
         # in the upcoming steps
-
-
-
-    def _validate_last_request_timestamp(self, endpoint_id: str, timestamp: str):
-        """Validate that the request time of the current event is later than the previous request time that has
-        already been processed.
-
-        :param endpoint_id: The unique id of the model endpoint.
-        :param timestamp: Event request time as a string.
-
-        :raise MLRunPreconditionFailedError: If the request time of the current is later than the previous request time.
-        """
-
-        if (
-            endpoint_id in self.last_request
-            and self.last_request[endpoint_id] > timestamp
-        ):
-            logger.error(
-                f"current event request time {timestamp} is earlier than the last request time "
-                f"{self.last_request[endpoint_id]} - write to TSDB will be rejected"
-            )
+        full_event.key = endpoint_id
+        full_event.body = events
+        return full_event
 
-    def resume_state(self, endpoint_id):
+    def resume_state(self, endpoint_id, endpoint_name):
         # Make sure process is resumable, if process fails for any reason, be able to pick things up close to where we
         # left them
         if endpoint_id not in self.endpoints:
             logger.info("Trying to resume state", endpoint_id=endpoint_id)
-            endpoint_record =
-
-
+            endpoint_record = (
+                mlrun.db.get_run_db()
+                .get_model_endpoint(
+                    project=self.project,
+                    endpoint_id=endpoint_id,
+                    name=endpoint_name,
+                    tsdb_metrics=False,
+                )
+                .flat_dict()
             )
 
-            # If model endpoint found, get first_request
+            # If model endpoint found, get first_request & last_request values
             if endpoint_record:
                 first_request = endpoint_record.get(EventFieldType.FIRST_REQUEST)
 
                 if first_request:
                     self.first_request[endpoint_id] = first_request
 
-                last_request = endpoint_record.get(EventFieldType.LAST_REQUEST)
-                if last_request:
-                    self.last_request[endpoint_id] = last_request
-
-                error_count = endpoint_record.get(EventFieldType.ERROR_COUNT)
-
-                if error_count:
-                    self.error_count[endpoint_id] = int(error_count)
-
             # add endpoint to endpoints set
             self.endpoints.add(endpoint_id)
 
     def is_valid(
         self,
-        endpoint_id: str,
         validation_function,
         field: typing.Any,
         dict_path: list[str],
     ):
         if validation_function(field, dict_path):
             return True
-
+
         return False
 
+    @staticmethod
+    def _get_effective_and_estimated_counts(event):
+        """
+        Calculate the `effective_sample_count` and the `estimated_prediction_count` based on the event's
+        sampling percentage. These values will be stored in the TSDB target.
+        Note that In non-batch serving, the `effective_sample_count` is always set to 1. In addition, when the sampling
+        percentage is 100%, the `estimated_prediction_count` is equal to the `effective_sample_count`.
+        """
+        effective_sample_count = event.get(EventFieldType.EFFECTIVE_SAMPLE_COUNT, 1)
+        estimated_prediction_count = effective_sample_count * (
+            100 / event.get(EventFieldType.SAMPLING_PERCENTAGE, 100)
+        )
+        return effective_sample_count, estimated_prediction_count
+
 
 def is_not_none(field: typing.Any, dict_path: list[str]):
     if field is not None:
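The arithmetic in _get_effective_and_estimated_counts scales each sampled event back up by the inverse of the sampling rate. A worked example, using literal key names in place of the EventFieldType enum values (an assumption made here for readability):

    # With 10% sampling, one recorded event stands in for ~10 real predictions.
    event = {"effective_sample_count": 1, "sampling_percentage": 10}
    effective = event.get("effective_sample_count", 1)                     # -> 1
    estimated = effective * (100 / event.get("sampling_percentage", 100))  # -> 10.0
    assert (effective, estimated) == (1, 10.0)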
@@ -735,6 +569,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
         # and labels columns were not found in the current event
         self.feature_names = {}
         self.label_columns = {}
+        self.first_request = {}
 
         # Dictionary to manage the model endpoint types - important for the V3IO TSDB
         self.endpoint_type = {}
@@ -756,6 +591,8 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
         return None
 
     def do(self, event: dict):
+        if event.get(ControllerEvent.KIND, "") == ControllerEventKind.NOP_EVENT:
+            return event
         endpoint_id = event[EventFieldType.ENDPOINT_ID]
 
         feature_values = event[EventFieldType.FEATURES]
@@ -766,23 +603,30 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
             if isinstance(feature_value, int):
                 feature_values[index] = float(feature_value)
 
+        attributes_to_update = {}
+        endpoint_record = None
         # Get feature names and label columns
         if endpoint_id not in self.feature_names:
-            endpoint_record =
-
-
+            endpoint_record = (
+                mlrun.db.get_run_db()
+                .get_model_endpoint(
+                    project=self.project,
+                    endpoint_id=endpoint_id,
+                    name=event[EventFieldType.ENDPOINT_NAME],
+                    tsdb_metrics=False,
+                )
+                .flat_dict()
             )
             feature_names = endpoint_record.get(EventFieldType.FEATURE_NAMES)
-            feature_names = json.loads(feature_names) if feature_names else None
 
             label_columns = endpoint_record.get(EventFieldType.LABEL_NAMES)
-            label_columns = json.loads(label_columns) if label_columns else None
 
             # If feature names were not found,
             # try to retrieve them from the previous events of the current process
             if not feature_names and self._infer_columns_from_data:
                 feature_names = self._infer_feature_names_from_data(event)
 
+            endpoint_type = int(endpoint_record.get(EventFieldType.ENDPOINT_TYPE))
             if not feature_names:
                 logger.warn(
                     "Feature names are not initialized, they will be automatically generated",
@@ -793,19 +637,14 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 ]
 
                 # Update the endpoint record with the generated features
-
-                    project=self.project,
-                    endpoint_id=endpoint_id,
-                    attributes={
-                        EventFieldType.FEATURE_NAMES: json.dumps(feature_names)
-                    },
-                )
+                attributes_to_update[EventFieldType.FEATURE_NAMES] = feature_names
 
-
-
-
-
-
+            if endpoint_type != EndpointType.ROUTER.value:
+                update_monitoring_feature_set(
+                    endpoint_record=endpoint_record,
+                    feature_names=feature_names,
+                    feature_values=feature_values,
+                )
 
             # Similar process with label columns
             if not label_columns and self._infer_columns_from_data:
@@ -819,17 +658,13 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
                 label_columns = [
                     f"p{i}" for i, _ in enumerate(event[EventFieldType.PREDICTION])
                 ]
-
-
-
-
-
-
-
-                    endpoint_record=endpoint_record,
-                    feature_names=label_columns,
-                    feature_values=label_values,
-                )
+                attributes_to_update[EventFieldType.LABEL_NAMES] = label_columns
+            if endpoint_type != EndpointType.ROUTER.value:
+                update_monitoring_feature_set(
+                    endpoint_record=endpoint_record,
+                    feature_names=label_columns,
+                    feature_values=label_values,
+                )
 
             self.label_columns[endpoint_id] = label_columns
             self.feature_names[endpoint_id] = feature_names
@@ -842,9 +677,41 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
             )
 
             # Update the endpoint type within the endpoint types dictionary
-            endpoint_type = int(endpoint_record.get(EventFieldType.ENDPOINT_TYPE))
             self.endpoint_type[endpoint_id] = endpoint_type
 
+        # Update the first request time in the endpoint record
+        if endpoint_id not in self.first_request:
+            endpoint_record = endpoint_record or (
+                mlrun.db.get_run_db()
+                .get_model_endpoint(
+                    project=self.project,
+                    endpoint_id=endpoint_id,
+                    name=event[EventFieldType.ENDPOINT_NAME],
+                    tsdb_metrics=False,
+                )
+                .flat_dict()
+            )
+            if not endpoint_record.get(EventFieldType.FIRST_REQUEST):
+                attributes_to_update[EventFieldType.FIRST_REQUEST] = (
+                    mlrun.utils.enrich_datetime_with_tz_info(
+                        event[EventFieldType.FIRST_REQUEST]
+                    )
+                )
+            self.first_request[endpoint_id] = True
+
+        if attributes_to_update:
+            logger.info(
+                "Updating endpoint record",
+                endpoint_id=endpoint_id,
+                attributes=attributes_to_update,
+            )
+            update_endpoint_record(
+                project=self.project,
+                endpoint_id=endpoint_id,
+                attributes=attributes_to_update,
+                endpoint_name=event[EventFieldType.ENDPOINT_NAME],
+            )
+
         # Add feature_name:value pairs along with a mapping dictionary of all of these pairs
         feature_names = self.feature_names[endpoint_id]
         self._map_dictionary_values(
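Both resume_state and MapFeatureNames now read the endpoint record through the run DB client rather than a direct store (the old stores package is removed in this release). A usage sketch of that fluent lookup, with placeholder identifiers:

    import mlrun.db

    # Placeholders throughout; tsdb_metrics=False skips time-series metrics because
    # only the static record (feature names, first request, ...) is needed here.
    endpoint = (
        mlrun.db.get_run_db()
        .get_model_endpoint(
            project="my-project",
            endpoint_id="ep-1234",
            name="my-model",
            tsdb_metrics=False,
        )
        .flat_dict()
    )
    feature_names = endpoint.get("feature_names")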
@@ -890,35 +757,13 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
 
         """
         event[mapping_dictionary] = {}
+        diff = len(named_iters) - len(values_iters)
+        values_iters += [None] * diff
         for name, value in zip(named_iters, values_iters):
             event[name] = value
             event[mapping_dictionary][name] = value
 
 
-class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
-    def __init__(self, project: str, **kwargs):
-        """
-        Update the model endpoint record in the DB. Note that the event at this point includes metadata and stats about
-        the average latency and the amount of predictions over time. This data will be used in the monitoring dashboards
-        such as "Model Monitoring - Performance" which can be found in Grafana.
-
-        :returns: Event as a dictionary (without any changes) for the next step (InferSchema).
-        """
-        super().__init__(**kwargs)
-        self.project = project
-
-    def do(self, event: dict):
-        # Remove labels from the event
-        event.pop(EventFieldType.LABELS)
-
-        update_endpoint_record(
-            project=self.project,
-            endpoint_id=event.pop(EventFieldType.ENDPOINT_ID),
-            attributes=event,
-        )
-        return event
-
-
 class InferSchema(mlrun.feature_store.steps.MapClass):
     def __init__(
         self,
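The padding added above matters because zip() stops at the shorter iterable: without the [None] padding, any feature names beyond the available values would silently be dropped from the event. A minimal demonstration:

    named_iters = ["f0", "f1", "f2"]
    values_iters = [0.5, 1.2]
    values_iters += [None] * (len(named_iters) - len(values_iters))
    assert list(zip(named_iters, values_iters)) == [("f0", 0.5), ("f1", 1.2), ("f2", None)]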
@@ -963,14 +808,14 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
 def update_endpoint_record(
     project: str,
     endpoint_id: str,
+    endpoint_name: str,
     attributes: dict,
 ):
-
+    mlrun.db.get_run_db().patch_model_endpoint(
         project=project,
-
-
-
-        endpoint_id=endpoint_id, attributes=attributes
+        endpoint_id=endpoint_id,
+        attributes=attributes,
+        name=endpoint_name,
     )
 
 
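The reworked helper now requires the endpoint name alongside the ID, matching the patch_model_endpoint signature on the run DB client. A call sketch with placeholder values; the attribute payload is illustrative:

    update_endpoint_record(
        project="my-project",
        endpoint_id="ep-1234",
        endpoint_name="my-model",
        attributes={"feature_names": ["f0", "f1"]},  # illustrative payload
    )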