mlrun 1.4.0rc25__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic.

Files changed (184)
  1. mlrun/__init__.py +2 -35
  2. mlrun/__main__.py +3 -41
  3. mlrun/api/api/api.py +6 -0
  4. mlrun/api/api/endpoints/feature_store.py +0 -4
  5. mlrun/api/api/endpoints/files.py +14 -2
  6. mlrun/api/api/endpoints/frontend_spec.py +2 -1
  7. mlrun/api/api/endpoints/functions.py +95 -59
  8. mlrun/api/api/endpoints/grafana_proxy.py +9 -9
  9. mlrun/api/api/endpoints/logs.py +17 -3
  10. mlrun/api/api/endpoints/model_endpoints.py +3 -2
  11. mlrun/api/api/endpoints/pipelines.py +1 -5
  12. mlrun/api/api/endpoints/projects.py +88 -0
  13. mlrun/api/api/endpoints/runs.py +48 -6
  14. mlrun/api/api/endpoints/submit.py +2 -1
  15. mlrun/api/api/endpoints/workflows.py +355 -0
  16. mlrun/api/api/utils.py +3 -4
  17. mlrun/api/crud/__init__.py +1 -0
  18. mlrun/api/crud/client_spec.py +6 -2
  19. mlrun/api/crud/feature_store.py +5 -0
  20. mlrun/api/crud/model_monitoring/__init__.py +1 -0
  21. mlrun/api/crud/model_monitoring/deployment.py +497 -0
  22. mlrun/api/crud/model_monitoring/grafana.py +96 -42
  23. mlrun/api/crud/model_monitoring/helpers.py +159 -0
  24. mlrun/api/crud/model_monitoring/model_endpoints.py +202 -476
  25. mlrun/api/crud/notifications.py +9 -4
  26. mlrun/api/crud/pipelines.py +6 -11
  27. mlrun/api/crud/projects.py +2 -2
  28. mlrun/api/crud/runtime_resources.py +4 -3
  29. mlrun/api/crud/runtimes/nuclio/helpers.py +5 -1
  30. mlrun/api/crud/secrets.py +21 -0
  31. mlrun/api/crud/workflows.py +352 -0
  32. mlrun/api/db/base.py +16 -1
  33. mlrun/api/db/init_db.py +2 -4
  34. mlrun/api/db/session.py +1 -1
  35. mlrun/api/db/sqldb/db.py +129 -31
  36. mlrun/api/db/sqldb/models/models_mysql.py +15 -1
  37. mlrun/api/db/sqldb/models/models_sqlite.py +16 -2
  38. mlrun/api/launcher.py +38 -6
  39. mlrun/api/main.py +3 -2
  40. mlrun/api/rundb/__init__.py +13 -0
  41. mlrun/{db → api/rundb}/sqldb.py +36 -84
  42. mlrun/api/runtime_handlers/__init__.py +56 -0
  43. mlrun/api/runtime_handlers/base.py +1247 -0
  44. mlrun/api/runtime_handlers/daskjob.py +209 -0
  45. mlrun/api/runtime_handlers/kubejob.py +37 -0
  46. mlrun/api/runtime_handlers/mpijob.py +147 -0
  47. mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
  48. mlrun/api/runtime_handlers/sparkjob.py +148 -0
  49. mlrun/api/schemas/__init__.py +17 -6
  50. mlrun/api/utils/builder.py +1 -4
  51. mlrun/api/utils/clients/chief.py +14 -0
  52. mlrun/api/utils/clients/iguazio.py +33 -33
  53. mlrun/api/utils/clients/nuclio.py +2 -2
  54. mlrun/api/utils/periodic.py +9 -2
  55. mlrun/api/utils/projects/follower.py +14 -7
  56. mlrun/api/utils/projects/leader.py +2 -1
  57. mlrun/api/utils/projects/remotes/nop_follower.py +2 -2
  58. mlrun/api/utils/projects/remotes/nop_leader.py +2 -2
  59. mlrun/api/utils/runtimes/__init__.py +14 -0
  60. mlrun/api/utils/runtimes/nuclio.py +43 -0
  61. mlrun/api/utils/scheduler.py +98 -15
  62. mlrun/api/utils/singletons/db.py +5 -1
  63. mlrun/api/utils/singletons/project_member.py +4 -1
  64. mlrun/api/utils/singletons/scheduler.py +1 -1
  65. mlrun/artifacts/base.py +6 -6
  66. mlrun/artifacts/dataset.py +4 -4
  67. mlrun/artifacts/manager.py +2 -3
  68. mlrun/artifacts/model.py +2 -2
  69. mlrun/artifacts/plots.py +8 -8
  70. mlrun/common/db/__init__.py +14 -0
  71. mlrun/common/helpers.py +37 -0
  72. mlrun/{mlutils → common/model_monitoring}/__init__.py +3 -2
  73. mlrun/common/model_monitoring/helpers.py +69 -0
  74. mlrun/common/schemas/__init__.py +13 -1
  75. mlrun/common/schemas/auth.py +4 -1
  76. mlrun/common/schemas/client_spec.py +1 -1
  77. mlrun/common/schemas/function.py +17 -0
  78. mlrun/common/schemas/model_monitoring/__init__.py +48 -0
  79. mlrun/common/{model_monitoring.py → schemas/model_monitoring/constants.py} +11 -23
  80. mlrun/common/schemas/model_monitoring/grafana.py +55 -0
  81. mlrun/common/schemas/{model_endpoints.py → model_monitoring/model_endpoints.py} +32 -65
  82. mlrun/common/schemas/notification.py +1 -0
  83. mlrun/common/schemas/object.py +4 -0
  84. mlrun/common/schemas/project.py +1 -0
  85. mlrun/common/schemas/regex.py +1 -1
  86. mlrun/common/schemas/runs.py +1 -8
  87. mlrun/common/schemas/schedule.py +1 -8
  88. mlrun/common/schemas/workflow.py +54 -0
  89. mlrun/config.py +45 -42
  90. mlrun/datastore/__init__.py +21 -0
  91. mlrun/datastore/base.py +1 -1
  92. mlrun/datastore/datastore.py +9 -0
  93. mlrun/datastore/dbfs_store.py +168 -0
  94. mlrun/datastore/helpers.py +18 -0
  95. mlrun/datastore/sources.py +1 -0
  96. mlrun/datastore/store_resources.py +2 -5
  97. mlrun/datastore/v3io.py +1 -2
  98. mlrun/db/__init__.py +4 -68
  99. mlrun/db/base.py +12 -0
  100. mlrun/db/factory.py +65 -0
  101. mlrun/db/httpdb.py +175 -20
  102. mlrun/db/nopdb.py +4 -2
  103. mlrun/execution.py +4 -2
  104. mlrun/feature_store/__init__.py +1 -0
  105. mlrun/feature_store/api.py +1 -2
  106. mlrun/feature_store/common.py +2 -1
  107. mlrun/feature_store/feature_set.py +1 -11
  108. mlrun/feature_store/feature_vector.py +340 -2
  109. mlrun/feature_store/ingestion.py +5 -10
  110. mlrun/feature_store/retrieval/base.py +118 -104
  111. mlrun/feature_store/retrieval/dask_merger.py +17 -10
  112. mlrun/feature_store/retrieval/job.py +4 -1
  113. mlrun/feature_store/retrieval/local_merger.py +18 -18
  114. mlrun/feature_store/retrieval/spark_merger.py +21 -14
  115. mlrun/feature_store/retrieval/storey_merger.py +22 -16
  116. mlrun/kfpops.py +3 -9
  117. mlrun/launcher/base.py +57 -53
  118. mlrun/launcher/client.py +5 -4
  119. mlrun/launcher/factory.py +24 -13
  120. mlrun/launcher/local.py +6 -6
  121. mlrun/launcher/remote.py +4 -4
  122. mlrun/lists.py +0 -11
  123. mlrun/model.py +11 -17
  124. mlrun/model_monitoring/__init__.py +2 -22
  125. mlrun/model_monitoring/features_drift_table.py +1 -1
  126. mlrun/model_monitoring/helpers.py +22 -210
  127. mlrun/model_monitoring/model_endpoint.py +1 -1
  128. mlrun/model_monitoring/model_monitoring_batch.py +127 -50
  129. mlrun/model_monitoring/prometheus.py +219 -0
  130. mlrun/model_monitoring/stores/__init__.py +16 -11
  131. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +95 -23
  132. mlrun/model_monitoring/stores/models/mysql.py +47 -29
  133. mlrun/model_monitoring/stores/models/sqlite.py +47 -29
  134. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +31 -19
  135. mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} +206 -64
  136. mlrun/model_monitoring/tracking_policy.py +104 -0
  137. mlrun/package/packager.py +6 -8
  138. mlrun/package/packagers/default_packager.py +121 -10
  139. mlrun/package/packagers/numpy_packagers.py +1 -1
  140. mlrun/platforms/__init__.py +0 -2
  141. mlrun/platforms/iguazio.py +0 -56
  142. mlrun/projects/pipelines.py +53 -159
  143. mlrun/projects/project.py +10 -37
  144. mlrun/render.py +1 -1
  145. mlrun/run.py +8 -124
  146. mlrun/runtimes/__init__.py +6 -42
  147. mlrun/runtimes/base.py +29 -1249
  148. mlrun/runtimes/daskjob.py +2 -198
  149. mlrun/runtimes/funcdoc.py +0 -9
  150. mlrun/runtimes/function.py +25 -29
  151. mlrun/runtimes/kubejob.py +5 -29
  152. mlrun/runtimes/local.py +1 -1
  153. mlrun/runtimes/mpijob/__init__.py +2 -2
  154. mlrun/runtimes/mpijob/abstract.py +10 -1
  155. mlrun/runtimes/mpijob/v1.py +0 -76
  156. mlrun/runtimes/mpijob/v1alpha1.py +1 -74
  157. mlrun/runtimes/nuclio.py +3 -2
  158. mlrun/runtimes/pod.py +28 -18
  159. mlrun/runtimes/remotesparkjob.py +1 -15
  160. mlrun/runtimes/serving.py +14 -6
  161. mlrun/runtimes/sparkjob/__init__.py +0 -1
  162. mlrun/runtimes/sparkjob/abstract.py +4 -131
  163. mlrun/runtimes/utils.py +0 -26
  164. mlrun/serving/routers.py +7 -7
  165. mlrun/serving/server.py +11 -8
  166. mlrun/serving/states.py +7 -1
  167. mlrun/serving/v2_serving.py +6 -6
  168. mlrun/utils/helpers.py +23 -42
  169. mlrun/utils/notifications/notification/__init__.py +4 -0
  170. mlrun/utils/notifications/notification/webhook.py +61 -0
  171. mlrun/utils/notifications/notification_pusher.py +5 -25
  172. mlrun/utils/regex.py +7 -2
  173. mlrun/utils/version/version.json +2 -2
  174. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +26 -25
  175. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +180 -158
  176. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
  177. mlrun/mlutils/data.py +0 -160
  178. mlrun/mlutils/models.py +0 -78
  179. mlrun/mlutils/plots.py +0 -902
  180. mlrun/utils/model_monitoring.py +0 -249
  181. /mlrun/{api/db/sqldb/session.py → common/db/sql_session.py} +0 -0
  182. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
  183. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
  184. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} RENAMED
@@ -22,14 +22,14 @@ import pandas as pd
  import storey
 
  import mlrun
- import mlrun.common.model_monitoring
+ import mlrun.common.model_monitoring.helpers
  import mlrun.config
  import mlrun.datastore.targets
  import mlrun.feature_store.steps
+ import mlrun.model_monitoring.prometheus
  import mlrun.utils
- import mlrun.utils.model_monitoring
  import mlrun.utils.v3io_clients
- from mlrun.common.model_monitoring import (
+ from mlrun.common.schemas.model_monitoring.constants import (
      EventFieldType,
      EventKeyMetrics,
      EventLiveStats,
@@ -37,7 +37,6 @@ from mlrun.common.model_monitoring import (
      ModelEndpointTarget,
      ProjectSecretKeys,
  )
- from mlrun.model_monitoring.stores import get_model_endpoint_store
  from mlrun.utils import logger
 
 
@@ -47,22 +46,18 @@ class EventStreamProcessor:
  self,
  project: str,
  parquet_batching_max_events: int,
+ parquet_batching_timeout_secs: int,
  parquet_target: str,
  sample_window: int = 10,
- parquet_batching_timeout_secs: int = 30 * 60,  # Default 30 minutes
- aggregate_count_windows: typing.Optional[typing.List[str]] = None,
- aggregate_count_period: str = "30s",
- aggregate_avg_windows: typing.Optional[typing.List[str]] = None,
- aggregate_avg_period: str = "30s",
+ aggregate_windows: typing.Optional[typing.List[str]] = None,
+ aggregate_period: str = "30s",
  model_monitoring_access_key: str = None,
  ):
  # General configurations, mainly used for the storey steps in the future serving graph
  self.project = project
  self.sample_window = sample_window
- self.aggregate_count_windows = aggregate_count_windows or ["5m", "1h"]
- self.aggregate_count_period = aggregate_count_period
- self.aggregate_avg_windows = aggregate_avg_windows or ["5m", "1h"]
- self.aggregate_avg_period = aggregate_avg_period
+ self.aggregate_windows = aggregate_windows or ["5m", "1h"]
+ self.aggregate_period = aggregate_period
 
  # Parquet path and configurations
  self.parquet_path = parquet_target
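
A side note on the constructor change above: the separate count/avg aggregation arguments are consolidated into a single aggregate_windows/aggregate_period pair, and parquet_batching_timeout_secs loses its default and must now be passed explicitly. A minimal construction sketch under the new signature follows; the project name, batching values, and parquet path are illustrative placeholders, not values taken from this diff.

    from mlrun.model_monitoring.stream_processing import EventStreamProcessor

    # All concrete values below are hypothetical examples; only the parameter
    # names and the renamed module path come from the diff above.
    processor = EventStreamProcessor(
        project="my-project",
        parquet_batching_max_events=10_000,
        parquet_batching_timeout_secs=30 * 60,   # no longer defaulted in 1.5.0rc2
        parquet_target="v3io:///projects/my-project/model-endpoints/parquet",
        sample_window=10,
        aggregate_windows=["5m", "1h"],          # replaces aggregate_count_windows / aggregate_avg_windows
        aggregate_period="30s",                  # replaces aggregate_count_period / aggregate_avg_period
    )
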
@@ -84,6 +79,8 @@ class EventStreamProcessor:
  self._initialize_v3io_configurations(
  model_monitoring_access_key=model_monitoring_access_key
  )
+ elif self.parquet_path.startswith("s3://"):
+ self.storage_options = mlrun.mlconf.get_s3_storage_options()
 
  def _initialize_v3io_configurations(
  self,
@@ -116,7 +113,9 @@ class EventStreamProcessor:
  _,
  self.kv_container,
  self.kv_path,
- ) = mlrun.utils.model_monitoring.parse_model_endpoint_store_prefix(kv_path)
+ ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
+ kv_path
+ )
 
  # TSDB path and configurations
  tsdb_path = mlrun.mlconf.get_model_monitoring_file_target_path(
@@ -126,7 +125,9 @@ class EventStreamProcessor:
  _,
  self.tsdb_container,
  self.tsdb_path,
- ) = mlrun.utils.model_monitoring.parse_model_endpoint_store_prefix(tsdb_path)
+ ) = mlrun.common.model_monitoring.helpers.parse_model_endpoint_store_prefix(
+ tsdb_path
+ )
 
  self.tsdb_path = f"{self.tsdb_container}/{self.tsdb_path}"
  self.tsdb_batching_max_events = tsdb_batching_max_events
@@ -138,7 +139,7 @@ class EventStreamProcessor:
  of different operations that are executed on the events from the model server. Each event has
  metadata (function_uri, timestamp, class, etc.) but also inputs and predictions from the model server.
  Throughout the serving graph, the results are written to 3 different databases:
- 1. KV/SQL (steps 7-9): Stores metadata and stats about the average latency and the amount of predictions over
+ 1. KV/SQL (steps 9-11): Stores metadata and stats about the average latency and the amount of predictions over
  time per endpoint. for example the amount of predictions of endpoint x in the last 5 min. This data is used
  by the monitoring dashboards in grafana. The model endpoints table also contains data on the model endpoint
  from other processes, such as current_stats that is being calculated by the monitoring batch job
@@ -146,12 +147,14 @@ class EventStreamProcessor:
  v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is SQL, then the table
  is stored within the database that was defined in the provided connection string and can be found
  under mlrun.mlconf.model_endpoint_monitoring.endpoint_store_connection.
- 2. TSDB (steps 12-18): Stores live data of different key metric dictionaries in tsdb target. Results can be
- found under v3io:///users/pipelines/project-name/model-endpoints/events/. At the moment, this part supports
- 3 different key metric dictionaries: base_metrics (average latency and predictions over time),
+ 2. V3IO TSDB/Prometheus (steps 13-21): Stores live data of different key metric dictionaries in tsdb target.
+ This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB (steps 13-19), results
+ can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
+ 3 different key metric dictionaries: base_metrics (average latency and predictions over time),
  endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
- This data is also being used by the monitoring dashboards in grafana.
- 3. Parquet (steps 19-20): This Parquet file includes the required data for the model monitoring batch job
+ If using Prometheus (steps 20-21), we update metrics in the Prometheus registry that is stored in the
+ monitoring stream local memory.
+ 3. Parquet (steps 22-23): This Parquet file includes the required data for the model monitoring batch job
  that run every hour by default. If defined, the parquet target path can be found under
  mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
  mlrun.mlconf.model_endpoint_monitoring.user_space.
@@ -161,17 +164,41 @@ class EventStreamProcessor:
 
  graph = fn.set_topology("flow")
 
- # Step 1 - Process endpoint event: splitting into sub-events and validate event data
+ # Step 1 - Event routing based on the provided path
+ def apply_event_routing():
+ graph.add_step(
+ "EventRouting",
+ full_event=True,
+ project=self.project,
+ ).respond()
+
+ apply_event_routing()
+
+ # Step 2 - Filter out events with no '-' in path which indicates that the event is supposed to be processed
+ # through the next steps of the stream graph
+ def apply_storey_filter_stream_events():
+ # Remove none values from each event
+ graph.add_step(
+ "storey.Filter",
+ "filter_stream_event",
+ _fn="('-' not in event.path)",
+ full_event=True,
+ )
+
+ apply_storey_filter_stream_events()
+
+ # Step 3 - Process endpoint event: splitting into sub-events and validate event data
  def apply_process_endpoint_event():
  graph.add_step(
  "ProcessEndpointEvent",
  full_event=True,
  project=self.project,
+ after="filter_stream_event",
  )
 
  apply_process_endpoint_event()
 
- # Steps 2,3 - Applying Storey operations of filtering and flatten
+ # Steps 4,5 - Applying Storey operations of filtering and flatten
  def apply_storey_filter_and_flatmap():
  # Remove none values from each event
  graph.add_step(
@@ -188,7 +215,7 @@ class EventStreamProcessor:
 
  apply_storey_filter_and_flatmap()
 
- # Step 4 - Validating feature names and map each feature to its value
+ # Step 6 - Validating feature names and map each feature to its value
  def apply_map_feature_names():
  graph.add_step(
  "MapFeatureNames",
@@ -200,58 +227,53 @@ class EventStreamProcessor:
  )
 
  apply_map_feature_names()
 
- # Step 5 - Calculate number of predictions and average latency
+ # Step 7 - Calculate number of predictions and average latency
  def apply_storey_aggregations():
- # Step 5.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
+ # Step 7.1 - Calculate number of predictions for each window (5 min and 1 hour by default)
  graph.add_step(
  class_name="storey.AggregateByKey",
  aggregates=[
  {
- "name": EventFieldType.PREDICTIONS,
- "column": EventFieldType.ENDPOINT_ID,
- "operations": ["count"],
- "windows": self.aggregate_count_windows,
- "period": self.aggregate_count_period,
+ "name": EventFieldType.LATENCY,
+ "column": EventFieldType.LATENCY,
+ "operations": ["count", "avg"],
+ "windows": self.aggregate_windows,
+ "period": self.aggregate_period,
  }
  ],
- name=EventFieldType.PREDICTIONS,
+ name=EventFieldType.LATENCY,
  after="MapFeatureNames",
  step_name="Aggregates",
  table=".",
+ key_field=EventFieldType.ENDPOINT_ID,
  )
- # Step 5.2 - Calculate average latency time for each window (5 min and 1 hour by default)
+ # Step 7.2 - Calculate average latency time for each window (5 min and 1 hour by default)
  graph.add_step(
- class_name="storey.AggregateByKey",
- aggregates=[
- {
- "name": EventFieldType.LATENCY,
- "column": EventFieldType.LATENCY,
- "operations": ["avg"],
- "windows": self.aggregate_avg_windows,
- "period": self.aggregate_avg_period,
- }
- ],
- name=EventFieldType.LATENCY,
- after=EventFieldType.PREDICTIONS,
- table=".",
+ class_name="storey.Rename",
+ mapping={
+ "latency_count_5m": EventLiveStats.PREDICTIONS_COUNT_5M,
+ "latency_count_1h": EventLiveStats.PREDICTIONS_COUNT_1H,
+ },
+ name="Rename",
+ after=EventFieldType.LATENCY,
  )
 
  apply_storey_aggregations()
 
- # Step 6 - Emits the event in window size of events based on sample_window size (10 by default)
+ # Step 8 - Emits the event in window size of events based on sample_window size (10 by default)
  def apply_storey_sample_window():
  graph.add_step(
  "storey.steps.SampleWindow",
  name="sample",
- after=EventFieldType.LATENCY,
+ after="Rename",
  window_size=self.sample_window,
  key=EventFieldType.ENDPOINT_ID,
  )
 
  apply_storey_sample_window()
 
- # Steps 7-9 - KV/SQL branch
- # Step 7 - Filter relevant keys from the event before writing the data into the database table
+ # Steps 9-11 - KV/SQL branch
+ # Step 9 - Filter relevant keys from the event before writing the data into the database table
  def apply_process_before_endpoint_update():
  graph.add_step(
  "ProcessBeforeEndpointUpdate",
@@ -261,7 +283,7 @@ class EventStreamProcessor:
 
  apply_process_before_endpoint_update()
 
- # Step 8 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
+ # Step 10 - Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
  # about average latency and the amount of predictions over time
  def apply_update_endpoint():
  graph.add_step(
@@ -274,7 +296,7 @@ class EventStreamProcessor:
 
  apply_update_endpoint()
 
- # Step 9 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
+ # Step 11 (only for KV target) - Apply infer_schema on the model endpoints table for generating schema file
  # which will be used by Grafana monitoring dashboards
  def apply_infer_schema():
  graph.add_step(
@@ -289,10 +311,12 @@ class EventStreamProcessor:
  if self.model_endpoint_store_target == ModelEndpointTarget.V3IO_NOSQL:
  apply_infer_schema()
 
- # Steps 11-18 - TSDB branch (not supported in CE environment at the moment)
-
+ # Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
+ # Steps 20-21 - Prometheus branch
  if not mlrun.mlconf.is_ce_mode():
- # Step 11 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
+ # TSDB branch
+
+ # Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
  # stats and details about the events
  def apply_process_before_tsdb():
  graph.add_step(
@@ -301,7 +325,7 @@ class EventStreamProcessor:
 
  apply_process_before_tsdb()
 
- # Steps 12-18: - Unpacked keys from each dictionary and write to TSDB target
+ # Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
  def apply_filter_and_unpacked_keys(name, keys):
  graph.add_step(
  "FilterAndUnpackKeys",
@@ -332,21 +356,21 @@ class EventStreamProcessor:
  key=EventFieldType.ENDPOINT_ID,
  )
 
- # Steps 12-13 - unpacked base_metrics dictionary
+ # Steps 13-14 - unpacked base_metrics dictionary
  apply_filter_and_unpacked_keys(
  name="FilterAndUnpackKeys1",
  keys=EventKeyMetrics.BASE_METRICS,
  )
  apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
 
- # Steps 14-15 - unpacked endpoint_features dictionary
+ # Steps 15-16 - unpacked endpoint_features dictionary
  apply_filter_and_unpacked_keys(
  name="FilterAndUnpackKeys2",
  keys=EventKeyMetrics.ENDPOINT_FEATURES,
  )
  apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
 
- # Steps 16-18 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
+ # Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
  apply_filter_and_unpacked_keys(
  name="FilterAndUnpackKeys3",
  keys=EventKeyMetrics.CUSTOM_METRICS,
@@ -362,9 +386,30 @@ class EventStreamProcessor:
 
  apply_storey_filter()
  apply_tsdb_target(name="tsdb3", after="FilterNotNone")
+ else:
+ # Prometheus branch
+
+ # Step 20 - Increase the prediction counter by 1 and update the latency value
+ graph.add_step(
+ "IncCounter",
+ name="IncCounter",
+ after="MapFeatureNames",
+ project=self.project,
+ )
 
- # Steps 19-20 - Parquet branch
- # Step 19 - Filter and validate different keys before writing the data to Parquet target
+ # Step 21 - Record a sample of features and labels
+ def apply_record_features_to_prometheus():
+ graph.add_step(
+ "RecordFeatures",
+ name="RecordFeaturesToPrometheus",
+ after="sample",
+ project=self.project,
+ )
+
+ apply_record_features_to_prometheus()
+
+ # Steps 22-23 - Parquet branch
+ # Step 22 - Filter and validate different keys before writing the data to Parquet target
  def apply_process_before_parquet():
  graph.add_step(
  "ProcessBeforeParquet",
@@ -375,7 +420,7 @@ class EventStreamProcessor:
  )
 
  apply_process_before_parquet()
- # Step 20 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
+ # Step 23 - Write the Parquet target file, partitioned by key (endpoint_id) and time.
  def apply_parquet_target():
  graph.add_step(
  "storey.ParquetTarget",
@@ -625,6 +670,11 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
  error = event.get("error")
  if error:
  self.error_count[endpoint_id] += 1
+ mlrun.model_monitoring.prometheus.write_errors(
+ project=self.project,
+ endpoint_id=event["endpoint_id"],
+ model_name=event["model"],
+ )
  raise mlrun.errors.MLRunInvalidArgumentError(str(error))
 
  # Validate event fields
@@ -1078,12 +1128,104 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
  return event
 
 
+ class EventRouting(mlrun.feature_store.steps.MapClass):
+ """
+ Router the event according to the configured path under event.path. Please note that this step returns the result
+ to the caller. At the moment there are several paths:
+
+ - /model-monitoring-metrics (GET): return Prometheus registry results as a text. Will be used by Prometheus client
+ to scrape the results from the monitoring stream memory.
+
+ - /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics such as the
+ statistical metrics from the monitoring batch job. Note that the event body is a list of dictionaries of different
+ metrics.
+
+ - /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.
+
+ """
+
+ def __init__(
+ self,
+ project: str,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.project: str = project
+
+ def do(self, event):
+ if event.path == "/model-monitoring-metrics":
+ # Return a parsed Prometheus registry file
+ event.body = mlrun.model_monitoring.prometheus.get_registry()
+ elif event.path == "/monitoring-batch-metrics":
+ # Update statistical metrics
+ for event_metric in event.body:
+ mlrun.model_monitoring.prometheus.write_drift_metrics(
+ project=self.project,
+ endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
+ metric=event_metric[EventFieldType.METRIC],
+ value=event_metric[EventFieldType.VALUE],
+ )
+ elif event.path == "/monitoring-drift-status":
+ # Update drift status
+ mlrun.model_monitoring.prometheus.write_drift_status(
+ project=self.project,
+ endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
+ drift_status=event.body[EventFieldType.DRIFT_STATUS],
+ )
+
+ return event
+
+
+ class IncCounter(mlrun.feature_store.steps.MapClass):
+ """Increase prediction counter by 1 and update the total latency value"""
+
+ def __init__(self, project: str, **kwargs):
+ super().__init__(**kwargs)
+ self.project: str = project
+
+ def do(self, event):
+ # Compute prediction per second
+
+ mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
+ project=self.project,
+ endpoint_id=event[EventFieldType.ENDPOINT_ID],
+ latency=event[EventFieldType.LATENCY],
+ model_name=event[EventFieldType.MODEL],
+ endpoint_type=event[EventFieldType.ENDPOINT_TYPE],
+ )
+
+ return event
+
+
+ class RecordFeatures(mlrun.feature_store.steps.MapClass):
+ """Record a sample of features and labels in Prometheus registry"""
+
+ def __init__(self, project: str, **kwargs):
+ super().__init__(**kwargs)
+ self.project: str = project
+
+ def do(self, event):
+ # Generate a dictionary of features and predictions
+ features = {
+ **event[EventFieldType.NAMED_PREDICTIONS],
+ **event[EventFieldType.NAMED_FEATURES],
+ }
+
+ mlrun.model_monitoring.prometheus.write_income_features(
+ project=self.project,
+ endpoint_id=event[EventFieldType.ENDPOINT_ID],
+ features=features,
+ )
+
+ return event
+
+
  def update_endpoint_record(
  project: str,
  endpoint_id: str,
  attributes: dict,
  ):
- model_endpoint_store = get_model_endpoint_store(
+ model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
  project=project,
  )
 
@@ -1093,7 +1235,7 @@ def update_endpoint_record(
 
 
  def get_endpoint_record(project: str, endpoint_id: str):
- model_endpoint_store = get_model_endpoint_store(
+ model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
  project=project,
  )
  return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
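
The last two hunks of this file also change how the endpoint store is resolved: instead of importing get_model_endpoint_store from mlrun.model_monitoring.stores, the module now goes through the mlrun.model_monitoring package. A hedged sketch of the equivalent call chain, with placeholder project and endpoint identifiers:

    import mlrun.model_monitoring

    # "my-project" and the endpoint ID are placeholders; only the attribute
    # lookups mirror the code shown in the hunks above.
    store = mlrun.model_monitoring.get_model_endpoint_store(project="my-project")
    endpoint_record = store.get_model_endpoint(endpoint_id="1234abcd5678efgh")
    print(endpoint_record)
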
mlrun/model_monitoring/tracking_policy.py ADDED
@@ -0,0 +1,104 @@
+ # Copyright 2023 Iguazio
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ from typing import Union
+
+ import mlrun.common.schemas.schedule
+ import mlrun.model
+
+
+ class TrackingPolicy(mlrun.model.ModelObj):
+ """
+ Modified model monitoring configurations. By using TrackingPolicy, the user can apply his model monitoring
+ requirements, such as setting the scheduling policy of the model monitoring batch job or changing the image of the
+ model monitoring stream.
+ """
+
+ _dict_fields = [
+ "default_batch_image",
+ "stream_image",
+ ]
+
+ def __init__(
+ self,
+ default_batch_intervals: Union[
+ mlrun.common.schemas.schedule.ScheduleCronTrigger, str
+ ] = mlrun.common.schemas.schedule.ScheduleCronTrigger(minute="0", hour="*/1"),
+ default_batch_image: str = "mlrun/mlrun",
+ stream_image: str = "mlrun/mlrun",
+ ):
+ """
+ Initialize TrackingPolicy object.
+ :param default_batch_intervals: Model monitoring batch scheduling policy. By default, executed on the hour
+ every hour. Can be either a string or a ScheduleCronTrigger object. The
+ string time format is based on ScheduleCronTrigger expression:
+ minute, hour, day of month, month, day of week. It will be converted into
+ a ScheduleCronTrigger object.
+ :param default_batch_image: The default image of the model monitoring batch job. By default, the image
+ is mlrun/mlrun.
+ :param stream_image: The image of the model monitoring stream real-time function. By default,
+ the image is mlrun/mlrun.
+ """
+ if isinstance(default_batch_intervals, str):
+ default_batch_intervals = (
+ mlrun.common.schemas.schedule.ScheduleCronTrigger.from_crontab(
+ default_batch_intervals
+ )
+ )
+ self.default_batch_intervals = default_batch_intervals
+ self.default_batch_image = default_batch_image
+ self.stream_image = stream_image
+
+ @classmethod
+ def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
+ new_obj = super().from_dict(
+ struct, fields=cls._dict_fields, deprecated_fields=deprecated_fields
+ )
+ # Convert default batch interval into ScheduleCronTrigger object
+ if (
+ mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
+ in struct
+ ):
+ if isinstance(
+ struct[
+ mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
+ ],
+ str,
+ ):
+ new_obj.default_batch_intervals = mlrun.common.schemas.schedule.ScheduleCronTrigger.from_crontab(
+ struct[
+ mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
+ ]
+ )
+ else:
+ new_obj.default_batch_intervals = mlrun.common.schemas.schedule.ScheduleCronTrigger.parse_obj(
+ struct[
+ mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
+ ]
+ )
+ return new_obj
+
+ def to_dict(self, fields=None, exclude=None):
+ struct = super().to_dict(
+ fields,
+ exclude=[
+ mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
+ ],
+ )
+ if self.default_batch_intervals:
+ struct[
+ mlrun.common.schemas.model_monitoring.EventFieldType.DEFAULT_BATCH_INTERVALS
+ ] = self.default_batch_intervals.dict()
+ return struct
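
To put the new TrackingPolicy class in context, here is a hedged usage sketch that exercises only the constructor and the dict round-trip defined above; the crontab string is an illustrative placeholder and the default images are kept.

    from mlrun.model_monitoring.tracking_policy import TrackingPolicy

    # Run the monitoring batch job every 3 hours; the string follows the
    # ScheduleCronTrigger order documented above:
    # minute, hour, day of month, month, day of week.
    policy = TrackingPolicy(default_batch_intervals="0 */3 * * *")

    # Serialize and restore using the to_dict/from_dict overrides from the new file.
    as_dict = policy.to_dict()
    restored = TrackingPolicy.from_dict(as_dict)
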
mlrun/package/packager.py CHANGED
@@ -107,8 +107,7 @@ class Packager(ABC, metaclass=_PackagerMeta):
 
  Preferably, each packager should handle a single type of object.
 
- Linking Artifacts (extra data)
- ------------------------------
+ **Linking Artifacts (extra data)**
 
  In order to link between packages (using the extra data or metrics spec attributes of an artifact), you should use
  the key as if it exists and as value ellipses (...). The manager will link all packages once it is done packing.
@@ -118,8 +117,7 @@ class Packager(ABC, metaclass=_PackagerMeta):
  artifact = Artifact(key="my_artifact")
  artifact.spec.extra_data = {key: ... for key in extra_data}
 
- Clearing Outputs
- ----------------
+ **Clearing Outputs**
 
  Some of the packagers may produce files and temporary directories that should be deleted once done with logging the
  artifact. The packager can mark paths of files and directories to delete after logging using the class method
@@ -131,15 +129,15 @@ class Packager(ABC, metaclass=_PackagerMeta):
  with open("./some_file.txt", "w") as file:
  file.write("Pack me")
  artifact = Artifact(key="my_artifact")
- cls.future_clear(path="./some_file.txt")
+ cls.add_future_clearing_path(path="./some_file.txt")
  return artifact, None
  """
 
- # The type of object this packager can pack and unpack:
+ #: The type of object this packager can pack and unpack.
  PACKABLE_OBJECT_TYPE: Type = ...
 
- # The priority of this packager in the packagers collection of the manager (lower is better)
- PRIORITY = ...
+ #: The priority of this packager in the packagers collection of the manager (lower is better).
+ PRIORITY: int = ...
 
  # List of all paths to be deleted by the manager of this packager post logging the packages:
  _CLEARING_PATH_LIST: List[str] = []
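
The packager.py hunks above rename the cleanup helper referenced in the docstring from future_clear to add_future_clearing_path and switch the class attribute comments to #:-style doc comments. A hedged sketch of a custom packager using the renamed helper follows; the class name, simplified pack signature, and file name are hypothetical, and any other abstract members required by the base class are omitted for brevity.

    from mlrun.artifacts import Artifact
    from mlrun.package.packager import Packager

    class TextFilePackager(Packager):
        # Illustrative attribute values; only the attribute names and the
        # add_future_clearing_path call come from the diff above.
        PACKABLE_OBJECT_TYPE = str
        PRIORITY = 5

        @classmethod
        def pack(cls, obj: str, key: str = "my_artifact"):
            # Write a temporary file, log it as an artifact, and mark the file
            # for deletion once the packaging manager finishes logging.
            with open("./some_file.txt", "w") as file:
                file.write(obj)
            artifact = Artifact(key=key)
            cls.add_future_clearing_path(path="./some_file.txt")
            return artifact, None
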