mlrun 1.6.4rc8__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +40 -122
- mlrun/alerts/__init__.py +15 -0
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +5 -4
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +47 -257
- mlrun/artifacts/dataset.py +11 -192
- mlrun/artifacts/manager.py +79 -47
- mlrun/artifacts/model.py +31 -159
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +74 -1
- mlrun/common/db/sql_session.py +5 -5
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +45 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +33 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +12 -3
- mlrun/common/model_monitoring/helpers.py +9 -5
- mlrun/{runtimes → common/runtimes}/constants.py +37 -9
- mlrun/common/schemas/__init__.py +31 -5
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +196 -0
- mlrun/common/schemas/artifact.py +25 -4
- mlrun/common/schemas/auth.py +16 -5
- mlrun/common/schemas/background_task.py +1 -1
- mlrun/common/schemas/client_spec.py +4 -2
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +74 -44
- mlrun/common/schemas/frontend_spec.py +15 -7
- mlrun/common/schemas/function.py +12 -1
- mlrun/common/schemas/hub.py +11 -18
- mlrun/common/schemas/memory_reports.py +2 -2
- mlrun/common/schemas/model_monitoring/__init__.py +20 -4
- mlrun/common/schemas/model_monitoring/constants.py +123 -42
- mlrun/common/schemas/model_monitoring/grafana.py +13 -9
- mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
- mlrun/common/schemas/notification.py +71 -14
- mlrun/common/schemas/object.py +2 -2
- mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
- mlrun/common/schemas/pipeline.py +8 -1
- mlrun/common/schemas/project.py +69 -18
- mlrun/common/schemas/runs.py +7 -1
- mlrun/common/schemas/runtime_resource.py +8 -12
- mlrun/common/schemas/schedule.py +4 -4
- mlrun/common/schemas/tag.py +1 -2
- mlrun/common/schemas/workflow.py +12 -4
- mlrun/common/types.py +14 -1
- mlrun/config.py +154 -69
- mlrun/data_types/data_types.py +6 -1
- mlrun/data_types/spark.py +2 -2
- mlrun/data_types/to_pandas.py +67 -37
- mlrun/datastore/__init__.py +6 -8
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +143 -42
- mlrun/datastore/base.py +102 -58
- mlrun/datastore/datastore.py +34 -13
- mlrun/datastore/datastore_profile.py +146 -20
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -4
- mlrun/datastore/google_cloud_storage.py +97 -33
- mlrun/datastore/hdfs.py +56 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +7 -2
- mlrun/datastore/s3.py +34 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +303 -111
- mlrun/datastore/spark_utils.py +31 -2
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +453 -176
- mlrun/datastore/utils.py +72 -58
- mlrun/datastore/v3io.py +6 -1
- mlrun/db/base.py +274 -41
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +893 -225
- mlrun/db/nopdb.py +291 -33
- mlrun/errors.py +36 -6
- mlrun/execution.py +115 -42
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +65 -73
- mlrun/feature_store/common.py +7 -12
- mlrun/feature_store/feature_set.py +76 -55
- mlrun/feature_store/feature_vector.py +39 -31
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +16 -11
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +45 -34
- mlrun/features.py +11 -21
- mlrun/frameworks/_common/artifacts_library.py +9 -9
- mlrun/frameworks/_common/mlrun_interface.py +5 -5
- mlrun/frameworks/_common/model_handler.py +48 -48
- mlrun/frameworks/_common/plan.py +5 -6
- mlrun/frameworks/_common/producer.py +3 -4
- mlrun/frameworks/_common/utils.py +5 -5
- mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
- mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
- mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
- mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
- mlrun/frameworks/_ml_common/model_handler.py +24 -24
- mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
- mlrun/frameworks/_ml_common/plan.py +2 -2
- mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
- mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
- mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
- mlrun/frameworks/_ml_common/utils.py +4 -4
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
- mlrun/frameworks/huggingface/model_server.py +4 -4
- mlrun/frameworks/lgbm/__init__.py +33 -33
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
- mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
- mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
- mlrun/frameworks/lgbm/model_handler.py +10 -10
- mlrun/frameworks/lgbm/model_server.py +6 -6
- mlrun/frameworks/lgbm/utils.py +5 -5
- mlrun/frameworks/onnx/dataset.py +8 -8
- mlrun/frameworks/onnx/mlrun_interface.py +3 -3
- mlrun/frameworks/onnx/model_handler.py +6 -6
- mlrun/frameworks/onnx/model_server.py +7 -7
- mlrun/frameworks/parallel_coordinates.py +6 -6
- mlrun/frameworks/pytorch/__init__.py +18 -18
- mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
- mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
- mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
- mlrun/frameworks/pytorch/model_handler.py +17 -17
- mlrun/frameworks/pytorch/model_server.py +7 -7
- mlrun/frameworks/sklearn/__init__.py +13 -13
- mlrun/frameworks/sklearn/estimator.py +4 -4
- mlrun/frameworks/sklearn/metrics_library.py +14 -14
- mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
- mlrun/frameworks/sklearn/model_handler.py +2 -2
- mlrun/frameworks/tf_keras/__init__.py +10 -7
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
- mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
- mlrun/frameworks/tf_keras/model_handler.py +14 -14
- mlrun/frameworks/tf_keras/model_server.py +6 -6
- mlrun/frameworks/xgboost/__init__.py +13 -13
- mlrun/frameworks/xgboost/model_handler.py +6 -6
- mlrun/k8s_utils.py +61 -17
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +16 -15
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +23 -13
- mlrun/launcher/remote.py +17 -10
- mlrun/lists.py +7 -6
- mlrun/model.py +478 -103
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +163 -371
- mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
- mlrun/model_monitoring/applications/_application_steps.py +188 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +131 -278
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +134 -106
- mlrun/model_monitoring/helpers.py +199 -55
- mlrun/model_monitoring/metrics/__init__.py +13 -0
- mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +134 -398
- mlrun/model_monitoring/tracking_policy.py +9 -2
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/__init__.py +6 -6
- mlrun/package/context_handler.py +5 -5
- mlrun/package/packager.py +7 -7
- mlrun/package/packagers/default_packager.py +8 -8
- mlrun/package/packagers/numpy_packagers.py +15 -15
- mlrun/package/packagers/pandas_packagers.py +5 -5
- mlrun/package/packagers/python_standard_library_packagers.py +10 -10
- mlrun/package/packagers_manager.py +19 -23
- mlrun/package/utils/_formatter.py +6 -6
- mlrun/package/utils/_pickler.py +2 -2
- mlrun/package/utils/_supported_format.py +4 -4
- mlrun/package/utils/log_hint_utils.py +2 -2
- mlrun/package/utils/type_hint_utils.py +4 -9
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +24 -203
- mlrun/projects/operations.py +52 -25
- mlrun/projects/pipelines.py +191 -197
- mlrun/projects/project.py +1227 -400
- mlrun/render.py +16 -19
- mlrun/run.py +209 -184
- mlrun/runtimes/__init__.py +83 -15
- mlrun/runtimes/base.py +51 -35
- mlrun/runtimes/daskjob.py +17 -10
- mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
- mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/function_reference.py +1 -1
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +40 -11
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +9 -10
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
- mlrun/runtimes/nuclio/api_gateway.py +769 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
- mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
- mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
- mlrun/runtimes/pod.py +281 -101
- mlrun/runtimes/remotesparkjob.py +12 -9
- mlrun/runtimes/sparkjob/spark3job.py +67 -51
- mlrun/runtimes/utils.py +41 -75
- mlrun/secrets.py +9 -5
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -7
- mlrun/serving/routers.py +85 -69
- mlrun/serving/server.py +69 -44
- mlrun/serving/states.py +209 -36
- mlrun/serving/utils.py +22 -14
- mlrun/serving/v1_serving.py +6 -7
- mlrun/serving/v2_serving.py +133 -54
- mlrun/track/tracker.py +2 -1
- mlrun/track/tracker_manager.py +3 -3
- mlrun/track/trackers/mlflow_tracker.py +6 -2
- mlrun/utils/async_http.py +6 -8
- mlrun/utils/azure_vault.py +1 -1
- mlrun/utils/clones.py +1 -2
- mlrun/utils/condition_evaluator.py +3 -3
- mlrun/utils/db.py +21 -3
- mlrun/utils/helpers.py +405 -225
- mlrun/utils/http.py +3 -6
- mlrun/utils/logger.py +112 -16
- mlrun/utils/notifications/notification/__init__.py +17 -13
- mlrun/utils/notifications/notification/base.py +50 -2
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +3 -1
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +59 -2
- mlrun/utils/notifications/notification_pusher.py +149 -30
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +208 -0
- mlrun/utils/singleton.py +1 -1
- mlrun/utils/v3io_clients.py +4 -6
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +2 -6
- mlrun-1.7.0.dist-info/METADATA +378 -0
- mlrun-1.7.0.dist-info/RECORD +351 -0
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -273
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -1095
- mlrun/model_monitoring/prometheus.py +0 -219
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
- mlrun/platforms/other.py +0 -306
- mlrun-1.6.4rc8.dist-info/METADATA +0 -272
- mlrun-1.6.4rc8.dist-info/RECORD +0 -314
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
- {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
|
@@ -26,10 +26,9 @@ import mlrun.config
|
|
|
26
26
|
import mlrun.datastore.targets
|
|
27
27
|
import mlrun.feature_store as fstore
|
|
28
28
|
import mlrun.feature_store.steps
|
|
29
|
-
import mlrun.model_monitoring.
|
|
29
|
+
import mlrun.model_monitoring.db
|
|
30
30
|
import mlrun.serving.states
|
|
31
31
|
import mlrun.utils
|
|
32
|
-
import mlrun.utils.v3io_clients
|
|
33
32
|
from mlrun.common.schemas.model_monitoring.constants import (
|
|
34
33
|
EventFieldType,
|
|
35
34
|
EventKeyMetrics,
|
|
@@ -38,6 +37,7 @@ from mlrun.common.schemas.model_monitoring.constants import (
|
|
|
38
37
|
ModelEndpointTarget,
|
|
39
38
|
ProjectSecretKeys,
|
|
40
39
|
)
|
|
40
|
+
from mlrun.model_monitoring.db import StoreBase, TSDBConnector
|
|
41
41
|
from mlrun.utils import logger
|
|
42
42
|
|
|
43
43
|
|
|
@@ -49,14 +49,12 @@ class EventStreamProcessor:
|
|
|
49
49
|
parquet_batching_max_events: int,
|
|
50
50
|
parquet_batching_timeout_secs: int,
|
|
51
51
|
parquet_target: str,
|
|
52
|
-
sample_window: int = 10,
|
|
53
52
|
aggregate_windows: typing.Optional[list[str]] = None,
|
|
54
|
-
aggregate_period: str = "
|
|
53
|
+
aggregate_period: str = "5m",
|
|
55
54
|
model_monitoring_access_key: str = None,
|
|
56
55
|
):
|
|
57
56
|
# General configurations, mainly used for the storey steps in the future serving graph
|
|
58
57
|
self.project = project
|
|
59
|
-
self.sample_window = sample_window
|
|
60
58
|
self.aggregate_windows = aggregate_windows or ["5m", "1h"]
|
|
61
59
|
self.aggregate_period = aggregate_period
|
|
62
60
|
|
|
@@ -65,10 +63,6 @@ class EventStreamProcessor:
|
|
|
65
63
|
self.parquet_batching_max_events = parquet_batching_max_events
|
|
66
64
|
self.parquet_batching_timeout_secs = parquet_batching_timeout_secs
|
|
67
65
|
|
|
68
|
-
self.model_endpoint_store_target = (
|
|
69
|
-
mlrun.mlconf.model_endpoint_monitoring.store_type
|
|
70
|
-
)
|
|
71
|
-
|
|
72
66
|
logger.info(
|
|
73
67
|
"Initializing model monitoring event stream processor",
|
|
74
68
|
parquet_path=self.parquet_path,
|
|
@@ -76,6 +70,7 @@ class EventStreamProcessor:
|
|
|
76
70
|
)
|
|
77
71
|
|
|
78
72
|
self.storage_options = None
|
|
73
|
+
self.tsdb_configurations = {}
|
|
79
74
|
if not mlrun.mlconf.is_ce_mode():
|
|
80
75
|
self._initialize_v3io_configurations(
|
|
81
76
|
model_monitoring_access_key=model_monitoring_access_key
|
|
@@ -134,78 +129,83 @@ class EventStreamProcessor:
|
|
|
134
129
|
self.tsdb_batching_max_events = tsdb_batching_max_events
|
|
135
130
|
self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
|
|
136
131
|
|
|
137
|
-
def apply_monitoring_serving_graph(
|
|
132
|
+
def apply_monitoring_serving_graph(
|
|
133
|
+
self,
|
|
134
|
+
fn: mlrun.runtimes.ServingRuntime,
|
|
135
|
+
tsdb_connector: TSDBConnector,
|
|
136
|
+
endpoint_store: StoreBase,
|
|
137
|
+
) -> None:
|
|
138
138
|
"""
|
|
139
|
-
Apply monitoring serving graph to a given serving function. The following serving graph includes about
|
|
140
|
-
of different operations that are executed on the events from
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
139
|
+
Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
|
|
140
|
+
parts, each of which includes several steps of different operations that are executed on the events from
|
|
141
|
+
the model server.
|
|
142
|
+
Each event has metadata (function_uri, timestamp, class, etc.) but also inputs, predictions and optional
|
|
143
|
+
metrics from the model server.
|
|
144
|
+
In the first part, the serving graph processes the event and splits it into sub-events. This part also includes
|
|
145
|
+
validation of the event data and adding important details to the event such as endpoint_id.
|
|
146
|
+
In the next parts, the serving graph stores data to 3 different targets:
|
|
147
|
+
1. KV/SQL: Metadata and basic stats about the average latency and the amount of predictions over
|
|
148
|
+
time per endpoint. For example, the amount of predictions of endpoint x in the last 5 min. The model
|
|
149
|
+
endpoints table also contains data on the model endpoint from other processes, such as feature_stats that
|
|
150
|
+
represents sample statistics from the training data. If the target is of type KV, then the model endpoints
|
|
151
|
+
table can be found under v3io:///users/pipelines/project-name/model-endpoints/endpoints/. If the target is
|
|
152
|
+
SQL, then the table is stored within the database that was defined in the provided connection string.
|
|
153
|
+
2. TSDB: live data of different key metric dictionaries in tsdb target.
|
|
154
|
+
This data is being used by the monitoring dashboards in grafana. If using V3IO TSDB, results
|
|
153
155
|
can be found under v3io:///users/pipelines/project-name/model-endpoints/events/. In that case, we generate
|
|
154
156
|
3 different key metric dictionaries: base_metrics (average latency and predictions over time),
|
|
155
157
|
endpoint_features (Prediction and feature names and values), and custom_metrics (user-defined metrics).
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise, the default parquet path is under
|
|
161
|
-
mlrun.mlconf.model_endpoint_monitoring.user_space.
|
|
158
|
+
3. Parquet: This Parquet file includes the required data for the model monitoring applications. If defined,
|
|
159
|
+
the parquet target path can be found under mlrun.mlconf.model_endpoint_monitoring.offline. Otherwise,
|
|
160
|
+
the default parquet path is under mlrun.mlconf.model_endpoint_monitoring.user_space. Note that if you are
|
|
161
|
+
using CE, the parquet target path is based on the defined MLRun artifact path.
|
|
162
162
|
|
|
163
163
|
:param fn: A serving function.
|
|
164
|
+
:param tsdb_connector: Time series database connector.
|
|
165
|
+
:param endpoint_store: KV/SQL store used for endpoint data.
|
|
164
166
|
"""
|
|
165
167
|
|
|
166
168
|
graph = typing.cast(
|
|
167
169
|
mlrun.serving.states.RootFlowStep,
|
|
168
170
|
fn.set_topology(mlrun.serving.states.StepKinds.flow),
|
|
169
171
|
)
|
|
172
|
+
graph.add_step(
|
|
173
|
+
"ExtractEndpointID",
|
|
174
|
+
"extract_endpoint",
|
|
175
|
+
full_event=True,
|
|
176
|
+
)
|
|
170
177
|
|
|
171
|
-
#
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
project=self.project,
|
|
179
|
-
),
|
|
180
|
-
).respond()
|
|
181
|
-
|
|
182
|
-
apply_event_routing()
|
|
178
|
+
# split the graph between event with error vs valid event
|
|
179
|
+
graph.add_step(
|
|
180
|
+
"storey.Filter",
|
|
181
|
+
"FilterError",
|
|
182
|
+
after="extract_endpoint",
|
|
183
|
+
_fn="(event.get('error') is None)",
|
|
184
|
+
)
|
|
183
185
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
"filter_stream_event",
|
|
191
|
-
_fn="('-' not in event.path.split('/')[-1])",
|
|
192
|
-
full_event=True,
|
|
193
|
-
)
|
|
186
|
+
graph.add_step(
|
|
187
|
+
"storey.Filter",
|
|
188
|
+
"ForwardError",
|
|
189
|
+
after="extract_endpoint",
|
|
190
|
+
_fn="(event.get('error') is not None)",
|
|
191
|
+
)
|
|
194
192
|
|
|
195
|
-
|
|
193
|
+
tsdb_connector.handle_model_error(
|
|
194
|
+
graph,
|
|
195
|
+
)
|
|
196
196
|
|
|
197
|
-
#
|
|
197
|
+
# Process endpoint event: split into sub-events and validate event data
|
|
198
198
|
def apply_process_endpoint_event():
|
|
199
199
|
graph.add_step(
|
|
200
200
|
"ProcessEndpointEvent",
|
|
201
|
+
after="extract_endpoint", # TODO: change this to FilterError in ML-7456
|
|
201
202
|
full_event=True,
|
|
202
203
|
project=self.project,
|
|
203
|
-
after="filter_stream_event",
|
|
204
204
|
)
|
|
205
205
|
|
|
206
206
|
apply_process_endpoint_event()
|
|
207
207
|
|
|
208
|
-
#
|
|
208
|
+
# Applying Storey operations of filtering and flattening
|
|
209
209
|
def apply_storey_filter_and_flatmap():
|
|
210
210
|
# Remove none values from each event
|
|
211
211
|
graph.add_step(
|
|
@@ -222,7 +222,7 @@ class EventStreamProcessor:
|
|
|
222
222
|
|
|
223
223
|
apply_storey_filter_and_flatmap()
|
|
224
224
|
|
|
225
|
-
#
|
|
225
|
+
# Validate feature names and map each feature to its value
|
|
226
226
|
def apply_map_feature_names():
|
|
227
227
|
graph.add_step(
|
|
228
228
|
"MapFeatureNames",
|
|
@@ -234,9 +234,9 @@ class EventStreamProcessor:
|
|
|
234
234
|
|
|
235
235
|
apply_map_feature_names()
|
|
236
236
|
|
|
237
|
-
#
|
|
237
|
+
# Calculate number of predictions and average latency
|
|
238
238
|
def apply_storey_aggregations():
|
|
239
|
-
#
|
|
239
|
+
# Calculate number of predictions for each window (5 min and 1 hour by default)
|
|
240
240
|
graph.add_step(
|
|
241
241
|
class_name="storey.AggregateByKey",
|
|
242
242
|
aggregates=[
|
|
@@ -254,7 +254,7 @@ class EventStreamProcessor:
|
|
|
254
254
|
table=".",
|
|
255
255
|
key_field=EventFieldType.ENDPOINT_ID,
|
|
256
256
|
)
|
|
257
|
-
#
|
|
257
|
+
# Calculate average latency time for each window (5 min and 1 hour by default)
|
|
258
258
|
graph.add_step(
|
|
259
259
|
class_name="storey.Rename",
|
|
260
260
|
mapping={
|
|
@@ -267,8 +267,8 @@ class EventStreamProcessor:
|
|
|
267
267
|
|
|
268
268
|
apply_storey_aggregations()
|
|
269
269
|
|
|
270
|
-
#
|
|
271
|
-
#
|
|
270
|
+
# KV/SQL branch
|
|
271
|
+
# Filter relevant keys from the event before writing the data into the database table
|
|
272
272
|
def apply_process_before_endpoint_update():
|
|
273
273
|
graph.add_step(
|
|
274
274
|
"ProcessBeforeEndpointUpdate",
|
|
@@ -278,7 +278,7 @@ class EventStreamProcessor:
|
|
|
278
278
|
|
|
279
279
|
apply_process_before_endpoint_update()
|
|
280
280
|
|
|
281
|
-
#
|
|
281
|
+
# Write the filtered event to KV/SQL table. At this point, the serving graph updates the stats
|
|
282
282
|
# about average latency and the amount of predictions over time
|
|
283
283
|
def apply_update_endpoint():
|
|
284
284
|
graph.add_step(
|
|
@@ -286,12 +286,11 @@ class EventStreamProcessor:
|
|
|
286
286
|
name="UpdateEndpoint",
|
|
287
287
|
after="ProcessBeforeEndpointUpdate",
|
|
288
288
|
project=self.project,
|
|
289
|
-
model_endpoint_store_target=self.model_endpoint_store_target,
|
|
290
289
|
)
|
|
291
290
|
|
|
292
291
|
apply_update_endpoint()
|
|
293
292
|
|
|
294
|
-
#
|
|
293
|
+
# (only for V3IO KV target) - Apply infer_schema on the model endpoints table for generating schema file
|
|
295
294
|
# which will be used by Grafana monitoring dashboards
|
|
296
295
|
def apply_infer_schema():
|
|
297
296
|
graph.add_step(
|
|
@@ -303,119 +302,13 @@ class EventStreamProcessor:
|
|
|
303
302
|
table=self.kv_path,
|
|
304
303
|
)
|
|
305
304
|
|
|
306
|
-
if
|
|
305
|
+
if endpoint_store.type == ModelEndpointTarget.V3IO_NOSQL:
|
|
307
306
|
apply_infer_schema()
|
|
308
307
|
|
|
309
|
-
|
|
310
|
-
def apply_storey_sample_window():
|
|
311
|
-
graph.add_step(
|
|
312
|
-
"storey.steps.SampleWindow",
|
|
313
|
-
name="sample",
|
|
314
|
-
after="Rename",
|
|
315
|
-
window_size=self.sample_window,
|
|
316
|
-
key=EventFieldType.ENDPOINT_ID,
|
|
317
|
-
)
|
|
318
|
-
|
|
319
|
-
apply_storey_sample_window()
|
|
320
|
-
|
|
321
|
-
# Steps 12-19 - TSDB branch (skip to Prometheus if in CE env)
|
|
322
|
-
# Steps 20-21 - Prometheus branch
|
|
323
|
-
if not mlrun.mlconf.is_ce_mode():
|
|
324
|
-
# TSDB branch
|
|
325
|
-
|
|
326
|
-
# Step 12 - Before writing data to TSDB, create dictionary of 2-3 dictionaries that contains
|
|
327
|
-
# stats and details about the events
|
|
328
|
-
def apply_process_before_tsdb():
|
|
329
|
-
graph.add_step(
|
|
330
|
-
"ProcessBeforeTSDB", name="ProcessBeforeTSDB", after="sample"
|
|
331
|
-
)
|
|
332
|
-
|
|
333
|
-
apply_process_before_tsdb()
|
|
334
|
-
|
|
335
|
-
# Steps 13-19: - Unpacked keys from each dictionary and write to TSDB target
|
|
336
|
-
def apply_filter_and_unpacked_keys(name, keys):
|
|
337
|
-
graph.add_step(
|
|
338
|
-
"FilterAndUnpackKeys",
|
|
339
|
-
name=name,
|
|
340
|
-
after="ProcessBeforeTSDB",
|
|
341
|
-
keys=[keys],
|
|
342
|
-
)
|
|
343
|
-
|
|
344
|
-
def apply_tsdb_target(name, after):
|
|
345
|
-
graph.add_step(
|
|
346
|
-
"storey.TSDBTarget",
|
|
347
|
-
name=name,
|
|
348
|
-
after=after,
|
|
349
|
-
path=self.tsdb_path,
|
|
350
|
-
rate="10/m",
|
|
351
|
-
time_col=EventFieldType.TIMESTAMP,
|
|
352
|
-
container=self.tsdb_container,
|
|
353
|
-
v3io_frames=self.v3io_framesd,
|
|
354
|
-
infer_columns_from_data=True,
|
|
355
|
-
index_cols=[
|
|
356
|
-
EventFieldType.ENDPOINT_ID,
|
|
357
|
-
EventFieldType.RECORD_TYPE,
|
|
358
|
-
EventFieldType.ENDPOINT_TYPE,
|
|
359
|
-
],
|
|
360
|
-
max_events=self.tsdb_batching_max_events,
|
|
361
|
-
flush_after_seconds=self.tsdb_batching_timeout_secs,
|
|
362
|
-
key=EventFieldType.ENDPOINT_ID,
|
|
363
|
-
)
|
|
364
|
-
|
|
365
|
-
# Steps 13-14 - unpacked base_metrics dictionary
|
|
366
|
-
apply_filter_and_unpacked_keys(
|
|
367
|
-
name="FilterAndUnpackKeys1",
|
|
368
|
-
keys=EventKeyMetrics.BASE_METRICS,
|
|
369
|
-
)
|
|
370
|
-
apply_tsdb_target(name="tsdb1", after="FilterAndUnpackKeys1")
|
|
371
|
-
|
|
372
|
-
# Steps 15-16 - unpacked endpoint_features dictionary
|
|
373
|
-
apply_filter_and_unpacked_keys(
|
|
374
|
-
name="FilterAndUnpackKeys2",
|
|
375
|
-
keys=EventKeyMetrics.ENDPOINT_FEATURES,
|
|
376
|
-
)
|
|
377
|
-
apply_tsdb_target(name="tsdb2", after="FilterAndUnpackKeys2")
|
|
378
|
-
|
|
379
|
-
# Steps 17-19 - unpacked custom_metrics dictionary. In addition, use storey.Filter remove none values
|
|
380
|
-
apply_filter_and_unpacked_keys(
|
|
381
|
-
name="FilterAndUnpackKeys3",
|
|
382
|
-
keys=EventKeyMetrics.CUSTOM_METRICS,
|
|
383
|
-
)
|
|
384
|
-
|
|
385
|
-
def apply_storey_filter():
|
|
386
|
-
graph.add_step(
|
|
387
|
-
"storey.Filter",
|
|
388
|
-
"FilterNotNone",
|
|
389
|
-
after="FilterAndUnpackKeys3",
|
|
390
|
-
_fn="(event is not None)",
|
|
391
|
-
)
|
|
392
|
-
|
|
393
|
-
apply_storey_filter()
|
|
394
|
-
apply_tsdb_target(name="tsdb3", after="FilterNotNone")
|
|
395
|
-
else:
|
|
396
|
-
# Prometheus branch
|
|
397
|
-
|
|
398
|
-
# Step 20 - Increase the prediction counter by 1 and update the latency value
|
|
399
|
-
graph.add_step(
|
|
400
|
-
"IncCounter",
|
|
401
|
-
name="IncCounter",
|
|
402
|
-
after="MapFeatureNames",
|
|
403
|
-
project=self.project,
|
|
404
|
-
)
|
|
405
|
-
|
|
406
|
-
# Step 21 - Record a sample of features and labels
|
|
407
|
-
def apply_record_features_to_prometheus():
|
|
408
|
-
graph.add_step(
|
|
409
|
-
"RecordFeatures",
|
|
410
|
-
name="RecordFeaturesToPrometheus",
|
|
411
|
-
after="sample",
|
|
412
|
-
project=self.project,
|
|
413
|
-
)
|
|
414
|
-
|
|
415
|
-
apply_record_features_to_prometheus()
|
|
308
|
+
tsdb_connector.apply_monitoring_stream_steps(graph=graph)
|
|
416
309
|
|
|
417
|
-
#
|
|
418
|
-
#
|
|
310
|
+
# Parquet branch
|
|
311
|
+
# Filter and validate different keys before writing the data to Parquet target
|
|
419
312
|
def apply_process_before_parquet():
|
|
420
313
|
graph.add_step(
|
|
421
314
|
"ProcessBeforeParquet",
|
|
@@ -426,7 +319,7 @@ class EventStreamProcessor:
|
|
|
426
319
|
|
|
427
320
|
apply_process_before_parquet()
|
|
428
321
|
|
|
429
|
-
#
|
|
322
|
+
# Write the Parquet target file, partitioned by key (endpoint_id) and time.
|
|
430
323
|
def apply_parquet_target():
|
|
431
324
|
graph.add_step(
|
|
432
325
|
"storey.ParquetTarget",
|
|
@@ -441,6 +334,7 @@ class EventStreamProcessor:
|
|
|
441
334
|
index_cols=[EventFieldType.ENDPOINT_ID],
|
|
442
335
|
key_bucketing_number=0,
|
|
443
336
|
time_partitioning_granularity="hour",
|
|
337
|
+
time_field=EventFieldType.TIMESTAMP,
|
|
444
338
|
partition_cols=["$key", "$year", "$month", "$day", "$hour"],
|
|
445
339
|
)
|
|
446
340
|
|
|
@@ -500,74 +394,36 @@ class ProcessBeforeEndpointUpdate(mlrun.feature_store.steps.MapClass):
|
|
|
500
394
|
return e
|
|
501
395
|
|
|
502
396
|
|
|
503
|
-
class
|
|
504
|
-
def __init__(self, **kwargs):
|
|
397
|
+
class ExtractEndpointID(mlrun.feature_store.steps.MapClass):
|
|
398
|
+
def __init__(self, **kwargs) -> None:
|
|
505
399
|
"""
|
|
506
|
-
|
|
507
|
-
that each one of them contains important details and stats about the events:
|
|
508
|
-
1. base_metrics: stats about the average latency and the amount of predictions over time. It is based on
|
|
509
|
-
storey.AggregateByKey which was executed in step 5.
|
|
510
|
-
2. endpoint_features: feature names and values along with the prediction names and value.
|
|
511
|
-
3. custom_metric (opt): optional metrics provided by the user.
|
|
512
|
-
|
|
513
|
-
:returns: Dictionary of 2-3 dictionaries that contains stats and details about the events.
|
|
514
|
-
|
|
400
|
+
Generate the model endpoint ID based on the event parameters and attach it to the event.
|
|
515
401
|
"""
|
|
516
402
|
super().__init__(**kwargs)
|
|
517
403
|
|
|
518
|
-
def do(self,
|
|
519
|
-
#
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
)
|
|
523
|
-
|
|
524
|
-
EventFieldType.TIMESTAMP,
|
|
525
|
-
EventFieldType.ENDPOINT_ID,
|
|
526
|
-
EventFieldType.ENDPOINT_TYPE,
|
|
527
|
-
]
|
|
404
|
+
def do(self, full_event) -> typing.Union[storey.Event, None]:
|
|
405
|
+
# Getting model version and function uri from event
|
|
406
|
+
# and use them for retrieving the endpoint_id
|
|
407
|
+
function_uri = full_event.body.get(EventFieldType.FUNCTION_URI)
|
|
408
|
+
if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
|
|
409
|
+
return None
|
|
528
410
|
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
# base_metrics includes the stats about the average latency and the amount of predictions over time
|
|
533
|
-
base_metrics = {
|
|
534
|
-
EventFieldType.RECORD_TYPE: EventKeyMetrics.BASE_METRICS,
|
|
535
|
-
EventLiveStats.PREDICTIONS_PER_SECOND: event[
|
|
536
|
-
EventLiveStats.PREDICTIONS_PER_SECOND
|
|
537
|
-
],
|
|
538
|
-
EventLiveStats.PREDICTIONS_COUNT_5M: event[
|
|
539
|
-
EventLiveStats.PREDICTIONS_COUNT_5M
|
|
540
|
-
],
|
|
541
|
-
EventLiveStats.PREDICTIONS_COUNT_1H: event[
|
|
542
|
-
EventLiveStats.PREDICTIONS_COUNT_1H
|
|
543
|
-
],
|
|
544
|
-
EventLiveStats.LATENCY_AVG_5M: event[EventLiveStats.LATENCY_AVG_5M],
|
|
545
|
-
EventLiveStats.LATENCY_AVG_1H: event[EventLiveStats.LATENCY_AVG_1H],
|
|
546
|
-
**base_event,
|
|
547
|
-
}
|
|
411
|
+
model = full_event.body.get(EventFieldType.MODEL)
|
|
412
|
+
if not is_not_none(model, [EventFieldType.MODEL]):
|
|
413
|
+
return None
|
|
548
414
|
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
EventFieldType.RECORD_TYPE: EventKeyMetrics.ENDPOINT_FEATURES,
|
|
552
|
-
**event[EventFieldType.NAMED_PREDICTIONS],
|
|
553
|
-
**event[EventFieldType.NAMED_FEATURES],
|
|
554
|
-
**base_event,
|
|
555
|
-
}
|
|
556
|
-
# Create a dictionary that includes both base_metrics and endpoint_features
|
|
557
|
-
processed = {
|
|
558
|
-
EventKeyMetrics.BASE_METRICS: base_metrics,
|
|
559
|
-
EventKeyMetrics.ENDPOINT_FEATURES: endpoint_features,
|
|
560
|
-
}
|
|
415
|
+
version = full_event.body.get(EventFieldType.VERSION)
|
|
416
|
+
versioned_model = f"{model}:{version}" if version else f"{model}:latest"
|
|
561
417
|
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
**event[EventFieldType.METRICS],
|
|
567
|
-
**base_event,
|
|
568
|
-
}
|
|
418
|
+
endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
|
|
419
|
+
function_uri=function_uri,
|
|
420
|
+
versioned_model=versioned_model,
|
|
421
|
+
)
|
|
569
422
|
|
|
570
|
-
|
|
423
|
+
endpoint_id = str(endpoint_id)
|
|
424
|
+
full_event.body[EventFieldType.ENDPOINT_ID] = endpoint_id
|
|
425
|
+
full_event.body[EventFieldType.VERSIONED_MODEL] = versioned_model
|
|
426
|
+
return full_event
|
|
571
427
|
|
|
572
428
|
|
|
573
429
|
class ProcessBeforeParquet(mlrun.feature_store.steps.MapClass):
|
|
@@ -643,28 +499,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
643
499
|
def do(self, full_event):
|
|
644
500
|
event = full_event.body
|
|
645
501
|
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
function_uri = event
|
|
649
|
-
if not is_not_none(function_uri, [EventFieldType.FUNCTION_URI]):
|
|
650
|
-
return None
|
|
651
|
-
|
|
652
|
-
model = event.get(EventFieldType.MODEL)
|
|
653
|
-
if not is_not_none(model, [EventFieldType.MODEL]):
|
|
654
|
-
return None
|
|
655
|
-
|
|
656
|
-
version = event.get(EventFieldType.VERSION)
|
|
657
|
-
versioned_model = f"{model}:{version}" if version else f"{model}:latest"
|
|
658
|
-
|
|
659
|
-
endpoint_id = mlrun.common.model_monitoring.create_model_endpoint_uid(
|
|
660
|
-
function_uri=function_uri,
|
|
661
|
-
versioned_model=versioned_model,
|
|
662
|
-
)
|
|
663
|
-
|
|
664
|
-
endpoint_id = str(endpoint_id)
|
|
665
|
-
|
|
666
|
-
event[EventFieldType.VERSIONED_MODEL] = versioned_model
|
|
667
|
-
event[EventFieldType.ENDPOINT_ID] = endpoint_id
|
|
502
|
+
versioned_model = event[EventFieldType.VERSIONED_MODEL]
|
|
503
|
+
endpoint_id = event[EventFieldType.ENDPOINT_ID]
|
|
504
|
+
function_uri = event[EventFieldType.FUNCTION_URI]
|
|
668
505
|
|
|
669
506
|
# In case this process fails, resume state from existing record
|
|
670
507
|
self.resume_state(endpoint_id)
|
|
@@ -672,13 +509,8 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
672
509
|
# If error key has been found in the current event,
|
|
673
510
|
# increase the error counter by 1 and raise the error description
|
|
674
511
|
error = event.get("error")
|
|
675
|
-
if error:
|
|
512
|
+
if error: # TODO: delete this in ML-7456
|
|
676
513
|
self.error_count[endpoint_id] += 1
|
|
677
|
-
mlrun.model_monitoring.prometheus.write_errors(
|
|
678
|
-
project=self.project,
|
|
679
|
-
endpoint_id=event["endpoint_id"],
|
|
680
|
-
model_name=event["model"],
|
|
681
|
-
)
|
|
682
514
|
raise mlrun.errors.MLRunInvalidArgumentError(str(error))
|
|
683
515
|
|
|
684
516
|
# Validate event fields
|
|
@@ -745,6 +577,26 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
745
577
|
|
|
746
578
|
# Separate each model invocation into sub events that will be stored as dictionary
|
|
747
579
|
# in list of events. This list will be used as the body for the storey event.
|
|
580
|
+
if not isinstance(features, list):
|
|
581
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
582
|
+
"Model's inputs must be a list"
|
|
583
|
+
)
|
|
584
|
+
features = (
|
|
585
|
+
features
|
|
586
|
+
if not any(not isinstance(feat, list) for feat in features)
|
|
587
|
+
else [features]
|
|
588
|
+
)
|
|
589
|
+
if not isinstance(predictions, list):
|
|
590
|
+
predictions = [[predictions]]
|
|
591
|
+
elif isinstance(predictions, list) and len(predictions) == len(features):
|
|
592
|
+
pass # predictions are already in the right format
|
|
593
|
+
else:
|
|
594
|
+
predictions = (
|
|
595
|
+
predictions
|
|
596
|
+
if not any(not isinstance(pred, list) for pred in predictions)
|
|
597
|
+
else [predictions]
|
|
598
|
+
)
|
|
599
|
+
|
|
748
600
|
events = []
|
|
749
601
|
for i, (feature, prediction) in enumerate(zip(features, predictions)):
|
|
750
602
|
if not isinstance(prediction, list):
|
|
@@ -766,6 +618,9 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
766
618
|
EventFieldType.PREDICTION: prediction,
|
|
767
619
|
EventFieldType.FIRST_REQUEST: self.first_request[endpoint_id],
|
|
768
620
|
EventFieldType.LAST_REQUEST: self.last_request[endpoint_id],
|
|
621
|
+
EventFieldType.LAST_REQUEST_TIMESTAMP: mlrun.utils.enrich_datetime_with_tz_info(
|
|
622
|
+
self.last_request[endpoint_id]
|
|
623
|
+
).timestamp(),
|
|
769
624
|
EventFieldType.ERROR_COUNT: self.error_count[endpoint_id],
|
|
770
625
|
EventFieldType.LABELS: event.get(EventFieldType.LABELS, {}),
|
|
771
626
|
EventFieldType.METRICS: event.get(EventFieldType.METRICS, {}),
|
|
@@ -804,7 +659,7 @@ class ProcessEndpointEvent(mlrun.feature_store.steps.MapClass):
|
|
|
804
659
|
# left them
|
|
805
660
|
if endpoint_id not in self.endpoints:
|
|
806
661
|
logger.info("Trying to resume state", endpoint_id=endpoint_id)
|
|
807
|
-
endpoint_record = get_endpoint_record(
|
|
662
|
+
endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
|
|
808
663
|
project=self.project,
|
|
809
664
|
endpoint_id=endpoint_id,
|
|
810
665
|
)
|
|
@@ -850,36 +705,6 @@ def is_not_none(field: typing.Any, dict_path: list[str]):
|
|
|
850
705
|
return False
|
|
851
706
|
|
|
852
707
|
|
|
853
|
-
class FilterAndUnpackKeys(mlrun.feature_store.steps.MapClass):
|
|
854
|
-
def __init__(self, keys, **kwargs):
|
|
855
|
-
"""
|
|
856
|
-
Create unpacked event dictionary based on provided key metrics (base_metrics, endpoint_features,
|
|
857
|
-
or custom_metric). Please note that the next step of the TSDB target requires an unpacked dictionary.
|
|
858
|
-
|
|
859
|
-
:param keys: list of key metrics.
|
|
860
|
-
|
|
861
|
-
:returns: An unpacked dictionary of event filtered by the provided key metrics.
|
|
862
|
-
"""
|
|
863
|
-
super().__init__(**kwargs)
|
|
864
|
-
self.keys = keys
|
|
865
|
-
|
|
866
|
-
def do(self, event):
|
|
867
|
-
# Keep only the relevant dictionary based on the provided keys
|
|
868
|
-
new_event = {}
|
|
869
|
-
for key in self.keys:
|
|
870
|
-
if key in event:
|
|
871
|
-
new_event[key] = event[key]
|
|
872
|
-
|
|
873
|
-
# Create unpacked dictionary
|
|
874
|
-
unpacked = {}
|
|
875
|
-
for key in new_event.keys():
|
|
876
|
-
if key in self.keys:
|
|
877
|
-
unpacked = {**unpacked, **new_event[key]}
|
|
878
|
-
else:
|
|
879
|
-
unpacked[key] = new_event[key]
|
|
880
|
-
return unpacked if unpacked else None
|
|
881
|
-
|
|
882
|
-
|
|
883
708
|
class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
884
709
|
def __init__(
|
|
885
710
|
self,
|
|
@@ -935,9 +760,15 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
935
760
|
|
|
936
761
|
feature_values = event[EventFieldType.FEATURES]
|
|
937
762
|
label_values = event[EventFieldType.PREDICTION]
|
|
763
|
+
|
|
764
|
+
for index in range(len(feature_values)):
|
|
765
|
+
feature_value = feature_values[index]
|
|
766
|
+
if isinstance(feature_value, int):
|
|
767
|
+
feature_values[index] = float(feature_value)
|
|
768
|
+
|
|
938
769
|
# Get feature names and label columns
|
|
939
770
|
if endpoint_id not in self.feature_names:
|
|
940
|
-
endpoint_record = get_endpoint_record(
|
|
771
|
+
endpoint_record = mlrun.model_monitoring.helpers.get_endpoint_record(
|
|
941
772
|
project=self.project,
|
|
942
773
|
endpoint_id=endpoint_id,
|
|
943
774
|
)
|
|
@@ -1065,7 +896,7 @@ class MapFeatureNames(mlrun.feature_store.steps.MapClass):
|
|
|
1065
896
|
|
|
1066
897
|
|
|
1067
898
|
class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
|
|
1068
|
-
def __init__(self, project: str,
|
|
899
|
+
def __init__(self, project: str, **kwargs):
|
|
1069
900
|
"""
|
|
1070
901
|
Update the model endpoint record in the DB. Note that the event at this point includes metadata and stats about
|
|
1071
902
|
the average latency and the amount of predictions over time. This data will be used in the monitoring dashboards
|
|
@@ -1075,9 +906,11 @@ class UpdateEndpoint(mlrun.feature_store.steps.MapClass):
|
|
|
1075
906
|
"""
|
|
1076
907
|
super().__init__(**kwargs)
|
|
1077
908
|
self.project = project
|
|
1078
|
-
self.model_endpoint_store_target = model_endpoint_store_target
|
|
1079
909
|
|
|
1080
910
|
def do(self, event: dict):
|
|
911
|
+
# Remove labels from the event
|
|
912
|
+
event.pop(EventFieldType.LABELS)
|
|
913
|
+
|
|
1081
914
|
update_endpoint_record(
|
|
1082
915
|
project=self.project,
|
|
1083
916
|
endpoint_id=event.pop(EventFieldType.ENDPOINT_ID),
|
|
@@ -1115,6 +948,8 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
|
|
|
1115
948
|
def do(self, event: dict):
|
|
1116
949
|
key_set = set(event.keys())
|
|
1117
950
|
if not key_set.issubset(self.keys):
|
|
951
|
+
import mlrun.utils.v3io_clients
|
|
952
|
+
|
|
1118
953
|
self.keys.update(key_set)
|
|
1119
954
|
# Apply infer_schema on the kv table for generating the schema file
|
|
1120
955
|
mlrun.utils.v3io_clients.get_frames_client(
|
|
@@ -1125,104 +960,12 @@ class InferSchema(mlrun.feature_store.steps.MapClass):
|
|
|
1125
960
|
return event
|
|
1126
961
|
|
|
1127
962
|
|
|
1128
|
-
class EventRouting(mlrun.feature_store.steps.MapClass):
|
|
1129
|
-
"""
|
|
1130
|
-
Router the event according to the configured path under event.path. Please note that this step returns the result
|
|
1131
|
-
to the caller. At the moment there are several paths:
|
|
1132
|
-
|
|
1133
|
-
- /model-monitoring-metrics (GET): return Prometheus registry results as a text. Will be used by Prometheus client
|
|
1134
|
-
to scrape the results from the monitoring stream memory.
|
|
1135
|
-
|
|
1136
|
-
- /monitoring-batch-metrics (POST): update the Prometheus registry with the provided statistical metrics such as the
|
|
1137
|
-
statistical metrics from the monitoring batch job. Note that the event body is a list of dictionaries of different
|
|
1138
|
-
metrics.
|
|
1139
|
-
|
|
1140
|
-
- /monitoring-drift-status (POST): update the Prometheus registry with the provided model drift status.
|
|
1141
|
-
|
|
1142
|
-
"""
|
|
1143
|
-
|
|
1144
|
-
def __init__(
|
|
1145
|
-
self,
|
|
1146
|
-
project: str,
|
|
1147
|
-
**kwargs,
|
|
1148
|
-
):
|
|
1149
|
-
super().__init__(**kwargs)
|
|
1150
|
-
self.project: str = project
|
|
1151
|
-
|
|
1152
|
-
def do(self, event):
|
|
1153
|
-
if event.path == "/model-monitoring-metrics":
|
|
1154
|
-
# Return a parsed Prometheus registry file
|
|
1155
|
-
event.body = mlrun.model_monitoring.prometheus.get_registry()
|
|
1156
|
-
elif event.path == "/monitoring-batch-metrics":
|
|
1157
|
-
# Update statistical metrics
|
|
1158
|
-
for event_metric in event.body:
|
|
1159
|
-
mlrun.model_monitoring.prometheus.write_drift_metrics(
|
|
1160
|
-
project=self.project,
|
|
1161
|
-
endpoint_id=event_metric[EventFieldType.ENDPOINT_ID],
|
|
1162
|
-
metric=event_metric[EventFieldType.METRIC],
|
|
1163
|
-
value=event_metric[EventFieldType.VALUE],
|
|
1164
|
-
)
|
|
1165
|
-
elif event.path == "/monitoring-drift-status":
|
|
1166
|
-
# Update drift status
|
|
1167
|
-
mlrun.model_monitoring.prometheus.write_drift_status(
|
|
1168
|
-
project=self.project,
|
|
1169
|
-
endpoint_id=event.body[EventFieldType.ENDPOINT_ID],
|
|
1170
|
-
drift_status=event.body[EventFieldType.DRIFT_STATUS],
|
|
1171
|
-
)
|
|
1172
|
-
|
|
1173
|
-
return event
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
class IncCounter(mlrun.feature_store.steps.MapClass):
|
|
1177
|
-
"""Increase prediction counter by 1 and update the total latency value"""
|
|
1178
|
-
|
|
1179
|
-
def __init__(self, project: str, **kwargs):
|
|
1180
|
-
super().__init__(**kwargs)
|
|
1181
|
-
self.project: str = project
|
|
1182
|
-
|
|
1183
|
-
def do(self, event):
|
|
1184
|
-
# Compute prediction per second
|
|
1185
|
-
|
|
1186
|
-
mlrun.model_monitoring.prometheus.write_predictions_and_latency_metrics(
|
|
1187
|
-
project=self.project,
|
|
1188
|
-
endpoint_id=event[EventFieldType.ENDPOINT_ID],
|
|
1189
|
-
latency=event[EventFieldType.LATENCY],
|
|
1190
|
-
model_name=event[EventFieldType.MODEL],
|
|
1191
|
-
endpoint_type=event[EventFieldType.ENDPOINT_TYPE],
|
|
1192
|
-
)
|
|
1193
|
-
|
|
1194
|
-
return event
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
class RecordFeatures(mlrun.feature_store.steps.MapClass):
|
|
1198
|
-
"""Record a sample of features and labels in Prometheus registry"""
|
|
1199
|
-
|
|
1200
|
-
def __init__(self, project: str, **kwargs):
|
|
1201
|
-
super().__init__(**kwargs)
|
|
1202
|
-
self.project: str = project
|
|
1203
|
-
|
|
1204
|
-
def do(self, event):
|
|
1205
|
-
# Generate a dictionary of features and predictions
|
|
1206
|
-
features = {
|
|
1207
|
-
**event[EventFieldType.NAMED_PREDICTIONS],
|
|
1208
|
-
**event[EventFieldType.NAMED_FEATURES],
|
|
1209
|
-
}
|
|
1210
|
-
|
|
1211
|
-
mlrun.model_monitoring.prometheus.write_income_features(
|
|
1212
|
-
project=self.project,
|
|
1213
|
-
endpoint_id=event[EventFieldType.ENDPOINT_ID],
|
|
1214
|
-
features=features,
|
|
1215
|
-
)
|
|
1216
|
-
|
|
1217
|
-
return event
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
963
|
def update_endpoint_record(
|
|
1221
964
|
project: str,
|
|
1222
965
|
endpoint_id: str,
|
|
1223
966
|
attributes: dict,
|
|
1224
967
|
):
|
|
1225
|
-
model_endpoint_store = mlrun.model_monitoring.
|
|
968
|
+
model_endpoint_store = mlrun.model_monitoring.get_store_object(
|
|
1226
969
|
project=project,
|
|
1227
970
|
)
|
|
1228
971
|
|
|
@@ -1231,13 +974,6 @@ def update_endpoint_record(
|
|
|
1231
974
|
)
|
|
1232
975
|
|
|
1233
976
|
|
|
1234
|
-
def get_endpoint_record(project: str, endpoint_id: str):
|
|
1235
|
-
model_endpoint_store = mlrun.model_monitoring.get_model_endpoint_store(
|
|
1236
|
-
project=project,
|
|
1237
|
-
)
|
|
1238
|
-
return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
977
|
def update_monitoring_feature_set(
|
|
1242
978
|
endpoint_record: dict[str, typing.Any],
|
|
1243
979
|
feature_names: list[str],
|